In [5]:
import os, time, pickle
import pandas as pd
import numpy as np

from arboreto.algo import grnboost2, genie3
from libs.network_analysis import evaluation

For evaluation we keep only the top-k predicted edges, where k is the number of edges in the ground-truth network (excluding self-loops).

Run two datasets at a time: after each pair finishes, restart the kernel, then run the next pair, and so on.

In [3]:
# Synthetic datasets handled in the current kernel session.
# Remaining batches, to be swapped in on later runs: 'dyn-CY', 'dyn-LI', 'dyn-LL', 'dyn-TF'
syn_datasets = {
    'dyn-BF',
    'dyn-BFC',
}

In [4]:
# Benchmark GENIE3 and GRNBoost2 on every synthetic dataset listed in `syn_datasets`.
# Per dataset: load each sub-folder (expression matrix, pseudotime, reference network),
# infer a network per sub-folder with each algorithm, pickle the inferred links and
# write one CSV of evaluation metrics plus run times per algorithm.
for dataset in syn_datasets:

    base_path = './beeline_data/synthetic/' + dataset + '/'

    data = {}  # sub-folder name -> expression matrix ordered by pseudotime
    refs = {}  # sub-folder name -> reference (ground truth) network
    for folder in os.listdir(base_path):
        temp_path = os.path.join(base_path, folder)
        if not os.path.isdir(temp_path):
            continue  # only sub-directories hold a dataset instance

        gem = pd.read_csv(temp_path + '/ExpressionData.csv', index_col=0)
        pseudotime = pd.read_csv(temp_path + '/PseudoTime.csv', index_col=0)
        # Pseudotime is spread over several columns; missing entries are treated
        # as 0 and the columns are summed into a single value per row.
        pseudotime_all = pseudotime.fillna(0).sum(axis=1)

        # After transposing, the columns are the genes (they are used as the
        # TF / gene names during evaluation below).
        gem = gem.T
        # NOTE(review): the assignment is positional -- it assumes the rows of
        # PseudoTime.csv are in the same order as the columns of
        # ExpressionData.csv. Confirm against the data files.
        gem['pseudotime'] = pseudotime_all.values

        data[folder] = gem.sort_values(by='pseudotime').drop('pseudotime', axis=1)
        refs[folder] = pd.read_csv(temp_path + '/refNetwork.csv')

    #run the models
    for fn in [genie3, grnboost2]:

        results_path = './Results_arboreto/' + fn.__name__ + '/'
        # open() does not create missing directories, so make sure the output
        # tree exists before anything is written.
        os.makedirs(results_path + 'links/', exist_ok=True)

        links = {}
        times = []
        for folder in data:
            # perf_counter is monotonic, so the measured duration cannot be
            # distorted by system clock adjustments.
            start = time.perf_counter()
            links[folder] = fn(expression_data=data[folder])
            times.append(time.perf_counter() - start)

        #save links
        with open(results_path + 'links/' + dataset + '_links.pickle', 'wb') as handle:
            pickle.dump(links, handle, protocol=pickle.HIGHEST_PROTOCOL)

        # Collect one metrics frame per sub-folder and concatenate once;
        # DataFrame.append no longer exists in pandas >= 2.0.
        # NOTE(review): assumes `evaluation` returns a one-row DataFrame -- confirm.
        metrics_rows = []
        for el in links:
            tfs = data[el].columns
            genes = tfs.copy()
            # k = number of reference edges that are not self-loops
            num_links = int((refs[el]['Gene1'] != refs[el]['Gene2']).sum())
            # Taking the first k rows as "top k" presumes the returned links are
            # already ordered by decreasing importance -- TODO confirm.
            metrics_rows.append(evaluation(links[el][:num_links], refs[el], tfs, genes))

        metrics_df = pd.concat(metrics_rows) if metrics_rows else pd.DataFrame()
        metrics_df.index = list(links)
        metrics_df['time'] = np.round(times, 2)
        metrics_df.to_csv(results_path + dataset + '.csv')

distributed.diskutils - INFO - Found stale lock file and directory '/home/lorena/Documents/Lorena_Thesis/Causal_CNN/dask-worker-space/worker-t2nroizr', purging
distributed.diskutils - INFO - Found stale lock file and directory '/home/lorena/Documents/Lorena_Thesis/Causal_CNN/dask-worker-space/worker-ehr8zkik', purging
distributed.diskutils - INFO - Found stale lock file and directory '/home/lorena/Documents/Lorena_Thesis/Causal_CNN/dask-worker-space/worker-n_c9y6w5', purging
distributed.diskutils - INFO - Found stale lock file and directory '/home/lorena/Documents/Lorena_Thesis/Causal_CNN/dask-worker-space/worker-fm1air49', purging
Process Dask Worker process (from Nanny):
Process Dask Worker process (from Nanny):
Traceback (most recent call last):
  File "/home/lorena/anaconda3/envs/arboreto_env/lib/python3.9/site-packages/distributed/nanny.py", line 897, in _run
    loop.run_sync(run)
  File "/home/lorena/anaconda3/envs/arboreto_env/lib/python3.9/site-packages/tornado/ioloop.py", lin

KeyboardInterrupt: 

rt
    self.asyncio_loop.run_forever()
  File "/home/lorena/anaconda3/envs/arboreto_env/lib/python3.9/asyncio/base_events.py", line 596, in run_forever
    self._run_once()
  File "/home/lorena/anaconda3/envs/arboreto_env/lib/python3.9/asyncio/base_events.py", line 1890, in _run_once
    handle._run()
  File "/home/lorena/anaconda3/envs/arboreto_env/lib/python3.9/asyncio/events.py", line 80, in _run
    self._context.run(self._callback, *self._args)
  File "/home/lorena/anaconda3/envs/arboreto_env/lib/python3.9/site-packages/distributed/nanny.py", line 820, in do_stop
    await worker.close(
  File "/home/lorena/anaconda3/envs/arboreto_env/lib/python3.9/site-packages/distributed/worker.py", line 1312, in close
    executor.shutdown(wait=executor_wait, timeout=timeout)
  File "/home/lorena/anaconda3/envs/arboreto_env/lib/python3.9/site-packages/distributed/threadpoolexecutor.py", line 105, in shutdown
    t.join(timeout=timeout2)
  File "/home/lorena/anaconda3/envs/arboreto_env/lib/pyth

In [None]:
# Number of top-ranked edges kept for evaluation, per curated dataset
# (presumably the ground-truth edge count without self-loops -- verify).
edges_datasets = dict(GSD=83, HSC=26, mCAD=13, VSC=15)

# Curated datasets to process: exactly the ones that have an edge budget above.
cur_datasets = set(edges_datasets)

In [None]:
# Benchmark GENIE3 and GRNBoost2 on every curated dataset listed in `cur_datasets`.
# Per dataset: load each sub-folder (expression matrix, pseudotime, reference network),
# infer a network per sub-folder with each algorithm, pickle the inferred links and
# write one CSV of evaluation metrics plus run times per algorithm.
for dataset in cur_datasets:

    base_path = './beeline_data/curated/' + dataset + '/'

    data = {}  # sub-folder name -> expression matrix ordered by pseudotime
    refs = {}  # sub-folder name -> reference (ground truth) network
    for folder in os.listdir(base_path):
        temp_path = os.path.join(base_path, folder)
        if not os.path.isdir(temp_path):
            continue  # only sub-directories hold a dataset instance

        gem = pd.read_csv(temp_path + '/ExpressionData.csv', index_col=0)
        pseudotime = pd.read_csv(temp_path + '/PseudoTime.csv', index_col=0)
        # Pseudotime is spread over several columns; missing entries are treated
        # as 0 and the columns are summed into a single value per row.
        pseudotime_all = pseudotime.fillna(0).sum(axis=1)

        # After transposing, the columns are the genes (they are used as the
        # TF / gene names during evaluation below).
        gem = gem.T
        # NOTE(review): the assignment is positional -- it assumes the rows of
        # PseudoTime.csv are in the same order as the columns of
        # ExpressionData.csv. Confirm against the data files.
        gem['pseudotime'] = pseudotime_all.values

        data[folder] = gem.sort_values(by='pseudotime').drop('pseudotime', axis=1)
        refs[folder] = pd.read_csv(temp_path + '/refNetwork.csv')

    #run the models
    for fn in [genie3, grnboost2]:

        results_path = './Results_arboreto/' + fn.__name__ + '/'
        # open() does not create missing directories, so make sure the output
        # tree exists before anything is written.
        os.makedirs(results_path + 'links/', exist_ok=True)

        links = {}
        times = []
        for folder in data:
            # perf_counter is monotonic, so the measured duration cannot be
            # distorted by system clock adjustments.
            start = time.perf_counter()
            links[folder] = fn(expression_data=data[folder])
            times.append(time.perf_counter() - start)

        #save links
        with open(results_path + 'links/' + dataset + '_links.pickle', 'wb') as handle:
            pickle.dump(links, handle, protocol=pickle.HIGHEST_PROTOCOL)

        # Collect one metrics frame per sub-folder and concatenate once;
        # DataFrame.append no longer exists in pandas >= 2.0.
        # NOTE(review): assumes `evaluation` returns a one-row DataFrame -- confirm.
        metrics_rows = []
        # The edge budget is fixed per curated dataset, so look it up once.
        num_links = edges_datasets[dataset]
        for el in links:
            tfs = data[el].columns
            genes = tfs.copy()
            # Taking the first k rows as "top k" presumes the returned links are
            # already ordered by decreasing importance -- TODO confirm.
            metrics_rows.append(evaluation(links[el][:num_links], refs[el], tfs, genes))

        metrics_df = pd.concat(metrics_rows) if metrics_rows else pd.DataFrame()
        metrics_df.index = list(links)
        metrics_df['time'] = np.round(times, 2)
        metrics_df.to_csv(results_path + dataset + '.csv')