In [1]:
# This file generates the data for Figure #4 from the paper https://arxiv.org/pdf/1806.06975.pdf

In [2]:
import os 
os.chdir('..')

In [3]:
import pickle
import argparse
import networkx as nx
import pandas as pd
import numpy as np
import sklearn
import torch
import matplotlib, matplotlib.pyplot as plt
from torch.autograd import Variable

from models.ml_methods import MLMethods
import data
import data.gene_datasets
from data.graph import Graph, get_hash


ImportError: No module named optimization

In [2]:
class Object(object):
    pass

opt = Object()
opt.seed = 0
opt.nb_class = None
opt.nb_examples = None
opt.nb_nodes = None
opt.graph = "regnet"
opt.add_self = True
opt.norm_adj = True
opt.add_connectivity = False
opt.pool_graph = "ignore"
opt.samples_path = ""
opt.graph_path = ""
opt.exp_name = "regnet"
opt.trials = 3
opt.cuda = False



In [3]:
if opt.samples_path:
    data_dir = '/'.join(opt.samples_path.split('/')[:-1])
    data_file = opt.samples_path.split('/')[-1]
    dataset = data.gene_datasets.TCGATissue(data_dir=data_dir, data_file=data_file)
else:
    dataset = data.gene_datasets.TCGATissue()


Converting one-hot labels to integers


In [27]:
graph = Graph()
if opt.graph_path:
    graph.load_graph(opt.graph_path)
else:
    graph.load_graph(data.graph.get_hash(opt.graph))
nx_graph = nx.from_numpy_matrix(graph.adj)
mapping = dict(zip(range(0, len(dataset.df.columns)), dataset.df.columns))
nx_graph = nx.relabel_nodes(nx_graph, mapping)

results_file = "experiments/results/" + opt.exp_name + '/results-' + str(opt.exp_name) + '.pkl'
try:
    print "Loading Checkpointed Results"
    results = pickle.load(open(results_file, "r"))
except Exception:
    print "Creating New Results Dictionary"
    results = {"df": pd.DataFrame(columns=['auc', 'gene_name', 'model', 'num_genes', 'seed', 'train_size'])}

dataset.df = dataset.df - dataset.df.mean()

dataset_copy = dataset.df.copy(deep=True)
genes_to_iter = dataset.df.columns.difference(results['df']['gene_name'].unique())
print ("only " + str(len(genes_to_iter)) + " more genes to do...")
num_all_genes = len(dataset.df.columns)
train_size = 50
test_size = 1000


Loading Checkpointed Results
Creating New Results Dictionary
only 16300 more genes to do...


In [26]:
methods = [MLMethods(model_name="MLP", column_names=dataset.df.columns, dropout=False, cuda=opt.cuda)]
for gene in genes_to_iter:
    mean = dataset.df[gene].mean()
    dataset.labels = [1 if x > mean else 0 for x in dataset.df[gene]]
    print mean
    print sum(dataset.labels)
    try:
        neighborhood = None
        neighbors = set([gene])
        neighbors = neighbors.union(set(nx_graph.neighbors(gene)))

        X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(dataset.df, dataset.labels, stratify=dataset.labels, train_size=train_size, test_size=test_size, random_state=opt.seed)
        X_train = X_train[list(neighbors)].copy()
        X_train[gene] = 1
        X_test = X_test[list(neighbors)].copy()
        X_test[gene] = 1
        neighborhood = np.asarray(nx.to_numpy_matrix(nx.Graph(nx_graph.subgraph(neighbors))))

    except Exception as e:
        pass
        #print(e)

    for method in methods: 
        for seed in range(opt.trials):
            already_done = results["df"][(results["df"].gene_name == gene) &
                                     (results["df"].model == method.model_name) &
                                     (results["df"].seed == seed) &
                                     (results["df"].train_size == train_size)].shape[0] > 0

            if already_done:
                continue
            print "doing:", gene, method.model_name, seed

            if type(neighborhood) != list:
                experiment = {
                "gene_name": gene,
                "model": method.model_name,
                "seed": seed,
                "train_size": train_size,
                "auc": ""
                }
            results["df"] = results["df"].append(experiment, ignore_index=True)
            
            method.fit(X_train, y_train)
            # If there is no model attached to this method, that means it could not fit for some reason.
            if method.model:
                x_test_var = Variable(torch.FloatTensor(np.expand_dims(X_test.values, axis=2)), requires_grad=False).float()
                result = method.predict(x_test_var)
            else:
                result = ""
            experiment = {
                "gene_name": gene,
                "model": method.model_name,
                "seed": seed,
                "train_size": train_size,
                "auc": result
                }

            results["df"] = results["df"].append(experiment, ignore_index=True)
            results_dir = "/".join(results_file.split('/')[0:-1])

            if not os.path.isdir(results_dir):
                os.makedirs(results_dir)
            pickle.dump(results, open(results_file, "wb"))


5.9815505e-08
3955
3.4649226e-08
3483
2.2613179e-08
5397
0.0
0
-7.294574e-08
4437
0.0
0
1.4589147e-08
5060
-4.285562e-09
4811
-8.2063956e-10
5236
-5.4709304e-10
5358
-6.5651165e-09
5263
2.5531008e-08
4526
1.5318605e-08
5446
5.72624e-08
5277
2.9178295e-09
6086
-1.933062e-08
4547
1.8236435e-08
5825
3.6472867e-09
4923
-2.3707365e-08
5416
3.319031e-08
5058
2.4072094e-08
5380
1.0941861e-08
3253
1.0212403e-08
4483
3.7931784e-07
3554
-3.2460854e-08
5411
-1.1306589e-08
5274
-1.6139245e-08
5135
-1.2218411e-08
5270
3.7931784e-08
5471
-2.0424807e-08
5277
-2.9433605e-07
2701
-1.5318605e-08
3638
-5.1791474e-08
5503
-5.0879652e-08
5090
1.550097e-08
4906
-1.8236435e-08
5116
-2.1883722e-08
4073
2.5531008e-08
5506
2.5531008e-08
4695
1.677752e-08
4960
-2.4072094e-08
5151
1.5683334e-08
5070
1.9604167e-09
4934
5.1517928e-09
4845
-1.6595155e-08
4733
1.677752e-08
5265
2.0060078e-08
5058
5.8356587e-08
3935
0.0
0
0.0
0
0.0
0
0.0
0
0.0
0
-6.929845e-09
4915
5.6532947e-09
5039
-2.1701357e-08
5113
-1.969535e-08
5

doing: ADAM19 MLP 2
-2.3196743e-07
1425
doing: ADAM2 MLP 0
doing: ADAM2 MLP 1
doing: ADAM2 MLP 2
-5.1426746e-08
4986
doing: ADAM20 MLP 0
doing: ADAM20 MLP 1
doing: ADAM20 MLP 2
1.4589147e-08
5250
doing: ADAM22 MLP 0
doing: ADAM22 MLP 1
doing: ADAM22 MLP 2
2.7354652e-09
4818
doing: ADAM23 MLP 0
doing: ADAM23 MLP 1
doing: ADAM23 MLP 2
-6.5651165e-09
5552
doing: ADAM28 MLP 0
doing: ADAM28 MLP 1
doing: ADAM28 MLP 2
2.7062868e-07
3297
doing: ADAM29 MLP 0
doing: ADAM29 MLP 1
doing: ADAM29 MLP 2
1.7506977e-08
5255
doing: ADAM33 MLP 0
doing: ADAM33 MLP 1
doing: ADAM33 MLP 2
6.5651165e-09
5153
doing: ADAM8 MLP 0
doing: ADAM8 MLP 1
doing: ADAM8 MLP 2
-4.1396707e-08
5767
doing: ADAM9 MLP 0
doing: ADAM9 MLP 1
doing: ADAM9 MLP 2
-1.4589148e-09
5274
doing: ADAMTS1 MLP 0
doing: ADAMTS1 MLP 1
doing: ADAMTS1 MLP 2
1.8965892e-08
5399
doing: ADAMTS10 MLP 0
doing: ADAMTS10 MLP 1
doing: ADAMTS10 MLP 2
-7.2945734e-09
5317
doing: ADAMTS12 MLP 0
doing: ADAMTS12 MLP 1
doing: ADAMTS12 MLP 2
-8.024031e-09
5400
d

KeyboardInterrupt: 

In [24]:
dfs = {}
for ex in ['genemania', 'regnet']:
    dfs[ex] = pd.DataFrame(columns=['auc', 'gene_name', 'model', 'num_genes', 'seed', 'train_size'])
    for root, dirs, files in os.walk('experiments/results/' + ex):
        for f in files:
            if not f.startswith("slurm"):
                block = f.split('-')[-1].split('.')[0]
                graph = root.split('/')[-2]
                try:
                    dfs[ex] = dfs[ex].append(pd.DataFrame(pickle.load(open(os.path.join(root, f), 'rb'))['df']))
                except Exception:
                    import pdb; pdb.set_trace()
                    pass


> <ipython-input-24-13ce32625b74>(13)<module>()->None
-> pass
(Pdb) q


BdbQuit: 

In [None]:
agg_df = pd.DataFrame(columns=dfs[dfs.keys()[0]].columns)
for key, df in dfs.items():
    df['exp_id'] = key
    agg_df = pd.concat([agg_df, dfs[key]])


In [8]:
def create_line(df1):
    small_neighborhood_mlp_df = df1[df1['num_genes'] == 50].groupby(['gene_name', 'model','train_size'])['auc'].agg(['mean', 'std'])
    big_neighborhood_mlp_df = df1[df1['num_genes'] == 16300].groupby(['gene_name', 'model','train_size'])['auc'].agg(['mean', 'std'])
    mlp_low_var_high_mean_df = small_neighborhood_mlp_df.sub(big_neighborhood_mlp_df).sort_values('mean', ascending=False)
    return mlp_low_var_high_mean_df

std = 1.0
slice_size = 100000
lines = []
for exp_id in agg_df.keys():
    df = agg_df[agg_df['exp_id']==exp_id].dropna()
    lines[(create_line(df)['mean'])
    
fig, ax = plt.subplots()
n1, bins1, patches1 = ax.hist(l1, range=(-.4, .25), bins=100, label="Regnet", density=0, alpha=0.55, histtype='step')
n1, bins1, patches1 = ax.hist(l2, range=(-.4, .25), bins=100, label="GeneMania", density=0, alpha=0.55, histtype='step')

plt.title("First Degree Neighbors vs Full Gene Set")
plt.ylabel("Count")
plt.xlabel("% AUC Improvement")

plt.legend()
plt.show()

SyntaxError: invalid syntax (<ipython-input-8-d2a0ebfac5bc>, line 14)