In [3]:
# This file generates the data for Figure #4 from the paper https://arxiv.org/pdf/1806.06975.pdf

In [4]:
import os 
os.chdir('..')

In [5]:
import pickle
import argparse
import networkx as nx
import pandas as pd
import numpy as np
import sklearn
import torch
import matplotlib, matplotlib.pyplot as plt
from torch.autograd import Variable

from models.ml_methods import MLMethods
import data
import data.gene_datasets
from data.graph import Graph, get_hash


  from ._conv import register_converters as _register_converters


In [6]:
class Object(object):
    pass

opt = Object()
opt.seed = 0
opt.nb_class = None
opt.nb_examples = None
opt.nb_nodes = None
opt.graph = "regnet"
opt.add_self = True
opt.norm_adj = True
opt.add_connectivity = False
opt.pool_graph = "ignore"
opt.samples_path = ""
opt.graph_path = ""
opt.exp_name = "regnet"
opt.trials = 3
opt.cuda = False



In [7]:
if opt.samples_path:
    data_dir = '/'.join(opt.samples_path.split('/')[:-1])
    data_file = opt.samples_path.split('/')[-1]
    dataset = data.gene_datasets.TCGATissue(data_dir=data_dir, data_file=data_file)
else:
    dataset = data.gene_datasets.TCGATissue()


Converting one-hot labels to integers


In [9]:
methods = [MLMethods(model_name="MLP", column_names=dataset.df.columns, dropout=False, cuda=opt.cuda)]
for gene in genes_to_iter:
    mean = dataset.df[gene].mean()
    dataset.labels = [1 if x > mean else 0 for x in dataset.df[gene]]
    print mean
    print sum(dataset.labels)
    try:
        neighborhood = None
        neighbors = set([gene])
        neighbors = neighbors.union(set(nx_graph.neighbors(gene)))

        X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(dataset.df, dataset.labels, stratify=dataset.labels, train_size=train_size, test_size=test_size, random_state=opt.seed)
        X_train = X_train[list(neighbors)].copy()
        X_train[gene] = 1
        X_test = X_test[list(neighbors)].copy()
        X_test[gene] = 1
        neighborhood = np.asarray(nx.to_numpy_matrix(nx.Graph(nx_graph.subgraph(neighbors))))

    except Exception as e:
        pass
        #print(e)

    for method in methods: 
        for seed in range(opt.trials):
            already_done = results["df"][(results["df"].gene_name == gene) &
                                     (results["df"].model == method.model_name) &
                                     (results["df"].seed == seed) &
                                     (results["df"].train_size == train_size)].shape[0] > 0

            if already_done:
                continue
            print "doing:", gene, method.model_name, seed

            if type(neighborhood) != list:
                experiment = {
                "gene_name": gene,
                "model": method.model_name,
                "seed": seed,
                "train_size": train_size,
                "auc": ""
                }
            results["df"] = results["df"].append(experiment, ignore_index=True)
            
            method.fit(X_train, y_train)
            # If there is no model attached to this method, that means it could not fit for some reason.
            if method.model:
                x_test_var = Variable(torch.FloatTensor(np.expand_dims(X_test.values, axis=2)), requires_grad=False).float()
                result = method.predict(x_test_var)
            else:
                result = ""
            experiment = {
                "gene_name": gene,
                "model": method.model_name,
                "seed": seed,
                "train_size": train_size,
                "auc": result
                }

            results["df"] = results["df"].append(experiment, ignore_index=True)
            results_dir = "/".join(results_file.split('/')[0:-1])

            if not os.path.isdir(results_dir):
                os.makedirs(results_dir)
            pickle.dump(results, open(results_file, "wb"))


1.4589147e-08
4627
doing: A1BG MLP 0
doing: A1BG MLP 1
doing: A1BG MLP 2
-7.6884805e-07
2545
doing: A1CF MLP 0
doing: A1CF MLP 1
doing: A1CF MLP 2
-7.1486824e-08
5293
doing: A2M MLP 0
doing: A2M MLP 1
doing: A2M MLP 2
5.9815505e-08
3955
doing: A2ML1 MLP 0
doing: A2ML1 MLP 1
doing: A2ML1 MLP 2
-4.9238373e-09
5441
doing: A4GALT MLP 0
doing: A4GALT MLP 1
doing: A4GALT MLP 2
3.4649226e-08
3483
doing: A4GNT MLP 0
doing: A4GNT MLP 1
doing: A4GNT MLP 2
-1.3814099e-08
4932
doing: AAAS MLP 0
doing: AAAS MLP 1
doing: AAAS MLP 2
2.2613179e-08
5397
doing: AACS MLP 0
doing: AACS MLP 1
doing: AACS MLP 2
0.0
0
doing: AAED1 MLP 0
Only one class represented.


RuntimeError: size mismatch, m1: [1000 x 141], m2: [98 x 16] at /Users/soumith/code/builder/wheel/pytorch-src/aten/src/TH/generic/THTensorMath.c:2033

In [24]:
dfs = {}
for ex in ['genemania', 'regnet']:
    dfs[ex] = pd.DataFrame(columns=['auc', 'gene_name', 'model', 'num_genes', 'seed', 'train_size'])
    for root, dirs, files in os.walk('experiments/results/' + ex):
        for f in files:
            if not f.startswith("slurm"):
                block = f.split('-')[-1].split('.')[0]
                graph = root.split('/')[-2]
                try:
                    dfs[ex] = dfs[ex].append(pd.DataFrame(pickle.load(open(os.path.join(root, f), 'rb'))['df']))
                except Exception:
                    import pdb; pdb.set_trace()
                    pass


> <ipython-input-24-13ce32625b74>(13)<module>()->None
-> pass
(Pdb) q


BdbQuit: 

In [None]:
agg_df = pd.DataFrame(columns=dfs[dfs.keys()[0]].columns)
for key, df in dfs.items():
    df['exp_id'] = key
    agg_df = pd.concat([agg_df, dfs[key]])


In [8]:
def create_line(df1):
    small_neighborhood_mlp_df = df1[df1['num_genes'] == 50].groupby(['gene_name', 'model','train_size'])['auc'].agg(['mean', 'std'])
    big_neighborhood_mlp_df = df1[df1['num_genes'] == 16300].groupby(['gene_name', 'model','train_size'])['auc'].agg(['mean', 'std'])
    mlp_low_var_high_mean_df = small_neighborhood_mlp_df.sub(big_neighborhood_mlp_df).sort_values('mean', ascending=False)
    return mlp_low_var_high_mean_df

std = 1.0
slice_size = 100000
lines = []
for exp_id in agg_df.keys():
    df = agg_df[agg_df['exp_id']==exp_id].dropna()
    lines[(create_line(df)['mean'])
    
fig, ax = plt.subplots()
n1, bins1, patches1 = ax.hist(l1, range=(-.4, .25), bins=100, label="Regnet", density=0, alpha=0.55, histtype='step')
n1, bins1, patches1 = ax.hist(l2, range=(-.4, .25), bins=100, label="GeneMania", density=0, alpha=0.55, histtype='step')

plt.title("First Degree Neighbors vs Full Gene Set")
plt.ylabel("Count")
plt.xlabel("% AUC Improvement")

plt.legend()
plt.show()

SyntaxError: invalid syntax (<ipython-input-8-d2a0ebfac5bc>, line 14)