In [13]:
import os
import time
import pickle
import numpy as np
import matplotlib, matplotlib.pyplot as plt
import networkx as nx
import pandas as pd
import torch
import itertools
from torch.autograd import Variable
import sklearn, sklearn.model_selection, sklearn.metrics
import numpy as np

from models.model_wrapper import MLP, GCN, SLR
from data import datasets
from data.gene_graphs import GeneManiaGraph
from data.utils import record_result


In [2]:
# Load the TCGA dataset and centre every column of its data frame on zero
# by subtracting that column's mean.
dataset = datasets.TCGADataset()
dataset.df = dataset.df.sub(dataset.df.mean(axis=0), axis="columns")

Torrent name: TCGA_tissue_ppi.hdf5, Size: 1748.32MB
Converting one-hot labels to integers


In [14]:
# Set up the results table: resume from the checkpoint file when it can be read,
# otherwise start from an empty table with the expected columns.
filename = "experiments/results/fig-5.pkl"
try:
    # NOTE: pickle.load can execute arbitrary code; only load checkpoint files you trust.
    # The context manager closes the file handle even when unpickling fails.
    with open(filename, "rb") as checkpoint_file:
        results = pickle.load(checkpoint_file, encoding='latin1')
    print("Loaded Checkpointed Results")
except Exception as e:
    # Best-effort fallback (missing or unreadable checkpoint). Report the cause so that a
    # damaged checkpoint is not silently mistaken for a first run.
    print("Could not load " + filename + ": " + repr(e))
    results = pd.DataFrame(columns=['auc', 'gene', 'model', 'num_genes', 'seed', 'train_size', 'time_elapsed'])
    print("Created a New Results Dictionary")


Loaded Checkpointed Results


In [15]:
# Show the results table currently held in memory.
results

Unnamed: 0,auc,gene,model,num_genes,seed,train_size,time_elapsed
0,0.655529,RPL4,GCN_lay20_chan32_emb32_dropout_pool_hierarchy,50,0,50,6.756546497344971
1,0.535226,RPL4,GCN_lay20_chan32_emb32_dropout_pool_hierarchy,50,1,50,3.6235568523406982
2,0.630130,RPL4,GCN_lay20_chan32_emb32_dropout_pool_hierarchy,50,2,50,3.5820469856262207
3,0.627123,RPL4,GCN_lay20_chan32_emb32_dropout_pool_hierarchy,50,3,50,4.080587387084961
4,0.491595,RPL4,GCN_lay20_chan32_emb32_dropout_pool_hierarchy,50,4,50,3.828230619430542
5,0.878007,RPL4,GCN_lay20_chan32_emb32_dropout_pool_hierarchy,100,0,50,3.989731550216675
6,0.822578,RPL4,GCN_lay20_chan32_emb32_dropout_pool_hierarchy,100,1,50,4.060584545135498
7,0.841581,RPL4,GCN_lay20_chan32_emb32_dropout_pool_hierarchy,100,2,50,3.936028480529785
8,0.873257,RPL4,GCN_lay20_chan32_emb32_dropout_pool_hierarchy,100,3,50,4.254684686660767
9,0.835952,RPL4,GCN_lay20_chan32_emb32_dropout_pool_hierarchy,100,4,50,3.747222661972046


In [12]:
# Instantiate the GeneMania gene graph; the experiment cells use it to pick the genes
# (graph nodes) fed to each model.
gene_graph = GeneManiaGraph()

Torrent name: genemania.pkl, Size: 9.61MB


In [5]:
# Experiment grid: every gene x model x graph size x train size combination is run
# once per seed in range(trials).
search_num_genes = [50, 100, 200, 300, 500, 1000, 2000, 4000, 8000, 16000]
search_train_size = [50]
test_size = 1000
trials = 5
cuda = True

# Genes whose (binarised) value is predicted from the other genes.
search_genes = [
    "RPL4", "RPL5", "RPS10", "RPS3", "CEBPD", "IL5", "PABPC3", "PSMB10",
    "S100A8", "S100A9", "TOP1", "C15orf40", "RNF138", "DLGAP2", "EVI2B",
    "ZFP82", "MYBL2", "PSMB1", "CISD1", "HLA-B", "SAA2", "IFIT1", "RPS3A",
    "TP53", "TNF", "EGFR",
]

# Models under comparison. The `name` strings are the keys stored in the results table,
# so they must stay stable between runs.
model_list = [
    GCN(
        name="GCN_lay20_chan32_emb32_dropout_pool_hierarchy",
        cuda=cuda,
        num_layer=4,
        channels=32,
        embedding=32,
        prepool_extralayers=5,
        pooling="hierarchy",
    ),
    #GCN(name="GCN_lay20_chan32_emb32_dropout_pool_random", cuda=cuda, num_layer=4, channels=32, embedding=32, prepool_extralayers=5, pooling="random"),
    GCN(
        name="GCN_lay3_chan64_emb32_dropout",
        cuda=cuda,
        num_layer=3,
        channels=64,
        embedding=32,
    ),
    MLP(
        name="MLP_lay2_chan512_dropout",
        cuda=cuda,
        dropout=True,
        num_layer=2,
        channels=512,
    ),
    MLP(
        name="MLP_lay2_chan512",
        cuda=cuda,
        dropout=False,
        num_layer=2,
        channels=512,
    ),
    SLR(name="SLR_lambda1_l11", cuda=cuda),
]

In [6]:
# Create the set of all experiment ids and see which are left to do
model_names = [model.name for model in model_list]
columns = ["gene", "model", "num_genes", "train_size", "seed"]
all_exp_ids = [x for x in itertools.product(search_genes, model_names, search_num_genes, search_train_size, range(trials))]
all_exp_ids = pd.DataFrame(all_exp_ids, columns=columns)
all_exp_ids.index = ["-".join(map(str, tup[1:])) for tup in all_exp_ids.itertuples(name=None)]
results_exp_ids = results[columns].copy()
results_exp_ids.index = ["-".join(map(str, tup[1:])) for tup in results_exp_ids.itertuples(name=None)]
intersection_ids = all_exp_ids.index.intersection(results_exp_ids.index)
todo = all_exp_ids.drop(intersection_ids).to_dict(orient="records")
print("todo: " + str(len(todo)))
print("done: " + str(len(results)))

todo: 6455
done: 49


In [7]:
def get_every_n(a, n=2):
    """Yield consecutive chunks of `a` along its first axis.

    Args:
        a: A sliceable object exposing `shape[0]` (e.g. a numpy array or torch tensor).
        n: Maximum number of leading-axis entries per chunk (default 2).

    Yields:
        `a[start:start + n]` for start = 0, n, 2n, ...; the final chunk is shorter when
        `a.shape[0]` is not a multiple of `n`, so no entry is ever dropped.

    Raises:
        ValueError: If `n` is smaller than 1 (raised when iteration starts).
    """
    if n < 1:
        raise ValueError("n must be a positive integer")
    for start in range(0, a.shape[0], n):
        yield a[start:start + n]


In [None]:
# Run every outstanding experiment and checkpoint each result as soon as it is computed.
for row in todo:
    print(row)
    start_time = time.time()
    gene = row["gene"]
    model_name = row["model"]
    seed = row["seed"]
    # Any requested size of 10000 or more is run, and recorded, as 16300, which selects
    # the whole gene graph further down instead of a BFS sample.
    num_genes = row["num_genes"] if row["num_genes"] < 10000 else 16300
    train_size = row["train_size"]

    # First configured model with a matching name (IndexError if none is configured).
    model = [x for x in model_list if x.name == model_name][0]

    experiment = {
        "gene": gene,
        "model": model.name,
        "num_genes": num_genes,
        "train_size": train_size,
        "seed": seed,
    }

    # Binary target: 1 where the gene's value in dataset.df is greater than zero, else 0.
    dataset.labels = dataset.df[gene].where(dataset.df[gene] > 0).notnull().astype("int")
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
        dataset.df,
        dataset.labels,
        stratify=dataset.labels,
        train_size=train_size,
        test_size=test_size,
        random_state=seed,
    )

    if num_genes == 16300:
        neighbors = gene_graph.nx_graph
    else:
        neighbors = gene_graph.bfs_sample_neighbors(gene, num_genes)

    # Keep only the selected genes, in graph-node order so the columns line up with `adj`.
    nodes = list(neighbors.nodes)
    X_train = X_train[nodes].copy()
    X_test = X_test[nodes].copy()
    # Overwrite the target gene's own column with a constant so the inputs do not
    # contain the value the labels were derived from.
    X_train[gene] = 1
    X_test[gene] = 1
    # to_numpy_array returns a plain ndarray; the older to_numpy_matrix was removed in
    # NetworkX 3.0. The explicit nodelist fixes the row/column order to match the features.
    adj = nx.to_numpy_array(neighbors, nodelist=nodes)
    model.fit(X_train, y_train, adj=adj)

    # Add a trailing axis of size 1 to the test features before handing them to the model.
    x_test = Variable(torch.FloatTensor(np.expand_dims(X_test.values, axis=2)), requires_grad=False).float()
    if cuda:
        x_test = x_test.cuda()

    # Predict chunk by chunk instead of pushing the whole test set through at once;
    # column 1 of each prediction is used as the score (presumably the class-1 output).
    y_hat = []
    for chunk in get_every_n(x_test, 10):
        y_hat.extend(model.predict(chunk)[:,1].data.cpu().numpy().tolist())
    auc = sklearn.metrics.roc_auc_score(y_test, np.asarray(y_hat).flatten())

    # Only unbinds the loop-local name; the model object itself stays in model_list.
    del model
    experiment["auc"] = auc
    # NOTE: the elapsed time is stored as a string, not a float.
    experiment["time_elapsed"] = str(time.time() - start_time)
    results = record_result(results, experiment, filename)
    print("time elapsed for genes: " +str(num_genes) + " : " + str(time.time() - start_time))


{'seed': 0, 'train_size': 50, 'num_genes': 16000, 'model': 'GCN_lay20_chan32_emb32_dropout_pool_hierarchy', 'gene': 'RPL4'}
{'train': 0.32832080200501257, 'valid': 0.28}
{'train': 0.7142857142857143, 'valid': 0.92}
{'train': 0.7192982456140351, 'valid': 0.92}
{'train': 0.46616541353383456, 'valid': 0.28}
{'train': 0.7142857142857142, 'valid': 0.9600000000000001}
{'train': 0.7167919799498748, 'valid': 0.56}
{'train': 0.6992481203007519, 'valid': 0.56}
{'train': 0.8822055137844612, 'valid': 1.0}
{'train': 0.8045112781954887, 'valid': 0.72}
{'train': 0.8696741854636592, 'valid': 0.76}
{'train': 0.8771929824561404, 'valid': 0.9600000000000001}
{'train': 0.8596491228070176, 'valid': 0.76}
{'train': 0.8746867167919801, 'valid': 0.88}
{'train': 0.8746867167919801, 'valid': 0.88}
{'train': 0.8771929824561404, 'valid': 0.76}
{'train': 0.8721804511278196, 'valid': 0.88}
{'train': 0.8746867167919801, 'valid': 0.88}
{'train': 0.8771929824561404, 'valid': 0.88}
time elapsed for genes: 16300 : 135.3

In [7]:
# Scratch cell: rebuild the inputs of the first outstanding experiment by hand, with the
# graph size pinned to 2000 genes. Nothing is fitted, evaluated or recorded here.
row = todo[0]
print(row)
start_time = time.time()
gene = row["gene"]
model_name = row["model"]
seed = row["seed"]
num_genes = 2000  # fixed value; row["num_genes"] is not used in this cell
train_size = row["train_size"]

# First configured model with a matching name (IndexError if none is configured).
model = [x for x in model_list if x.name == model_name][0]

experiment = {
    "gene": gene,
    "model": model.name,
    "num_genes": num_genes,
    "train_size": train_size,
    "seed": seed,
}

# Binary target: 1 where the gene's value in dataset.df is greater than zero, else 0.
dataset.labels = dataset.df[gene].where(dataset.df[gene] > 0).notnull().astype("int")
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    dataset.df,
    dataset.labels,
    stratify=dataset.labels,
    train_size=train_size,
    test_size=test_size,
    random_state=seed,
)

neighbors = gene_graph.bfs_sample_neighbors(gene, num_genes)

# Keep only the sampled genes, in graph-node order so the columns line up with `adj`.
nodes = list(neighbors.nodes)
X_train = X_train[nodes].copy()
X_test = X_test[nodes].copy()
# Overwrite the target gene's own column with a constant so the inputs do not contain
# the value the labels were derived from.
X_train[gene] = 1
X_test[gene] = 1
# to_numpy_array returns a plain ndarray; the older to_numpy_matrix was removed in
# NetworkX 3.0. The explicit nodelist fixes the row/column order to match the features.
adj = nx.to_numpy_array(neighbors, nodelist=nodes)


{'model': 'GCN_lay1_chan64_emb32_dropout_agg', 'seed': 0, 'num_genes': 4000, 'gene': 'IL5', 'train_size': 50}


In [None]:

for plot_gene in search_genes:

    %matplotlib inline
    plt.rcParams['figure.figsize'] = (7.5, 3.6)
    plot_train_size = 50

    subset = results[(results.train_size==plot_train_size) & 
                      (results.gene==plot_gene) & 
                      (results.num_genes!=400) &      
                      (results.num_genes> 0)]


    q = subset.groupby(['model','num_genes'])['auc']

    todo = list(subset["model"].unique())
    linestyles = ['-', '-', '--', '-.', ':']
    for ls, model in enumerate(sorted(todo)):
        index = list(q.mean()[model].index)
        mean = q.mean()[model]
        stderr = q.std()[model]/np.sqrt(q.count()[model])
        displayname = model.replace("CGN","GCN")
        displayname = displayname.replace("SLR", "SNLR")
        plt.errorbar(index, mean,label=displayname, xerr=0, yerr=stderr, ls=linestyles[ls])

    plt.title("Gene Inference " + plot_gene + " (train_size:" + str(plot_train_size) +")")
    plt.ylabel("AUC")
    plt.xlabel("Number of genes")
    plt.xscale("log")
    plt.xticks(sorted(subset["num_genes"].unique()))
    formatter = matplotlib.ticker.ScalarFormatter()
    plt.gca().xaxis.set_major_formatter(formatter)

    plt.legend();
    fd = len(list(gene_graph.nx_graph.neighbors(plot_gene)))
    print fd
    if fd > 50:
        plt.axvline(fd, ymin=0.4, ymax=1.0, c="black")
        c = plt.ylim()
        plt.text(fd*1.05,c[1]-((c[1]-c[0])*0.2),'First Degree',rotation=90)


    plt.savefig("experiments/results/sgi-" + plot_gene + "-" + "train" + str(plot_train_size) + ".png", bbox_inches='tight')
    plt.show()