In [1]:
import os
import time
import pickle
import numpy as np
import matplotlib, matplotlib.pyplot as plt
import networkx as nx
import pandas as pd
import torch
import itertools
from torch.autograd import Variable
import sklearn, sklearn.model_selection, sklearn.metrics
import numpy as np

from models.model_wrapper import MLP, GCN, SLR
from data import datasets
from data.gene_graphs import GeneManiaGraph
from data.utils import record_result

# Make GPU index 1 the current CUDA device for every later .cuda() call.
# NOTE(review): the index is hard-coded, and tensor reprs saved further down show
# device='cuda:0', so those outputs presumably come from a run with a different
# device setting — confirm before re-running on another machine.
torch.cuda.set_device(1)

In [2]:
# Load the TCGA dataset, then subtract each column's mean so every column is centred on zero.
dataset = datasets.TCGADataset()
dataset.df = dataset.df.sub(dataset.df.mean(axis=0), axis="columns")

Torrent name: TCGA_tissue_ppi.hdf5, Size: 1748.32MB
Checking for pieces on disk: |██████████████████████████████████████████████████| 100.0% 
Found 1668 finished pieces out of 1668 total pieces.
Converting one-hot labels to integers


In [9]:
# Set up the results table, resuming from the on-disk checkpoint when one exists.
filename = "experiments/results/fig-5.pkl"
try:
    # NOTE(review): pickle.load can execute arbitrary code embedded in the file;
    # only load checkpoints that this project wrote itself.
    with open(filename, "rb") as checkpoint:
        results = pickle.load(checkpoint, encoding='latin1')
    print("Loaded Checkpointed Results")
except FileNotFoundError:
    # No checkpoint yet: start from an empty table. Any other failure (corrupt or
    # unreadable file) is deliberately NOT swallowed, because continuing with an empty
    # table would let the next record_result call overwrite the existing checkpoint.
    results = pd.DataFrame(columns=['auc', 'gene', 'model', 'num_genes', 'seed', 'train_size'])
    print("Created a New Results Dictionary")


Loaded Checkpointed Results


In [4]:
# Build the GeneMania gene graph; later cells use it for neighbour sampling
# (gene_graph.bfs_sample_neighbors) and for degree lookups (gene_graph.nx_graph).
gene_graph = GeneManiaGraph()

Torrent name: genemania.pkl, Size: 9.61MB


In [5]:
# Experiment grid and shared settings.
search_num_genes = [50, 100, 200, 300, 500, 1000, 2000, 4000, 8000, 16000]  # neighbourhood sizes to sweep
test_size = 1000          # held-out samples per split
search_train_size = [50]  # training-set sizes to sweep
cuda = True               # forwarded to every model and used to move test tensors to the GPU
trials = 5                # seeds per configuration (range(trials))

# Target genes; each one is turned into a binary label in the experiment loop.
search_genes = [
    "RPL4", "RPL5", "RPS10", "RPS3", "CEBPD", "IL5", "PABPC3",
    "PSMB10", "S100A8", "S100A9", "TOP1", "C15orf40", "RNF138",
    "DLGAP2", "EVI2B", "ZFP82", "MYBL2", "PSMB1", "CISD1",
    "HLA-B", "SAA2", "IFIT1", "RPS3A", "TP53", "TNF", "EGFR",
]

# Models under comparison. The `name` strings are stored in the results table's
# "model" column, so they must not be edited once results exist.
# NOTE(review): the first name says "lay20" while num_layer=4 and prepool_extralayers=5
# are passed — presumably 4 * 5; confirm against the GCN wrapper.
model_list = [
    GCN(
        name="GCN_lay20_chan32_emb32_dropout_pool",
        cuda=cuda,
        num_layer=4,
        channels=32,
        embedding=32,
        prepool_extralayers=5,
        pooling="hierarchy",
    ),
    GCN(
        name="GCN_lay3_chan64_emb32_dropout",
        cuda=cuda,
        num_layer=3,
        channels=64,
        embedding=32,
    ),
    MLP(
        name="MLP_lay2_chan512_dropout",
        cuda=cuda,
        dropout=True,
        num_layer=2,
        channels=512,
    ),
    MLP(
        name="MLP_lay2_chan512",
        cuda=cuda,
        dropout=False,
        num_layer=2,
        channels=512,
    ),
    SLR(
        name="SLR_lambda1_l11",
        cuda=cuda,
    ),
]

In [10]:
# Enumerate the full experiment grid and work out which runs are still missing from `results`.
columns = ["gene", "model", "num_genes", "train_size", "seed"]
model_names = [m.name for m in model_list]


def _experiment_keys(frame):
    """Return one "-"-joined string id per row of ``frame``, built from its ``columns`` values in order."""
    return ["-".join(str(value) for value in values)
            for values in zip(*(frame[col] for col in columns))]


# Every (gene, model, num_genes, train_size, seed) combination, keyed by its string id.
all_exp_ids = pd.DataFrame(
    list(itertools.product(search_genes, model_names, search_num_genes, search_train_size, range(trials))),
    columns=columns,
)
all_exp_ids.index = _experiment_keys(all_exp_ids)

# The same keying applied to the runs that are already recorded.
results_exp_ids = results[columns].copy()
results_exp_ids.index = _experiment_keys(results_exp_ids)

# Whatever is in the grid but not yet recorded still has to be run.
intersection_ids = all_exp_ids.index.intersection(results_exp_ids.index)
todo = all_exp_ids.drop(intersection_ids).to_dict(orient="records")
print("todo: " + str(len(todo)))
# NOTE(review): "done" is the total number of stored rows, including any whose
# configuration is no longer part of the current grid.
print("done: " + str(len(results)))

todo: 6457
done: 45


In [11]:
def get_every_n(a, n=2):
    """Yield consecutive slices of ``a`` along its first axis, ``n`` rows at a time.

    Args:
        a: Any sliceable object exposing ``shape[0]`` (NumPy array, torch tensor, ...).
        n: Number of rows per chunk; must be a positive integer. Defaults to 2.

    Yields:
        ``a[start:start + n]`` for ``start = 0, n, 2n, ...``. The final chunk is shorter
        than ``n`` when ``a.shape[0]`` is not a multiple of ``n``, so no row is dropped.

    Raises:
        ValueError: If ``n`` is smaller than 1 (raised when iteration starts).
    """
    if n < 1:
        raise ValueError("n must be a positive integer")
    # Step by the requested chunk size; previously `n` was ignored, the size was
    # hard-coded to 2, and a trailing odd row was silently skipped.
    for start in range(0, a.shape[0], n):
        yield a[start:start + n]


In [None]:
# Run every outstanding experiment: fit the selected model on a BFS-sampled gene
# neighbourhood, score it on a held-out split, and checkpoint the AUC after each run.
for row in todo:
    print(row)
    start_time = time.time()
    gene = row["gene"]
    model_name = row["model"]
    seed = row["seed"]
    num_genes = row["num_genes"]
    train_size = row["train_size"]

    # First entry of model_list whose name matches (IndexError if none does).
    model = [x for x in model_list if x.name == model_name][0]

    experiment = {
        "gene": gene,
        "model": model.name,
        "num_genes": num_genes,
        "train_size": train_size,
        "seed": seed,
    }

    # Binary target: 1 where this gene's value in dataset.df is > 0, otherwise 0.
    dataset.labels = dataset.df[gene].where(dataset.df[gene] > 0).notnull().astype("int")
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(dataset.df, dataset.labels, stratify=dataset.labels, train_size=train_size, test_size=test_size, random_state=seed)

    neighbors = gene_graph.bfs_sample_neighbors(gene, num_genes)

    # Keep only the sampled nodes as features, then overwrite the target gene's own
    # column with a constant — presumably so the model cannot read the label from it.
    X_train = X_train[list(neighbors.nodes)].copy()
    X_test = X_test[list(neighbors.nodes)].copy()
    X_train[gene] = 1
    X_test[gene] = 1
    # nx.to_numpy_matrix was removed in networkx 3.0; to_numpy_array returns the same
    # dense adjacency directly as an ndarray, so no np.asarray wrapper is needed.
    adj = nx.to_numpy_array(neighbors)
    model.fit(X_train, y_train, adj=adj)

    # Add a trailing singleton axis to the 2-D test matrix before handing it to the model.
    x_test = Variable(torch.FloatTensor(np.expand_dims(X_test.values, axis=2)), requires_grad=False).float()
    if cuda:
        x_test = x_test.cuda()

    # Predict in small chunks and keep column 1 of each prediction
    # (presumably the positive-class score — confirm against the model wrapper).
    y_hat = []
    for chunk in get_every_n(x_test, 10):
        y_hat.extend(model.predict(chunk)[:,1].data.cpu().numpy().tolist())
    auc = sklearn.metrics.roc_auc_score(y_test, np.asarray(y_hat).flatten())

    # Only unbinds the name; the instance itself is still referenced by model_list.
    del model
    experiment["auc"] = auc
    results = record_result(results, experiment, filename)
    print("time elapsed for genes: " +str(num_genes) + " : " + str(time.time() - start_time))


{'train_size': 50, 'seed': 3, 'gene': 'RPL4', 'num_genes': 8000, 'model': 'GCN_lay20_chan32_emb32_dropout_pool'}
epoch: 0, {'train': 0.49373433583959897, 'valid': 0.6799999999999999}
epoch: 1, {'train': 0.48621553884711777, 'valid': 0.39999999999999997}
epoch: 2, {'train': 0.7794486215538847, 'valid': 0.6000000000000001}
epoch: 3, {'train': 0.7067669172932332, 'valid': 0.6000000000000001}
epoch: 4, {'train': 0.4536340852130326, 'valid': 0.32}
epoch: 5, {'train': 0.7644110275689223, 'valid': 0.52}
epoch: 6, {'train': 0.7794486215538847, 'valid': 0.6}
epoch: 7, {'train': 0.8095238095238094, 'valid': 0.52}
epoch: 8, {'train': 0.8421052631578948, 'valid': 0.56}
epoch: 9, {'train': 0.8696741854636592, 'valid': 0.52}
epoch: 10, {'train': 0.8771929824561403, 'valid': 0.56}
epoch: 11, {'train': 0.899749373433584, 'valid': 0.56}
epoch: 12, {'train': 0.9147869674185464, 'valid': 0.56}
epoch: 13, {'train': 0.9197994987468672, 'valid': 0.56}
epoch: 14, {'train': 0.9273182957393483, 'valid': 0.52}


(Pdb)  l


106  	                processed_path = "/tmp/" + '{}.npy'.format(adj_hash)
107  	                if os.path.isfile(processed_path):
108  	                    ids = np.load(processed_path)
109  	                else:
110  	                    import pdb; pdb.set_trace()
111  ->	                    ids = sklearn.cluster.AgglomerativeClustering(n_clusters=n_clusters, affinity='euclidean',
112  	                                                                         memory='/tmp', connectivity=(current_adj > 0.).astype(bool),
113  	                                                                         compute_full_tree='auto', linkage='ward').fit_predict(adj.astype("float32"))
114  	                    np.save(processed_path, np.array(ids))
115  	            n_clusters = len(set(ids))
116  	            clusters = set([])


In [7]:
# One-off debugging setup: replays the data-preparation part of the experiment loop
# for the first outstanding experiment, with the neighbourhood size forced to 2000.
row = todo[0]
print(row)
start_time = time.time()  # not read again in this cell
gene = row["gene"]
model_name = row["model"]
seed = row["seed"]
num_genes = 2000  # overrides row["num_genes"]
train_size = row["train_size"]

# First entry of model_list whose name matches (IndexError if none does).
model = [x for x in model_list if x.name == model_name][0]

# Built for parity with the experiment loop; not read again in this cell.
experiment = {
    "gene": gene,
    "model": model.name,
    "num_genes": num_genes,
    "train_size": train_size,
    "seed": seed,
}

# Binary target: 1 where this gene's value in dataset.df is > 0, otherwise 0.
dataset.labels = dataset.df[gene].where(dataset.df[gene] > 0).notnull().astype("int")
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(dataset.df, dataset.labels, stratify=dataset.labels, train_size=train_size, test_size=test_size, random_state=seed)

neighbors = gene_graph.bfs_sample_neighbors(gene, num_genes)

# Keep only the sampled nodes as features and overwrite the target gene's column with a constant.
X_train = X_train[list(neighbors.nodes)].copy()
X_test = X_test[list(neighbors.nodes)].copy()
X_train[gene] = 1
X_test[gene] = 1
# nx.to_numpy_matrix was removed in networkx 3.0; to_numpy_array returns the same
# dense adjacency directly as an ndarray.
adj = nx.to_numpy_array(neighbors)


{'model': 'GCN_lay1_chan64_emb32_dropout_agg', 'seed': 0, 'num_genes': 4000, 'gene': 'IL5', 'train_size': 50}


In [8]:
# First 10 training rows with a trailing singleton axis added (expand_dims axis=2), as a CUDA float tensor.
x = Variable(torch.FloatTensor(np.expand_dims(X_train[:10].values, axis=2)), requires_grad=False).float().cuda()
# Rebinds `adj` from the NumPy adjacency to a CUDA float tensor.
# NOTE(review): not idempotent — re-running this cell feeds the already-converted tensor back into torch.FloatTensor.
adj = Variable(torch.FloatTensor(adj), requires_grad=False).cuda()

In [9]:
# NOTE(review): import placed mid-notebook; it belongs with the other imports in the first cell.
from models.model_wrapper import EmbeddingLayer
# Embedding layer built from the node count and a width of 32
# (argument meaning assumed from the names — confirm against EmbeddingLayer).
emb = EmbeddingLayer(num_genes, 32).cuda()
# Rebinds `x` to the embedded tensor, so this cell must run exactly once after `x` is created.
x = emb(x)


In [10]:
# Swap the last two axes of the 3-D tensor and make the result contiguous so it can be .view()'ed next.
# NOTE(review): rebinds `x`; running this cell twice swaps the axes back.
x = x.permute(0, 2, 1).contiguous() 


In [11]:
# Collapse all leading axes into one, keeping the last axis: the result is 2-D with shape (-1, x.size(-1)).
x = x.view(-1, x.size(-1))

In [30]:
# Timing experiment: neighbourhood max computed with a Python loop over the rows of adj.
temp = []
x_cpu = x.view(-1, x.size(-1)).cpu()  # NOTE(review): assigned but never used in this cell
print("starting")
start = time.time()
# NOTE(review): `row` and `neighbors` rebind names that the experiment cells also use
# (the todo entry and the sampled graph); re-run those cells before relying on either.
for row in adj:
    # Column indices where this adjacency row is non-zero.
    neighbors = row.nonzero().view(-1)
    # For every row of x, the largest value among those columns ([0] keeps values, drops argmax).
    temp.append(x[:, neighbors].max(dim=1)[0])
# One stacked entry per adjacency row, i.e. shape (adj rows, x rows) when x is 2-D.
max_value = torch.stack(temp).cuda()
print(time.time() - start)
max_value

starting
0.1645040512084961


tensor([[0.0490, 0.0281, 0.0000,  ..., 0.4717, 0.0000, 0.4486],
        [0.0858, 0.0776, 0.0931,  ..., 0.0397, 0.0356, 0.1249],
        [0.1027, 0.6095, 0.3313,  ..., 0.2056, 0.3080, 0.3055],
        ...,
        [0.2954, 0.3283, 0.3313,  ..., 0.1148, 0.6859, 0.8953],
        [0.3835, 0.3283, 0.6763,  ..., 0.1024, 0.2916, 0.2102],
        [0.5069, 0.3283, 0.6460,  ..., 0.1507, 0.1456, 0.1327]],
       device='cuda:0', grad_fn=<StackBackward>)

In [51]:
# Scratch check on the first row of x only; returns the (values, indices) pair from max.
# NOTE(review): assuming x is 2-D with N columns and adj is (N, N), the [0] slice has shape (N, 1),
# so the product is out[i, j] = x[0, i] * adj[i, j] and max(dim=1) reduces over j. That is a
# different reduction axis from the batched max(dim=1) form of this expression — confirm which is intended.
(x.view(-1, x.size(-1), 1).cpu()[0] * adj.cpu()).max(dim=1)

(tensor([0.0386, 0.0220, 0.0000,  ..., 0.0022, 0.0068, 0.0029],
        grad_fn=<MaxBackward0>),
 tensor([   1,    0, 1999,  ..., 1864,  115, 1974]))

In [31]:
# Vectorised neighbourhood max: x viewed as (rows, N, 1) times adj broadcasts to
# out[b, i, j] = x[b, i] * adj[i, j] (assuming adj is (N, N)); max(dim=1)[0] then reduces over i.
# Unlike the loop over adjacency rows, the values here are scaled by the edge weights.
# NOTE(review): this materialises a rows * N * N tensor on the CPU, which is large for big graphs.
(x.view(-1, x.size(-1), 1).cpu() * adj.cpu()).max(dim=1)[0]

tensor([[0.0220, 0.0386, 0.0257,  ..., 0.0086, 0.0097, 0.0161],
        [0.0045, 0.0056, 0.0488,  ..., 0.0094, 0.0084, 0.0887],
        [-0.0000, 0.0419, 0.0206,  ..., 0.0072, 0.0081, 0.0266],
        ...,
        [0.2123, 0.0068, 0.0123,  ..., 0.0085, 0.0006, 0.0115],
        [-0.0000, 0.0016, 0.0176,  ..., 0.0108, 0.0222, 0.0080],
        [0.2019, 0.0562, 0.0262,  ..., 0.0058, 0.0160, 0.0088]],
       grad_fn=<MaxBackward0>)

In [39]:
# Inspect the shape of the column-vector view of x used in the broadcast experiments.
x.view(-1, x.size(-1), 1).cpu().shape

torch.Size([320, 2000, 1])

In [64]:
# Inspect the adjacency size.
# NOTE(review): the saved output shows 16000 x 16000 although the debug setup cell pins
# num_genes = 2000, so this output presumably comes from a different run — confirm.
adj.shape

torch.Size([16000, 16000])

In [65]:
# Display the adjacency tensor (the repr is truncated by torch, so this is safe for large graphs).
adj

tensor([[0.0000, 0.4500, 0.2100,  ..., 0.0000, 0.0000, 0.0000],
        [0.4500, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.2100, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]],
       device='cuda:0')

In [None]:

for plot_gene in search_genes:

    %matplotlib inline
    plt.rcParams['figure.figsize'] = (7.5, 3.6)
    plot_train_size = 50

    subset = results[(results.train_size==plot_train_size) & 
                      (results.gene==plot_gene) & 
                      (results.num_genes!=400) &      
                      (results.num_genes> 0)]


    q = subset.groupby(['model','num_genes'])['auc']

    todo = list(subset["model"].unique())
    linestyles = ['-', '-', '--', '-.', ':']
    for ls, model in enumerate(sorted(todo)):
        index = list(q.mean()[model].index)
        mean = q.mean()[model]
        stderr = q.std()[model]/np.sqrt(q.count()[model])
        displayname = model.replace("CGN","GCN")
        displayname = displayname.replace("SLR", "SNLR")
        plt.errorbar(index, mean,label=displayname, xerr=0, yerr=stderr, ls=linestyles[ls])

    plt.title("Gene Inference " + plot_gene + " (train_size:" + str(plot_train_size) +")")
    plt.ylabel("AUC")
    plt.xlabel("Number of genes")
    plt.xscale("log")
    plt.xticks(sorted(subset["num_genes"].unique()))
    formatter = matplotlib.ticker.ScalarFormatter()
    plt.gca().xaxis.set_major_formatter(formatter)

    plt.legend();
    fd = len(list(gene_graph.nx_graph.neighbors(plot_gene)))
    print fd
    if fd > 50:
        plt.axvline(fd, ymin=0.4, ymax=1.0, c="black")
        c = plt.ylim()
        plt.text(fd*1.05,c[1]-((c[1]-c[0])*0.2),'First Degree',rotation=90)


    plt.savefig("experiments/results/sgi-" + plot_gene + "-" + "train" + str(plot_train_size) + ".png", bbox_inches='tight')
    plt.show()