In [1]:
import os
import time
import pickle
import numpy as np
import matplotlib, matplotlib.pyplot as plt
import networkx as nx
import pandas as pd
import torch
import itertools
from torch.autograd import Variable
import sklearn, sklearn.model_selection, sklearn.metrics
import numpy as np
from scipy import sparse
from models.model_wrapper import MLP, GCN, SLR
from data import datasets
from data.gene_graphs import GeneManiaGraph
from data.utils import record_result

In [2]:
dataset = datasets.TCGADataset()
dataset.df = dataset.df - dataset.df.mean(axis=0)

Torrent name: TCGA_tissue_ppi.hdf5, Size: 1748.32MB
Converting one-hot labels to integers


In [3]:
# Setup the results dictionary
filename = "experiments/results/fig-5-sparse.pkl"
try:
    results = pickle.load(open(filename, "rb"), encoding='latin1')
    print("Loaded Checkpointed Results")
except Exception as e:
    results = pd.DataFrame(columns=['auc', 'gene', 'model', 'num_genes', 'seed', 'train_size', 'time_elapsed'])
    print("Created a New Results Dictionary")


Loaded Checkpointed Results


In [4]:
gene_graph = GeneManiaGraph()

Torrent name: genemania.pkl, Size: 9.61MB


In [113]:
from models.graph_layers import setup_aggregates, hierarchical_clustering
from models.model_wrapper import EmbeddingLayer
nb_genes = 2000
neighbors = gene_graph.bfs_sample_neighbors("RPL3", nb_genes)
adj = nx.to_scipy_sparse_matrix(neighbors)

adjs, centroids = setup_aggregates(adj, 3)

In [128]:
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(dataset.df, dataset.labels, stratify=dataset.labels, train_size=50, test_size=1000, random_state=1)
X_train = X_train[list(neighbors.nodes)].copy()
inputs = torch.FloatTensor(np.expand_dims(X_train, axis=2))[0: 10]
inputs = Variable(inputs, requires_grad=False).float()
embedded_inputs = EmbeddingLayer(nb_genes)(inputs)


In [129]:
embedded_inputs = embedded_inputs.permute(0, 2, 1).contiguous()
embedded_inputs = embedded_inputs.view(embedded_inputs.shape[0] * embedded_inputs.shape[1], -1)


In [152]:
embedded_inputs = embedded_inputs.cuda()

In [151]:
%timeit embedded_inputs[:, adjs[0][centroids[0][0]].nonzero()[1]].max(dim=0)

28.4 ms ± 739 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [123]:
%timeit centroids[0][0]

54.1 ns ± 0.606 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)


In [116]:
def sparse_max_pool(adj, centroids, x):
    temp = []
    x = x.permute(0, 2, 1).contiguous()
    shape = x.shape
    x = x.view(x.shape[0] * x.shape[1], -1)
    for i in range(int(adj.shape[0] / 2)):
        neighbors = centroids[i]
        if len(adj[neighbors].nonzero()[1]) != 0:
            temp.append(x[:, adj[neighbors].nonzero()[1]].max(dim=1)[0])
        else:
            temp.append(x[:, neighbors])
    x = torch.stack(temp)
    x = x.view(shape[0], shape[1], -1).permute(0, 2, 1).contiguous()  # put back in ex, node, channel
    return x


In [153]:
%timeit sparse_max_pool(adjs[0], centroids[0], inputs)

2.41 s ± 26 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [6]:
search_num_genes=[50, 100, 200, 300, 500, 1000, 2000, 4000, 8000, 16000]
test_size=1000
search_train_size=[50]
cuda=False
trials=5
search_genes = ["RPL4", "RPL5", "RPS10", "RPS3", "CEBPD", "IL5", "PABPC3", "PSMB10", "S100A8", "S100A9", "TOP1", "C15orf40", "RNF138", "DLGAP2", "EVI2B", "ZFP82", "MYBL2", "PSMB1", "CISD1", "HLA-B", "SAA2", "IFIT1", "RPS3A", "TP53", "TNF", "EGFR"]
model_list = [       
               GCN(name="GCN_lay3_chan64_emb32_dropout", cuda=cuda, num_layer=3, channels=64, embedding=32, pooling="hierarchy"),
#               GCN(name="GCN_lay20_chan32_emb32_dropout_pool_hierarchy", cuda=cuda, num_layer=4, channels=32, embedding=32, prepool_extralayers=5, pooling="hierarchy"),
              #GCN(name="GCN_lay20_chan32_emb32_dropout_pool_random", cuda=cuda, num_layer=4, channels=32, embedding=32, prepool_extralayers=5, pooling="random"),
#               GCN(name="GCN_lay3_chan64_emb32_dropout", cuda=cuda, num_layer=3, channels=64, embedding=32),
#               MLP(name="MLP_lay2_chan512_dropout", cuda=cuda, dropout=True, num_layer=2, channels=512),
#               MLP(name="MLP_lay2_chan512", cuda=cuda, dropout=False, num_layer=2, channels=512),
#               SLR(name="SLR_lambda1_l11", cuda=cuda)
             ]

In [7]:
# Create the set of all experiment ids and see which are left to do
model_names = [model.name for model in model_list]
columns = ["gene", "model", "num_genes", "train_size", "seed"]
all_exp_ids = [x for x in itertools.product(search_genes, model_names, search_num_genes, search_train_size, range(trials))]
all_exp_ids = pd.DataFrame(all_exp_ids, columns=columns)
all_exp_ids.index = ["-".join(map(str, tup[1:])) for tup in all_exp_ids.itertuples(name=None)]
results_exp_ids = results[columns].copy()
results_exp_ids.index = ["-".join(map(str, tup[1:])) for tup in results_exp_ids.itertuples(name=None)]
intersection_ids = all_exp_ids.index.intersection(results_exp_ids.index)
todo = all_exp_ids.drop(intersection_ids).to_dict(orient="records")
print("todo: " + str(len(todo)))
print("done: " + str(len(results)))

todo: 1293
done: 7


In [8]:
def get_every_n(a, n=2):
    for i in range(a.shape[0] // 2):
        yield a[2*i:2*(i+1)]


In [9]:
for row in todo:
    print(row)
    start_time = time.time()
    gene = row["gene"]
    model_name = row["model"]
    seed = row["seed"]
    num_genes = row["num_genes"] if row["num_genes"] < 10000 else 16300
    train_size = row["train_size"]

    model = [x for x in model_list if x.name == model_name][0]

    experiment = {
        "gene": gene,
        "model": model.name,
        "num_genes": num_genes,
        "train_size": train_size,
        "seed": seed,
    }

    dataset.labels = dataset.df[gene].where(dataset.df[gene] > 0).notnull().astype("int")
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(dataset.df, dataset.labels, stratify=dataset.labels, train_size=train_size, test_size=test_size, random_state=seed)
    
    if num_genes == 16300:
        neighbors = gene_graph.nx_graph
    else:
        neighbors = gene_graph.bfs_sample_neighbors(gene, num_genes)

    X_train = X_train[list(neighbors.nodes)].copy()
    X_test = X_test[list(neighbors.nodes)].copy()
    X_train[gene] = 1
    X_test[gene] = 1
    adj = np.asarray(nx.to_numpy_matrix(neighbors))
    model.fit(X_train, y_train, adj=adj)
    
    x_test = Variable(torch.FloatTensor(np.expand_dims(X_test.values, axis=2)), requires_grad=False).float()
    if cuda:
        x_test = x_test.cuda()
    
    y_hat = []
    for chunk in get_every_n(x_test, 10):
        y_hat.extend(model.predict(chunk)[:,1].data.cpu().numpy().tolist())
    auc = sklearn.metrics.roc_auc_score(y_test, np.asarray(y_hat).flatten())

    del model
    experiment["auc"] = auc
    experiment["time_elapsed"] = str(time.time() - start_time)
    results = record_result(results, experiment, filename)
    print("time elapsed for genes: " +str(num_genes) + " : " + str(time.time() - start_time))


{'seed': 2, 'train_size': 50, 'model': 'GCN_lay3_chan64_emb32_dropout', 'gene': 'RPL4', 'num_genes': 100}
> /home/martinweiss/gene-graph-conv/models/graph_layers.py(91)forward()
-> x = sparse_max_pool(self.csr_adj, self.centroids, x)


(Pdb)  self.csr_adj


<100x100 sparse matrix of type '<class 'numpy.float32'>'
	with 3702 stored elements in Compressed Sparse Row format>


(Pdb)  self.centroids


defaultdict(<class 'list'>, {0: array([0]), 1: array([1]), 2: array([ 2, 58, 75]), 3: array([3]), 4: array([4]), 5: array([ 5,  6,  9, 12, 14, 23, 24, 26, 30, 32, 33, 34, 35, 41, 45, 46, 48,
       52, 53, 57, 62, 68, 70, 71, 86, 89, 90, 92, 99]), 6: array([ 5,  6,  9, 12, 14, 23, 24, 26, 30, 32, 33, 34, 35, 41, 45, 46, 48,
       52, 53, 57, 62, 68, 70, 71, 86, 89, 90, 92, 99]), 7: array([7]), 8: array([ 8, 22, 27]), 9: array([ 5,  6,  9, 12, 14, 23, 24, 26, 30, 32, 33, 34, 35, 41, 45, 46, 48,
       52, 53, 57, 62, 68, 70, 71, 86, 89, 90, 92, 99]), 10: array([10, 13, 31, 49]), 11: array([11, 17, 43, 59, 91]), 12: array([ 5,  6,  9, 12, 14, 23, 24, 26, 30, 32, 33, 34, 35, 41, 45, 46, 48,
       52, 53, 57, 62, 68, 70, 71, 86, 89, 90, 92, 99]), 13: array([10, 13, 31, 49]), 14: array([ 5,  6,  9, 12, 14, 23, 24, 26, 30, 32, 33, 34, 35, 41, 45, 46, 48,
       52, 53, 57, 62, 68, 70, 71, 86, 89, 90, 92, 99]), 15: array([15]), 16: array([16]), 17: array([11, 17, 43, 59, 91]), 18: array([18

(Pdb)  q


BdbQuit: 

In [10]:
results

Unnamed: 0,auc,gene,model,num_genes,seed,train_size,time_elapsed
0,0.575046,RPL4,GCN_lay3_chan64_emb32_dropout,50,0,50,35.51251173019409
1,0.5,RPL4,GCN_lay3_chan64_emb32_dropout,50,1,50,6.390526533126831
2,0.544952,RPL4,GCN_lay3_chan64_emb32_dropout,50,2,50,3.7114007472991934
3,0.500721,RPL4,GCN_lay3_chan64_emb32_dropout,50,3,50,3.427546977996826
4,0.497917,RPL4,GCN_lay3_chan64_emb32_dropout,50,4,50,3.657351016998291
5,0.466869,RPL4,GCN_lay3_chan64_emb32_dropout,100,0,50,3.0078303813934326
6,0.469764,RPL4,GCN_lay3_chan64_emb32_dropout,100,1,50,2.421742677688598
7,0.5,RPL4,GCN_lay3_chan64_emb32_dropout,100,2,50,4.3778393268585205
8,0.5,RPL4,GCN_lay3_chan64_emb32_dropout,100,3,50,4.056507587432861
9,0.476294,RPL4,GCN_lay3_chan64_emb32_dropout,100,4,50,2.893102169036865


TypeError: norm_laplacian() missing 1 required positional argument: 'n_clusters'

In [18]:
adjs

NameError: name 'reload' is not defined

In [6]:
from collections import defaultdict

In [84]:
clusters = sklearn.cluster.AgglomerativeClustering(n_clusters=20, affinity='euclidean',
                                                     memory='.cache', connectivity=adj,
                                                     compute_full_tree='auto', linkage='ward').fit_predict(adj.toarray())


In [11]:
clusters = np.load(".cache/kmeans1f5b9922ed8e7a7b61c15e7442c799be6d22d58a83bd6f68c4bfc9c726613d088150.npy")


In [85]:
nx_adj = nx.OrderedGraph(nx.from_scipy_sparse_matrix(adj))
cluster_dict = defaultdict(list)


In [13]:
n_clusters = len(set(clusters))


In [144]:
coo

<50x50 sparse matrix of type '<class 'numpy.float64'>'
	with 784 stored elements in COOrdinate format>

In [142]:
new = sparse.csr_matrix((n_clusters, n_clusters))

In [184]:
coo.tocsr()

<50x50 sparse matrix of type '<class 'numpy.float64'>'
	with 141 stored elements in Compressed Sparse Row format>

In [180]:
cleaned = defaultdict(list)
for i, cluster in enumerate(clusters):
    cleaned[i] = min(np.argwhere(clusters == cluster).flatten())
    
coo = adj.tocoo()
for i, col in enumerate(coo.__dict__["col"]):
    coo.__dict__["col"][i] = cleaned[col]
for i, row in enumerate(coo.__dict__["row"]):
    coo.__dict__["row"][i] = cleaned[row]
coo.tocsr()

<50x50 sparse matrix of type '<class 'numpy.float64'>'
	with 141 stored elements in Compressed Sparse Row format>

defaultdict(list,
            {0: array([0]),
             1: array([1]),
             2: array([ 2,  3,  4,  5,  9, 10, 11, 13, 15, 20, 22, 27, 28, 31, 32, 33, 34,
                    36, 42, 43, 44, 46]),
             3: array([ 2,  3,  4,  5,  9, 10, 11, 13, 15, 20, 22, 27, 28, 31, 32, 33, 34,
                    36, 42, 43, 44, 46]),
             4: array([ 2,  3,  4,  5,  9, 10, 11, 13, 15, 20, 22, 27, 28, 31, 32, 33, 34,
                    36, 42, 43, 44, 46]),
             5: array([ 2,  3,  4,  5,  9, 10, 11, 13, 15, 20, 22, 27, 28, 31, 32, 33, 34,
                    36, 42, 43, 44, 46]),
             6: array([ 6,  7, 37, 39]),
             7: array([ 6,  7, 37, 39]),
             8: array([ 8, 30, 45, 49]),
             9: array([ 2,  3,  4,  5,  9, 10, 11, 13, 15, 20, 22, 27, 28, 31, 32, 33, 34,
                    36, 42, 43, 44, 46]),
             10: array([ 2,  3,  4,  5,  9, 10, 11, 13, 15, 20, 22, 27, 28, 31, 32, 33, 34,
                    36, 42, 43, 44, 46]),
    

In [101]:
adj.toarray()

array([0.    , 0.06  , 0.013 , 0.011 , 0.012 , 0.0016, 0.01  , 0.011 ,
       0.0079, 0.0033, 0.0011, 0.0084, 0.029 , 0.0039, 0.092 , 0.012 ,
       0.014 , 0.015 , 0.053 , 0.11  , 0.014 , 0.01  , 0.002 , 0.037 ,
       0.12  , 0.094 , 0.012 , 0.019 , 0.0052, 0.034 , 0.0073, 0.019 ,
       0.0021, 0.0018, 0.0043, 0.046 , 0.0024, 0.0079, 0.011 , 0.0069,
       0.016 , 0.043 , 0.014 , 0.0038, 0.018 , 0.0071, 0.0059, 0.012 ,
       0.0074, 0.012 ])

In [105]:
coo.toarray()[19]

array([0.0358, 0.042 , 0.0343, 0.012 , 0.1778, 0.094 , 0.0194, 0.046 ,
       0.043 , 0.11  , 0.06  , 0.092 , 0.12  , 0.053 , 0.01  , 0.014 ,
       0.037 , 0.029 , 0.034 , 0.    , 0.    , 0.    , 0.    , 0.    ,
       0.    , 0.    , 0.    , 0.    , 0.    , 0.    , 0.    , 0.    ,
       0.    , 0.    , 0.    , 0.    , 0.    , 0.    , 0.    , 0.    ,
       0.    , 0.    , 0.    , 0.    , 0.    , 0.    , 0.    , 0.    ,
       0.    , 0.    ])

In [25]:
cluster_adj = sparse.csr_matrix(np.zeros((n_clusters, adj.shape[0])))

start = time.time()

cluster_dict = defaultdict(list)
for i, cluster in enumerate(clusters):
    for index in np.argwhere(clusters == cluster):
        cluster_adj[cluster] += adj[index]

print(start - time.time())

  


KeyboardInterrupt: 

In [None]:
cluster_adj

NameError: name 'n_clusters' is not defined

NameError: name 'clusters' is not defined

In [51]:
cluster_dict = defaultdict(list)
for i, cluster in enumerate(ids):
    cluster_dict[cluster].append(i)


In [52]:
for key, val in cluster_dict.items():
    first_node = val[0]
    for other_node in val[1:]:
        adj = nx.contracted_nodes(adj, first_node, other_node)


KeyboardInterrupt: 

In [55]:
adj = nx.to_scipy_sparse_matrix(adj)


In [69]:
clusters[clusters == 4560]

AttributeError: 'numpy.ndarray' object has no attribute 'indices'

In [62]:
cluster_adj = np.zeros((n_clusters, adj.shape[0]))
for i, cluster in enumerate(clusters):
    cluster_adj[cluster] += adj[i]

In [63]:
cluster_adj

(8150, 16300)

In [33]:
for key, val in cluster_dict.items():
    first_node = val[0]
    for other_node in val[1:]:
        uf.union(first_node, other_node)


In [149]:
#neighbors = gene_graph.bfs_sample_neighbors(gene, num_genes)
adj = nx.to_scipy_sparse_matrix(gene_graph.nx_graph)


In [None]:
ids = sklearn.cluster.AgglomerativeClustering(n_clusters=20, affinity='euclidean',
                                                     memory='.cache', connectivity=adj,
                                                     compute_full_tree='auto', linkage='ward').fit_predict(adj)


In [131]:
from scipy import sparse
from collections import defaultdict
nx_adj = nx.OrderedGraph(nx.from_scipy_sparse_matrix(adj))
clustered = nx_adj
cluster_dict = defaultdict(list)
for i, cluster in enumerate(ids):
    cluster_dict[cluster].append(i)
for key, val in cluster_dict.items():
    first_node = val[0]
    for other_node in val[1:]:
        clustered = nx.contracted_nodes(clustered, first_node, other_node)


In [138]:
nx.to_numpy_matrix(clustered)

matrix([[0.     , 0.032  , 0.0074 , 0.11   , 0.0061 , 0.0031 , 0.015  ,
         0.01   , 0.04   , 0.011  , 0.0023 , 0.047  , 0.11   , 0.014  ,
         0.0058 , 0.013  , 0.0065 , 0.0043 , 0.0056 , 0.015  ],
        [0.032  , 0.     , 0.     , 0.     , 0.     , 0.016  , 0.     ,
         0.     , 0.     , 0.     , 0.     , 0.     , 0.     , 0.     ,
         0.034  , 0.     , 0.     , 0.     , 0.     , 0.     ],
        [0.0074 , 0.     , 0.     , 0.     , 0.0072 , 0.0077 , 0.     ,
         0.013  , 0.     , 0.013  , 0.011  , 0.077  , 0.     , 0.019  ,
         0.007  , 0.019  , 0.0078 , 0.     , 0.0067 , 0.018  ],
        [0.11   , 0.     , 0.     , 0.     , 0.     , 0.     , 0.     ,
         0.     , 0.     , 0.     , 0.     , 0.     , 0.     , 0.     ,
         0.     , 0.     , 0.     , 0.     , 0.     , 0.     ],
        [0.0061 , 0.     , 0.0072 , 0.     , 0.0035 , 0.003  , 0.0086 ,
         0.0043 , 0.     , 0.011  , 0.0023 , 0.0081 , 0.     , 0.017  ,
         0.0057 , 0.016 

NodeView((0, 3))

In [13]:
clusters = np.array([13, 19, 14, 23,  4,  0,  0, 20,  2, 24,  0,  0,  0,  0,  6, 21, 17,
        3,  1, 12, 15,  1,  2, 11,  0, 18,  0,  2,  5,  8,  0,  0,  4,  0,
        0,  0, 16,  9,  1,  7, 22,  0, 10,  3,  1,  6,  0,  5,  0,  4])
adj.shape
new_adj = np.zeros((25, 25))
for i, cluster in enumerate(clusters):
    new_adj[cluster] = 

(50, 50)

In [7]:
row = todo[0]
print(row)
start_time = time.time()
gene = row["gene"]
model_name = row["model"]
seed = row["seed"]
num_genes = 2000
train_size = row["train_size"]

model = [x for x in model_list if x.name == model_name][0]

experiment = {
    "gene": gene,
    "model": model.name,
    "num_genes": num_genes,
    "train_size": train_size,
    "seed": seed,
}

dataset.labels = dataset.df[gene].where(dataset.df[gene] > 0).notnull().astype("int")
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(dataset.df, dataset.labels, stratify=dataset.labels, train_size=train_size, test_size=test_size, random_state=seed)

neighbors = gene_graph.bfs_sample_neighbors(gene, num_genes)

X_train = X_train[list(neighbors.nodes)].copy()
X_test = X_test[list(neighbors.nodes)].copy()
X_train[gene] = 1
X_test[gene] = 1
adj = np.asarray(nx.to_numpy_matrix(neighbors))


{'model': 'GCN_lay1_chan64_emb32_dropout_agg', 'seed': 0, 'num_genes': 4000, 'gene': 'IL5', 'train_size': 50}


In [None]:

for plot_gene in search_genes:

    %matplotlib inline
    plt.rcParams['figure.figsize'] = (7.5, 3.6)
    plot_train_size = 50

    subset = results[(results.train_size==plot_train_size) & 
                      (results.gene==plot_gene) & 
                      (results.num_genes!=400) &      
                      (results.num_genes> 0)]


    q = subset.groupby(['model','num_genes'])['auc']

    todo = list(subset["model"].unique())
    linestyles = ['-', '-', '--', '-.', ':']
    for ls, model in enumerate(sorted(todo)):
        index = list(q.mean()[model].index)
        mean = q.mean()[model]
        stderr = q.std()[model]/np.sqrt(q.count()[model])
        displayname = model.replace("CGN","GCN")
        displayname = displayname.replace("SLR", "SNLR")
        plt.errorbar(index, mean,label=displayname, xerr=0, yerr=stderr, ls=linestyles[ls])

    plt.title("Gene Inference " + plot_gene + " (train_size:" + str(plot_train_size) +")")
    plt.ylabel("AUC")
    plt.xlabel("Number of genes")
    plt.xscale("log")
    plt.xticks(sorted(subset["num_genes"].unique()))
    formatter = matplotlib.ticker.ScalarFormatter()
    plt.gca().xaxis.set_major_formatter(formatter)

    plt.legend();
    fd = len(list(gene_graph.nx_graph.neighbors(plot_gene)))
    print fd
    if fd > 50:
        plt.axvline(fd, ymin=0.4, ymax=1.0, c="black")
        c = plt.ylim()
        plt.text(fd*1.05,c[1]-((c[1]-c[0])*0.2),'First Degree',rotation=90)


    plt.savefig("experiments/results/sgi-" + plot_gene + "-" + "train" + str(plot_train_size) + ".png", bbox_inches='tight')
    plt.show()