In [33]:
import time
import os
import sys
import pickle
import networkx as nx
import pandas as pd
import numpy as np

import itertools
import sklearn
import torch
import datetime
import matplotlib, matplotlib.pyplot as plt
from collections import defaultdict
from scipy import sparse

from torch.autograd import Variable
from models.model_wrapper import MLP
from data import datasets
from data.gene_graphs import GeneManiaGraph, RegNetGraph
from data.utils import record_result
from data.clinical.datasets import TCGADataset, Task
from data.clinical import taskloader
from data.clinical import split_dataset

from models.model_wrapper import MLP, GCN, SLR

In [198]:
tcga = TCGADataset()
task_ids = taskloader.get_all_tasks(tcga)
tasks = [Task(tcga, task_id, limit=1000) for task_id in task_ids]

Torrent name: TCGA_tissue_ppi.hdf5, Size: 1748.32MB
Checking for pieces on disk: |██████████████████████████████████████████████████| 100.0% 
Found 1668 finished pieces out of 1668 total pieces.
Found dataset at /Users/martinweiss/.academictorrents-datastore/TCGA_tissue_ppi.hdf5


In [202]:
X_train, X_test, y_train, y_test = sklearn.model_selection.\
    train_test_split(tasks[0].data, tasks[0].labels, stratify=tasks[0].labels, 
                     train_size=500, test_size=50)


In [204]:
x_train = Variable(torch.FloatTensor(np.expand_dims(X_train.values, axis=2)), requires_grad=False).float()


In [34]:
graphs = {"genemania": GeneManiaGraph()}

Torrent name: genemania.pkl, Size: 9.61MB


In [49]:
adj = sparse.csr_matrix(np.array(nx.to_numpy_matrix(graphs["genemania"].nx_graph)))
tensor_adj = torch.FloatTensor(adj.toarray())

In [123]:
coo_data=adj.tocoo()
indices=torch.LongTensor([coo_data.row,coo_data.col])
centroids = torch.sparse.LongTensor(torch.LongTensor(indices), torch.ones(adj.data.size), adj.shape)



tensor(indices=tensor([[    0,     0,     0,  ..., 16299, 16299, 16299],
                       [ 3303,  5612,  7651,  ..., 16133, 16268, 16285]]),
       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]),
       size=(16300, 16300), nnz=529314, layout=torch.sparse_coo)

In [116]:
centroids.coalesce().values().sum()

tensor(294)

In [289]:
adj.sum()

47760.914966891905

In [94]:
mask = torch.LongTensor((adj.toarray() > 0).astype(int))

In [72]:
shape = 16000
indices = torch.LongTensor([np.arange(shape), np.arange(shape)])
sparse_eye = torch.sparse.LongTensor(indices, torch.ones(shape), adj.shape) 

tensor(indices=tensor([[    0,     1,     2,  ..., 15997, 15998, 15999],
                       [    0,     1,     2,  ..., 15997, 15998, 15999]]),
       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]),
       size=(16300, 16300), nnz=16000, layout=torch.sparse_coo)

In [130]:
coo_data=adj.tocoo()
indices=torch.LongTensor([coo_data.row,coo_data.col])
mask = torch.sparse.LongTensor(torch.LongTensor(indices), torch.FloatTensor(adj.data), adj.shape)


In [54]:
coo_data=adj.tocoo()
indices=torch.LongTensor([coo_data.row,coo_data.col])
sparse_adj = torch.sparse.LongTensor(torch.LongTensor(indices), torch.FloatTensor(adj.data), (adj.data.shape[0], adj.shape[0]))
tensor_adj.sparse_mask(mask.coalesce())

tensor(indices=tensor([[    0,     0,     0,  ..., 16299, 16299, 16299],
                       [ 3303,  5612,  7651,  ..., 16133, 16268, 16285]]),
       values=tensor([0.2200, 0.0230, 0.2500,  ..., 0.2400, 0.0110, 0.1500]),
       size=(16300, 16300), nnz=529314, layout=torch.sparse_coo)

In [162]:
D = adj.sum(axis=0)
D_inv = sparse.diags(np.array(1. / np.sqrt(D))[0], 0)
norm_transform = D_inv.dot(adj).dot(D_inv)


In [241]:
adj.data.size

545614

In [231]:
np.array(adj.sum(axis=0))[0]

(16300,)

In [248]:
ids = np.load(".cache/5a6e169dccfd78cbe322ef23a8e27d9e6d22d58a83bd6f68c4bfc9c726613d081f5b9922ed8e7a7b61c15e7442c799be6d22d58a83bd6f68c4bfc9c726613d088150.npy")

In [270]:
cluster_adj = sparse.lil_matrix(np.eye((16300)))

In [288]:
adj[i].toarray()[-1]

array([0., 0., 0., ..., 0., 0., 1.])

In [298]:
len(ids)

8150

In [293]:
start = time.time()
coo_data=adj.tocoo()
indices=torch.LongTensor([coo_data.row,coo_data.col])
centroids = torch.sparse.LongTensor(torch.LongTensor(indices), torch.ones(adj.data.size), adj.shape)
print(time.time() - start)

0.16740798950195312


In [295]:
centroids

tensor(indices=tensor([[    0,     0,     0,  ..., 16299, 16299, 16299],
                       [    0,  3303,  5612,  ..., 16268, 16285, 16299]]),
       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]),
       size=(16300, 16300), nnz=545614, layout=torch.sparse_coo)

In [313]:
sparse.csr_matrix(cluster_adj).copy()

<8000x16000 sparse matrix of type '<class 'numpy.float64'>'
	with 0 stored elements in Compressed Sparse Row format>

In [290]:
start = time.time()
cluster_adj = np.zeros((8000, 16000))
print(time.time() - start)

0.019582033157348633


In [281]:
start = time.time()

for i, cluster in enumerate(ids):
    arr_cluster_adj[cluster] += adj[i]
print(time.time() - start)

2.8473799228668213


In [311]:
np.zeros(adj.shape[0]).shape


(16300,)

In [307]:
arr_cluster_adj[44].sum()

9.237959947437048

In [280]:
arr_cluster_adj = cluster_adj.toarray()

In [245]:
adj_hash = joblib.hash(adj.data.tostring()) + joblib.hash(adj.indices.tostring()) + joblib.hash(sparse.csr_matrix(current_adj).data.tostring()) + joblib.hash(sparse.csr_matrix(current_adj).indices.tostring()) + str(n_clusters)


NameError: name 'current_adj' is not defined

In [225]:
adj.indices[ids]

(545614,)

In [223]:
for i, cluster in enumerate(range(10)):
    np.array(adj.sum(axis=0))[0]

array([1.8646    , 1.9921    , 2.77770001, ..., 1.77799999, 1.98464   ,
       6.60234005])

In [141]:
np.diag(D)

array([1.8646])

In [142]:
adj_sparse.setdiag(1. / np.sqrt(D))

<16000x16000 sparse matrix of type '<class 'numpy.float64'>'
	with 16000 stored elements (1 diagonals) in DIAgonal format>

In [315]:
adj.setdiag(1.)

In [319]:
adj[0].toarray()[0]

array([1., 0., 0., ..., 0., 0., 0.])

In [4]:
# Setup the results dictionary
filename = "experiments/results/clinical-tasks.pkl"
try:
    results = pickle.load(open(filename, "rb"), encoding='latin1')
    print("Loaded Checkpointed Results")
except Exception as e:
    print(e)
    results = pd.DataFrame(columns=['task', 'auc', 'gene', 'model', 'graph', 'seed', 'train_size'])
    print("Created a New Results Dictionary")


[Errno 2] No such file or directory: 'experiments/results/clinical-tasks.pkl'
Created a New Results Dictionary


In [5]:
train_size = 50
test_size = 500
trials = 3
cuda = False
models = [           
              GCN(name="GCN_lay20_chan32_emb32_dropout_pool_kmeans", cuda=cuda, dropout=True, num_layer=4, channels=32, embedding=32, prepool_extralayers=5, pooling="kmeans"),
              GCN(name="GCN_lay20_chan32_emb32_dropout_pool_hierarchy", cuda=cuda, dropout=True, num_layer=4, channels=32, embedding=32, prepool_extralayers=5, pooling="hierarchy"),
              GCN(name="GCN_lay20_chan32_emb32_dropout_pool_random", cuda=cuda, dropout=True,num_layer=4, channels=32, embedding=32, prepool_extralayers=5, pooling="random"),
              GCN(name="GCN_lay3_chan64_emb32_dropout", cuda=cuda, dropout=True, num_layer=3, channels=64, embedding=32),
              MLP(name="MLP_lay2_chan512_dropout", cuda=cuda, dropout=True, num_layer=2, channels=512),
              MLP(name="MLP_lay2_chan512", cuda=cuda, dropout=False, num_layer=2, channels=512),
              SLR(name="SLR_lambda1_l11", cuda=cuda)
             ]

In [6]:
# Create the set of all experiment ids and see which are left to do
columns = ["task", "graph", "model", "seed", "train_size"]
all_exp_ids = [x for x in itertools.product(tasks, graphs.keys(), models, range(trials), [train_size])]
all_exp_ids = pd.DataFrame(all_exp_ids, columns=columns)
all_exp_ids.index = ["-".join(map(str, tup[1:])) for tup in all_exp_ids.itertuples(name=None)]
results_exp_ids = results[columns].copy()
results_exp_ids.index = ["-".join(map(str, tup[1:])) for tup in results_exp_ids.itertuples(name=None)]
intersection_ids = all_exp_ids.index.intersection(results_exp_ids.index)
todo = all_exp_ids.drop(intersection_ids).to_dict(orient="records")

print("todo: " + str(len(todo)))
print("done: " + str(len(results)))

todo: 6111
done: 0


In [7]:
for row in todo:
#    if len(results) % 10 == 0:
    print(len(results))
    task = row["task"]
    graph_name = row["graph"]
    seed = row["seed"]
    model = row["model"]
    experiment = {
        "task": task.id,
        "model": model.name,
        "graph": graph_name,
        "seed": seed,
        "train_size": train_size,
    }

    try:
        X_train, X_test, y_train, y_test = sklearn.model_selection.\
            train_test_split(task.data, task.labels, stratify=task.labels, 
                             train_size=train_size, test_size=test_size)
    except ValueError:
        import pdb; pdb.set_trace()
        results = record_result(results, experiment, filename)
        continue

    X_train = X_train.copy()
    X_test = X_test.copy()
    try:
        gene_graph = graphs[graph_name]
        adj = np.asarray(nx.to_numpy_matrix(gene_graph.nx_graph))
        model.fit(X_train, y_train.astype("uint8"), adj=adj)

        x_test = Variable(torch.FloatTensor(np.expand_dims(X_test.values, axis=2)), requires_grad=False).float()
        if cuda:
            x_test = x_test.cuda()
        y_hat = model.predict(x_test)[:, 1].data.cpu().numpy()
        auc = sklearn.metrics.roc_auc_score(y_test.astype("uint8"), np.asarray(y_hat).flatten())
        model.best_model = None # cleanup
        experiment["auc"] = auc
    except Exception as e:
        import pdb; pdb.set_trace()
        print(e)
        
    results = record_result(results, experiment, filename)


0


KeyboardInterrupt: 