In [2]:
import os 
import sys
import pickle
import networkx as nx
import pandas as pd
import numpy as np

import itertools
import sklearn
import torch
import datetime
import matplotlib, matplotlib.pyplot as plt
from collections import defaultdict

from torch.autograd import Variable
from models.model_wrapper import MLP
from data import datasets
from data.gene_graphs import GeneManiaGraph, RegNetGraph
from data.utils import record_result
from data.clinical.datasets import TCGADataset, Task
from data.clinical import taskloader
from data.clinical import split_dataset

from models.model_wrapper import MLP, GCN, SLR

In [3]:
tcga = TCGADataset()
task_id = "_EVENT-BRCA"
tasks = [Task(tcga, task_id, limit=1000)]

Torrent name: TCGA_tissue_ppi.hdf5, Size: 1748.32MB
Checking for pieces on disk: |██████████████████████████████████████████████████| 100.0% 
Found 1668 finished pieces out of 1668 total pieces.
Found dataset at /Users/martinweiss/.academictorrents-datastore/TCGA_tissue_ppi.hdf5


In [4]:
graphs = {"genemania": GeneManiaGraph()}


Torrent name: genemania.pkl, Size: 9.61MB
Checking for pieces on disk: |██████████████████████████████████████████████████| 100.0% 
Found 294 finished pieces out of 294 total pieces.
Found dataset at /Users/martinweiss/.academictorrents-datastore/genemania.pkl


In [5]:
# Setup the results dictionary
filename = "experiments/results/graph-gen-clinical.pkl"
try:
    results = pickle.load(open(filename, "rb"), encoding='latin1')
    print("Loaded Checkpointed Results")
except Exception as e:
    print(e)
    results = pd.DataFrame(columns=['task', 'auc', 'gene', 'model', 'graph', 'seed', 'train_size', 'optimize_graph_results'])
    print("Created a New Results Dictionary")


Loaded Checkpointed Results


In [6]:
train_size = 50
test_size = 500
trials = 3
cuda = False
models = [           
              GCN(name="GCN_lay20_chan32_emb32_dropout_pool", cuda=cuda, num_layer=4, channels=32, embedding=32, prepool_extralayers=5, pooling="hierarchy"),
              GCN(name="GCN_lay3_chan64_emb32_dropout", cuda=cuda, num_layer=3, channels=64, embedding=32),
              MLP(name="MLP_lay2_chan512_dropout", cuda=cuda, dropout=True, num_layer=2, channels=512),
              MLP(name="MLP_lay2_chan512", cuda=cuda, dropout=False, num_layer=2, channels=512),
              SLR(name="SLR_lambda1_l11", cuda=cuda)
             ]

In [7]:
# Create the set of all experiment ids and see which are left to do
columns = ["task", "graph", "model", "seed", "train_size"]
all_exp_ids = [x for x in itertools.product(tasks, graphs.keys(), models, range(trials), [train_size])]
all_exp_ids = pd.DataFrame(all_exp_ids, columns=columns)
all_exp_ids.index = ["-".join(map(str, tup[1:])) for tup in all_exp_ids.itertuples(name=None)]
results_exp_ids = results[columns].copy()
results_exp_ids.index = ["-".join(map(str, tup[1:])) for tup in results_exp_ids.itertuples(name=None)]
intersection_ids = all_exp_ids.index.intersection(results_exp_ids.index)
todo = all_exp_ids.drop(intersection_ids).to_dict(orient="records")

print("todo: " + str(len(todo)))
print("done: " + str(len(results)))

todo: 15
done: 2


In [8]:
def optimize_graph(model, gene_graph, gene, X_train, X_test, y_train, y_test):
    neighbors = list(gene_graph.first_degree(gene)[0])
    neighbors = [n for n in neighbors if n in X_train.columns.values]
    res = {}
    for neighbor in neighbors:
        candidate_nodes = neighbors if gene == neighbor else set(neighbors) - set(neighbor)
        x_train = X_train.loc[:, candidate_nodes].copy()
        x_test = X_test.loc[:, candidate_nodes].copy()
        x_train[gene] = 1
        x_test[gene] = 1

        try:
            model.fit(x_train, y_train)
            x_test = Variable(torch.FloatTensor(np.expand_dims(x_test.values, axis=2)), requires_grad=False).float()
            if cuda:
                x_test = x_test.cuda()
            y_hat = model.predict(x_test)[:, 1].data.cpu().numpy()
            auc = sklearn.metrics.roc_auc_score(y_test, np.asarray(y_hat).flatten())
            res[neighbor] = auc
            model.best_model = None # cleanup

        except Exception as e:
            print(e)
    return res


In [9]:
optimize_graph_results = []

for row in todo:
    if len(results) % 10 == 0:
        print(len(results))
    task = row["task"]
    graph_name = row["graph"]
    seed = row["seed"]
    model = row["model"]

    experiment = {
        "task": task.id,
        "model": model.name,
        "graph": graph_name,
        "seed": seed,
        "train_size": train_size,
        "optimize_graph_results": None,
    }

    try:
        X_train, X_test, y_train, y_test = sklearn.model_selection.\
            train_test_split(task.data, task.labels, stratify=task.labels, 
                             train_size=train_size, test_size=test_size)
    except ValueError:
        import pdb; pdb.set_trace()
        results = record_result(results, experiment, filename)
        continue

    X_train = X_train.copy()
    X_test = X_test.copy()

    try:
        gene_graph = graphs[graph_name]
        adj = np.asarray(nx.to_numpy_matrix(gene_graph.nx_graph))
        import pdb; pdb.set_trace()
        model.fit(X_train, y_train.astype("uint8"), adj=adj)

        x_test = Variable(torch.FloatTensor(np.expand_dims(X_test.values, axis=2)), requires_grad=False).float()
        if cuda:
            x_test = x_test.cuda()
        y_hat = model.predict(x_test)[:, 1].data.cpu().numpy()
        auc = sklearn.metrics.roc_auc_score(y_test.astype("uint8"), np.asarray(y_hat).flatten())
        model.best_model = None # cleanup
        experiment["auc"] = auc
    except Exception as e:
        import pdb; pdb.set_trace()
        print(e)
        
    results = record_result(results, experiment, filename)


> <ipython-input-9-b1ce36ff66c3>(36)<module>()
-> model.fit(X_train, y_train.astype("uint8"), adj=adj)


(Pdb)  s


--Call--
> /Users/martinweiss/code/academic/conv-graph/venv/lib/python3.5/site-packages/pandas/util/_decorators.py(136)wrapper()
-> @wraps(func)


(Pdb)  n


> /Users/martinweiss/code/academic/conv-graph/venv/lib/python3.5/site-packages/pandas/util/_decorators.py(138)wrapper()
-> old_arg_value = kwargs.pop(old_arg_name, None)


(Pdb)  


> /Users/martinweiss/code/academic/conv-graph/venv/lib/python3.5/site-packages/pandas/util/_decorators.py(140)wrapper()
-> if new_arg_name is None and old_arg_value is not None:


(Pdb)  


> /Users/martinweiss/code/academic/conv-graph/venv/lib/python3.5/site-packages/pandas/util/_decorators.py(150)wrapper()
-> if old_arg_value is not None:


(Pdb)  


> /Users/martinweiss/code/academic/conv-graph/venv/lib/python3.5/site-packages/pandas/util/_decorators.py(178)wrapper()
-> return func(*args, **kwargs)


(Pdb)  l


173  	                           "not both").format(old_name=old_arg_name,
174  	                                              new_name=new_arg_name)
175  	                    raise TypeError(msg)
176  	                else:
177  	                    kwargs[new_arg_name] = new_arg_value
178  ->	            return func(*args, **kwargs)
179  	        return wrapper
180  	    return _deprecate_kwarg
181  	
182  	
183  	def rewrite_axis_style_signature(name, extra_params):


(Pdb)  n


--Return--
> /Users/martinweiss/code/academic/conv-graph/venv/lib/python3.5/site-packages/pandas/util/_decorators.py(178)wrapper()->TCGA-BH-A0HF-...0
dtype: uint8
-> return func(*args, **kwargs)


(Pdb)  


--Call--
> /Users/martinweiss/code/academic/conv-graph/models/model_wrapper.py(55)fit()
-> def fit(self, X, y, adj=None):


(Pdb)  


> /Users/martinweiss/code/academic/conv-graph/models/model_wrapper.py(56)fit()
-> self.adj = adj


(Pdb)  


> /Users/martinweiss/code/academic/conv-graph/models/model_wrapper.py(57)fit()
-> self.X = X


(Pdb)  


> /Users/martinweiss/code/academic/conv-graph/models/model_wrapper.py(58)fit()
-> self.setup_layers()


(Pdb)  s


--Call--
> /Users/martinweiss/code/academic/conv-graph/models/model_wrapper.py(311)setup_layers()
-> def setup_layers(self):


(Pdb)  n


> /Users/martinweiss/code/academic/conv-graph/models/model_wrapper.py(312)setup_layers()
-> adj_transforms, aggregate_adj = get_transform(self.adj, self.on_cuda, num_layer=self.num_layer, pooling=self.pooling)


(Pdb)  s


--Call--
> /Users/martinweiss/code/academic/conv-graph/models/graph_layers.py(409)get_transform()
-> def get_transform(adj, cuda, add_self=True, norm_adj=True, num_layer=1, pooling="ignore"):


(Pdb)  n


> /Users/martinweiss/code/academic/conv-graph/models/graph_layers.py(416)get_transform()
-> adj_transforms = []


(Pdb)  


> /Users/martinweiss/code/academic/conv-graph/models/graph_layers.py(417)get_transform()
-> if add_self:


(Pdb)  


> /Users/martinweiss/code/academic/conv-graph/models/graph_layers.py(418)get_transform()
-> logging.info("Adding self connection to the graph...")


(Pdb)  


> /Users/martinweiss/code/academic/conv-graph/models/graph_layers.py(419)get_transform()
-> adj_transforms += [lambda layer_id: SelfConnection(add_self, please_ignore=False)]


(Pdb)  


> /Users/martinweiss/code/academic/conv-graph/models/graph_layers.py(421)get_transform()
-> if norm_adj:


(Pdb)  


> /Users/martinweiss/code/academic/conv-graph/models/graph_layers.py(422)get_transform()
-> logging.info("Normalizing the graph...")


(Pdb)  


> /Users/martinweiss/code/academic/conv-graph/models/graph_layers.py(423)get_transform()
-> adj_transforms += [lambda layer_id: ApprNormalizeLaplacian()]


(Pdb)  s


> /Users/martinweiss/code/academic/conv-graph/models/graph_layers.py(425)get_transform()
-> adj_transforms = transforms.Compose(adj_transforms)


(Pdb)  n


> /Users/martinweiss/code/academic/conv-graph/models/graph_layers.py(426)get_transform()
-> aggregator = AggregationGraph(adj, num_layer, adj_transforms=adj_transforms, cuda=cuda, cluster_type=pooling)


(Pdb)  s


--Call--
> /Users/martinweiss/code/academic/conv-graph/models/graph_layers.py(76)__init__()
-> def __init__(self, adj, nb_layer, adj_transforms=None, cuda=False, cluster_type=None, **kwargs):


(Pdb)  n


> /Users/martinweiss/code/academic/conv-graph/models/graph_layers.py(78)__init__()
-> self.nb_layer = nb_layer


(Pdb)  


> /Users/martinweiss/code/academic/conv-graph/models/graph_layers.py(79)__init__()
-> self.adj = adj


(Pdb)  


> /Users/martinweiss/code/academic/conv-graph/models/graph_layers.py(80)__init__()
-> self.cuda = cuda


(Pdb)  


> /Users/martinweiss/code/academic/conv-graph/models/graph_layers.py(81)__init__()
-> self.adj_transforms = adj_transforms


(Pdb)  


> /Users/martinweiss/code/academic/conv-graph/models/graph_layers.py(82)__init__()
-> self.cluster_type = cluster_type


(Pdb)  


> /Users/martinweiss/code/academic/conv-graph/models/graph_layers.py(87)__init__()
-> all_to_keep = []  # At each agregation, which node to keep.


(Pdb)  


> /Users/martinweiss/code/academic/conv-graph/models/graph_layers.py(88)__init__()
-> all_aggregate_adjs = []  # At each agregation, which node are connected to whom.


(Pdb)  


> /Users/martinweiss/code/academic/conv-graph/models/graph_layers.py(89)__init__()
-> all_transformed_adj = []  # At each layer, the transformed adj (normalized, etc.


(Pdb)  


> /Users/martinweiss/code/academic/conv-graph/models/graph_layers.py(90)__init__()
-> last_to_keep = np.ones((self.adj.shape[0]))


(Pdb)  


> /Users/martinweiss/code/academic/conv-graph/models/graph_layers.py(91)__init__()
-> current_adj = self.adj.copy()


(Pdb)  


> /Users/martinweiss/code/academic/conv-graph/models/graph_layers.py(94)__init__()
-> for layer_id in range(self.nb_layer):


(Pdb)  


> /Users/martinweiss/code/academic/conv-graph/models/graph_layers.py(95)__init__()
-> if self.adj_transforms:  # Transform the adj if necessary.


(Pdb)  


> /Users/martinweiss/code/academic/conv-graph/models/graph_layers.py(96)__init__()
-> current_adj = self.adj_transforms(layer_id)(current_adj)


(Pdb)  


> /Users/martinweiss/code/academic/conv-graph/models/graph_layers.py(98)__init__()
-> all_transformed_adj.append(sparse.csr_matrix(current_adj))


(Pdb)  


> /Users/martinweiss/code/academic/conv-graph/models/graph_layers.py(100)__init__()
-> nb_nodes = current_adj.shape[0]


(Pdb)  


> /Users/martinweiss/code/academic/conv-graph/models/graph_layers.py(101)__init__()
-> ids = range(current_adj.shape[0])


(Pdb)  


> /Users/martinweiss/code/academic/conv-graph/models/graph_layers.py(102)__init__()
-> if self.cluster_type == "hierarchy":


(Pdb)  


> /Users/martinweiss/code/academic/conv-graph/models/graph_layers.py(103)__init__()
-> n_clusters = int(nb_nodes / (2 ** (layer_id + 1)))


(Pdb)  


> /Users/martinweiss/code/academic/conv-graph/models/graph_layers.py(104)__init__()
-> temp_sparse_adj = sparse.csr_matrix(adj)


(Pdb)  


> /Users/martinweiss/code/academic/conv-graph/models/graph_layers.py(105)__init__()
-> adj_hash = joblib.hash(temp_sparse_adj.data.tostring()) + joblib.hash(temp_sparse_adj.indices.tostring()) + joblib.hash(sparse.csr_matrix(current_adj).data.tostring()) + joblib.hash(sparse.csr_matrix(current_adj).indices.tostring()) + str(n_clusters)


(Pdb)  n


> /Users/martinweiss/code/academic/conv-graph/models/graph_layers.py(106)__init__()
-> processed_path = ".cache/" + '{}.npy'.format(adj_hash)


(Pdb)  


> /Users/martinweiss/code/academic/conv-graph/models/graph_layers.py(107)__init__()
-> if os.path.isfile(processed_path):


(Pdb)  processed_path


'.cache/5a6e169dccfd78cbe322ef23a8e27d9e6d22d58a83bd6f68c4bfc9c726613d081f5b9922ed8e7a7b61c15e7442c799be6d22d58a83bd6f68c4bfc9c726613d088150.npy'


(Pdb)  n


> /Users/martinweiss/code/academic/conv-graph/models/graph_layers.py(108)__init__()
-> ids = np.load(processed_path)


(Pdb)  


> /Users/martinweiss/code/academic/conv-graph/models/graph_layers.py(114)__init__()
-> n_clusters = len(set(ids))


(Pdb)  


> /Users/martinweiss/code/academic/conv-graph/models/graph_layers.py(115)__init__()
-> clusters = set([])


(Pdb)  


> /Users/martinweiss/code/academic/conv-graph/models/graph_layers.py(116)__init__()
-> to_keep = np.zeros((current_adj.shape[0],))


(Pdb)  


> /Users/martinweiss/code/academic/conv-graph/models/graph_layers.py(117)__init__()
-> cluster_adj = np.zeros((n_clusters, self.adj.shape[0]))


(Pdb)  


> /Users/martinweiss/code/academic/conv-graph/models/graph_layers.py(119)__init__()
-> for i, cluster in enumerate(ids):


(Pdb)  


> /Users/martinweiss/code/academic/conv-graph/models/graph_layers.py(120)__init__()
-> if last_to_keep[i] == 1.:  # To keep a node, it had to be a centroid of a previous layer. Otherwise it might not work.


(Pdb)  


> /Users/martinweiss/code/academic/conv-graph/models/graph_layers.py(121)__init__()
-> if cluster not in clusters:


(Pdb)  


> /Users/martinweiss/code/academic/conv-graph/models/graph_layers.py(122)__init__()
-> clusters.add(cluster)


(Pdb)  l


117  	            cluster_adj = np.zeros((n_clusters, self.adj.shape[0]))
118  	
119  	            for i, cluster in enumerate(ids):
120  	                if last_to_keep[i] == 1.:  # To keep a node, it had to be a centroid of a previous layer. Otherwise it might not work.
121  	                    if cluster not in clusters:
122  ->	                        clusters.add(cluster)
123  	                        to_keep[i] = 1.
124  	                cluster_adj[cluster] += current_adj[i]  # The centroid is the merged of all the adj of all the nodes inside it.
125  	
126  	            # rewrite the adj matrix.
127  	            new_adj = np.zeros((current_adj.shape[0], current_adj.shape[0]))


(Pdb)  l


128  	            for i, cluster in enumerate(ids):
129  	                new_adj[i] += (cluster_adj[cluster] > 0.).astype(int)
130  	
131  	            all_to_keep.append(to_keep)
132  	            all_aggregate_adjs.append(sparse.csr_matrix(new_adj))
133  	            current_adj = new_adj
134  	
135  	        self.to_keeps = all_to_keep
136  	        self.aggregate_adjs = all_aggregate_adjs
137  	        self.adjs = all_transformed_adj
138  	


(Pdb)  until 133


> /Users/martinweiss/code/academic/conv-graph/models/graph_layers.py(133)__init__()
-> current_adj = new_adj


(Pdb)  n


> /Users/martinweiss/code/academic/conv-graph/models/graph_layers.py(94)__init__()
-> for layer_id in range(self.nb_layer):


(Pdb)  


> /Users/martinweiss/code/academic/conv-graph/models/graph_layers.py(95)__init__()
-> if self.adj_transforms:  # Transform the adj if necessary.


(Pdb)  


> /Users/martinweiss/code/academic/conv-graph/models/graph_layers.py(96)__init__()
-> current_adj = self.adj_transforms(layer_id)(current_adj)


(Pdb)  


> /Users/martinweiss/code/academic/conv-graph/models/graph_layers.py(98)__init__()
-> all_transformed_adj.append(sparse.csr_matrix(current_adj))


(Pdb)  


> /Users/martinweiss/code/academic/conv-graph/models/graph_layers.py(100)__init__()
-> nb_nodes = current_adj.shape[0]


(Pdb)  


> /Users/martinweiss/code/academic/conv-graph/models/graph_layers.py(101)__init__()
-> ids = range(current_adj.shape[0])


(Pdb)  


> /Users/martinweiss/code/academic/conv-graph/models/graph_layers.py(102)__init__()
-> if self.cluster_type == "hierarchy":


(Pdb)  


> /Users/martinweiss/code/academic/conv-graph/models/graph_layers.py(103)__init__()
-> n_clusters = int(nb_nodes / (2 ** (layer_id + 1)))


(Pdb)  


> /Users/martinweiss/code/academic/conv-graph/models/graph_layers.py(104)__init__()
-> temp_sparse_adj = sparse.csr_matrix(adj)


(Pdb)  


> /Users/martinweiss/code/academic/conv-graph/models/graph_layers.py(105)__init__()
-> adj_hash = joblib.hash(temp_sparse_adj.data.tostring()) + joblib.hash(temp_sparse_adj.indices.tostring()) + joblib.hash(sparse.csr_matrix(current_adj).data.tostring()) + joblib.hash(sparse.csr_matrix(current_adj).indices.tostring()) + str(n_clusters)


(Pdb)  n


> /Users/martinweiss/code/academic/conv-graph/models/graph_layers.py(106)__init__()
-> processed_path = ".cache/" + '{}.npy'.format(adj_hash)


(Pdb)  n


> /Users/martinweiss/code/academic/conv-graph/models/graph_layers.py(107)__init__()
-> if os.path.isfile(processed_path):


(Pdb)  


> /Users/martinweiss/code/academic/conv-graph/models/graph_layers.py(110)__init__()
-> ids = sklearn.cluster.AgglomerativeClustering(n_clusters=n_clusters, affinity='euclidean',


(Pdb)  


> /Users/martinweiss/code/academic/conv-graph/models/graph_layers.py(111)__init__()
-> memory='.cache', connectivity=(current_adj > 0.).astype(int),


(Pdb)  processed_path


'.cache/5a6e169dccfd78cbe322ef23a8e27d9e6d22d58a83bd6f68c4bfc9c726613d08eec8273b3e597109b15df7f5527ea0dd59679b37823c54a736614d6652e961264075.npy'


(Pdb)  n_clusters


4075


(Pdb)  n


> /Users/martinweiss/code/academic/conv-graph/models/graph_layers.py(112)__init__()
-> compute_full_tree='auto', linkage='ward').fit_predict(adj)


(Pdb)  n


  affinity='euclidean')


KeyboardInterrupt
> /Users/martinweiss/code/academic/conv-graph/models/graph_layers.py(112)__init__()
-> compute_full_tree='auto', linkage='ward').fit_predict(adj)


(Pdb)  q


> <ipython-input-9-b1ce36ff66c3>(47)<module>()
-> print(e)


(Pdb)  q


BdbQuit: 