In [9]:
import os 
import sys
import pickle
import networkx as nx
import pandas as pd
import numpy as np

import itertools
import sklearn
import torch
import datetime
import matplotlib, matplotlib.pyplot as plt
from collections import defaultdict

from torch.autograd import Variable
from models.model_wrapper import MLP
from data import datasets
from data.gene_graphs import GeneManiaGraph, RegNetGraph
from data.utils import record_result
from data.clinical.datasets import TCGADataset, Task
from data.clinical import taskloader
from data.clinical import split_dataset

from models.model_wrapper import MLP, GCN, SLR

In [2]:
# Instantiate the TCGA dataset and wrap the clinical task studied in this notebook.
tcga = TCGADataset()
task_id = "_EVENT-BRCA"
# NOTE(review): `limit=1000` is forwarded to Task unchanged; presumably it caps
# the number of samples -- confirm against data.clinical.datasets.Task.
brca_task = Task(tcga, task_id, limit=1000)
# `tasks` stays a list because the experiment grid iterates over it.
tasks = [brca_task]

Torrent name: TCGA_tissue_ppi.hdf5, Size: 1748.32MB
Checking for pieces on disk: |██████████████████████████████████████████████████| 100.0% 
Found 1668 finished pieces out of 1668 total pieces.
Found dataset at /Users/martinweiss/.academictorrents-datastore/TCGA_tissue_ppi.hdf5


In [23]:
# Gene graphs to evaluate, keyed by the graph name stored with each experiment.
graphs = dict(genemania=GeneManiaGraph())


Torrent name: genemania.pkl, Size: 9.61MB


In [24]:
# Setup the results table: resume from the checkpoint when it can be read,
# otherwise start from an empty table with the expected columns.
filename = "experiments/results/graph-gen-clinical.pkl"
try:
    # NOTE: unpickling can execute arbitrary code -- only load checkpoint files
    # produced by this project.
    # The context manager closes the file handle even when unpickling fails.
    with open(filename, "rb") as checkpoint_file:
        results = pickle.load(checkpoint_file, encoding='latin1')
    print("Loaded Checkpointed Results")
except Exception as e:
    # Deliberately best-effort: any load failure (missing or unreadable file)
    # is reported and replaced by a fresh, empty results table.
    print(e)
    results = pd.DataFrame(columns=['task', 'auc', 'gene', 'model', 'graph', 'seed', 'train_size', 'optimize_graph_results'])
    print("Created a New Results Dictionary")


Loaded Checkpointed Results


In [25]:
# Experiment configuration.
train_size = 50   # samples per training split
test_size = 500   # samples per held-out split
trials = 3        # one run per seed in range(trials)
cuda = False      # passed to every model; also gates moving test tensors to the GPU

# Candidate models. Each `name` is the identifier stored in the results table,
# so the strings must stay stable across runs.
models = [
    GCN(
        name="GCN_lay20_chan32_emb32_dropout_pool",
        cuda=cuda,
        num_layer=4,
        channels=32,
        embedding=32,
        prepool_extralayers=5,
        pooling="hierarchy",
    ),
    GCN(
        name="GCN_lay3_chan64_emb32_dropout",
        cuda=cuda,
        num_layer=3,
        channels=64,
        embedding=32,
    ),
    MLP(
        name="MLP_lay2_chan512_dropout",
        cuda=cuda,
        dropout=True,
        num_layer=2,
        channels=512,
    ),
    MLP(
        name="MLP_lay2_chan512",
        cuda=cuda,
        dropout=False,
        num_layer=2,
        channels=512,
    ),
    SLR(name="SLR_lambda1_l11", cuda=cuda),
]

In [26]:
# Create the set of all experiment ids and see which are left to do
columns = ["task", "graph", "model", "seed", "train_size"]
exp_grid = list(itertools.product(tasks, graphs.keys(), models, range(trials), [train_size]))
all_exp_ids = pd.DataFrame(exp_grid, columns=columns)
# Key the grid by task.id / model.name -- the exact strings the experiment loop
# stores in `results` -- instead of str() of the Task and model objects.
# Otherwise the ids only match the checkpoint if those objects' __str__ happens
# to return the same strings, and finished experiments are silently re-queued.
all_exp_ids.index = [
    "-".join(map(str, (task.id, graph_name, model.name, seed, size)))
    for task, graph_name, model, seed, size in exp_grid
]
# The checkpointed table already holds plain values, so join them directly.
results_exp_ids = results[columns].copy()
results_exp_ids.index = [
    "-".join(map(str, record))
    for record in results_exp_ids.itertuples(index=False, name=None)
]
# Anything whose id already appears in the checkpoint is considered done.
intersection_ids = all_exp_ids.index.intersection(results_exp_ids.index)
todo = all_exp_ids.drop(intersection_ids).to_dict(orient="records")

print("todo: " + str(len(todo)))
print("done: " + str(len(results)))

todo: 15
done: 2


In [27]:
def optimize_graph(model, gene_graph, gene, X_train, X_test, y_train, y_test):
    """Score each first-degree neighbour of `gene` by refitting without it.

    For every neighbour of `gene` that is also a column of `X_train`, the model
    is refit on the remaining neighbour columns plus a constant column named
    `gene` (set to 1), and the test AUC of that fit is recorded.

    Args:
        model: object exposing `fit(X, y)`, `predict(x)` and a `best_model`
            attribute (reset to None after every attempt).
        gene_graph: object whose `first_degree(gene)[0]` iterates over the
            neighbours of `gene`.
        gene: column label of the target gene.
        X_train, X_test: DataFrames with gene labels as columns.
        y_train, y_test: labels for the two splits.

    Returns:
        dict mapping each neighbour to the AUC obtained when it is left out.
        Neighbours whose fit or scoring raised are reported via print and
        omitted from the dict.

    Note:
        Reads the notebook-level `cuda` flag to decide whether test tensors
        are moved to the GPU.
    """
    available = X_train.columns.values
    neighbors = [n for n in gene_graph.first_degree(gene)[0] if n in available]
    res = {}
    for neighbor in neighbors:
        if gene == neighbor:
            candidate_nodes = list(neighbors)
        else:
            # Remove exactly this neighbour. The previous `set(neighbor)` split
            # a string label into characters, and a set indexer is unordered
            # and rejected by `.loc` in recent pandas.
            candidate_nodes = [n for n in neighbors if n != neighbor]
        x_train = X_train.loc[:, candidate_nodes].copy()
        x_test = X_test.loc[:, candidate_nodes].copy()
        # The target gene itself is replaced by a constant column.
        x_train[gene] = 1
        x_test[gene] = 1

        try:
            model.fit(x_train, y_train)
            # Add a trailing singleton axis before handing the data to predict.
            x_test = Variable(torch.FloatTensor(np.expand_dims(x_test.values, axis=2)), requires_grad=False).float()
            if cuda:
                x_test = x_test.cuda()
            # Column 1 of the prediction is used as the positive-class score.
            y_hat = model.predict(x_test)[:, 1].data.cpu().numpy()
            auc = sklearn.metrics.roc_auc_score(y_test, np.asarray(y_hat).flatten())
            res[neighbor] = auc
        except Exception as e:
            # Best-effort: report the failure and move on to the next neighbour.
            print(e)
        finally:
            model.best_model = None # cleanup, also after a failed attempt
    return res


In [None]:
optimize_graph_results = []

# Run every outstanding experiment and checkpoint after each one.
for row in todo:
    # Lightweight progress indicator: echo the table size every 10 results.
    if len(results) % 10 == 0:
        print(len(results))
    task = row["task"]
    graph_name = row["graph"]
    seed = row["seed"]
    model = row["model"]
    # Use the size this experiment id was built with rather than whatever the
    # notebook-level `train_size` currently holds.
    n_train = int(row["train_size"])

    experiment = {
        "task": task.id,
        "model": model.name,
        "graph": graph_name,
        "seed": seed,
        "train_size": n_train,
        "optimize_graph_results": None,
    }

    try:
        # random_state=seed makes each trial's split reproducible; without it
        # the recorded `seed` had no effect on the data the model saw.
        X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
            task.data, task.labels, stratify=task.labels,
            train_size=n_train, test_size=test_size, random_state=int(seed))
    except ValueError as e:
        # The split could not be built; report it, record the experiment
        # without an AUC, and move on.
        print(e)
        results = record_result(results, experiment, filename)
        continue

    X_train = X_train.copy()
    X_test = X_test.copy()

    try:
        gene_graph = graphs[graph_name]
        # to_numpy_array returns the dense ndarray directly; to_numpy_matrix
        # was removed in NetworkX 3.0.
        adj = nx.to_numpy_array(gene_graph.nx_graph)
        model.fit(X_train, y_train.astype("uint8"), adj=adj)

        # Add a trailing singleton axis before handing the data to predict.
        x_test = Variable(torch.FloatTensor(np.expand_dims(X_test.values, axis=2)), requires_grad=False).float()
        if cuda:
            x_test = x_test.cuda()
        # Column 1 of the prediction is used as the positive-class score.
        y_hat = model.predict(x_test)[:, 1].data.cpu().numpy()
        auc = sklearn.metrics.roc_auc_score(y_test.astype("uint8"), np.asarray(y_hat).flatten())
        model.best_model = None # cleanup
        experiment["auc"] = auc
    except Exception as e:
        # Best-effort: report the failure; the experiment is still recorded
        # below, just without an "auc" entry.
        print(e)

    # NOTE(review): record_result presumably appends the row and re-saves the
    # checkpoint at `filename` -- confirm in data.utils.
    results = record_result(results, experiment, filename)


  affinity='euclidean')
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **kwargs)
  affinity='euclidean')
