In [128]:
import sys
import time
import copy
from itertools import repeat
import data, data.gene_datasets
import sklearn, sklearn.model_selection, sklearn.metrics, sklearn.linear_model, sklearn.neural_network, sklearn.tree
import numpy as np
import matplotlib, matplotlib.pyplot as plt
import networkx as nx
import pandas as pd
import gene_inference, gene_inference.utils
from gene_inference.models import lr, mlp, decision_tree
from gene_inference.infer_genes import infer_gene, infer_all_genes, sample_neighbors
from data.graph import Graph
from models.models import CGN
from data.utils import split_dataset
import optimization as otim
from torch.autograd import Variable
import torch
import tensorflow
from analysis.metrics import record_metrics_for_epoch
from analysis.logger import Logger

%load_ext autoreload
%autoreload 2

# Goal: To get a comparison of the methods with CGN and different dropouts today


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
tcgatissue = data.gene_datasets.TCGATissue(data_dir='./genomics/TCGA/', data_file='TCGA_tissue_ppi.hdf5')
class Object(object):
    pass

opt = Object()
opt.seed = 0
opt.nb_class = None
opt.nb_examples = None
opt.nb_nodes = None
opt.graph = "pathway"
opt.dataset = tcgatissue

graph = Graph()
#path = "./genomics/graph/pancan-tissue-graph.hdf5"
path = "./genomics/graph/pancan-tissue-graph.hdf5"
graph.load_graph(path)
#graph.intersection_with(tcgatissue)
g = nx.from_numpy_matrix(graph.adj)
mapping = dict(zip(range(0, len(tcgatissue.df.columns)), tcgatissue.df.columns))
g = nx.relabel_nodes(g, mapping)

Converting one-hot labels to integers


In [3]:
import models, models.graphLayer
reload(models.graphLayer)
opt.add_self = True
opt.norm_adj = True
opt.num_layer = 2
opt.cuda = False
opt.add_connectivity = True
opt.pool_graph = "ignore"




In [4]:
adj_transform, aggregate_function = models.graphLayer.get_transform(opt, graph.adj)


In [113]:
num_channel = 4
num_layer = 1
nb_class = 2
model = CGN(
            nb_nodes=len(tcgatissue.df.columns), 
            input_dim=1,
            channels=[num_channel] * num_layer,
            adj=graph.adj,
            out_dim=nb_class,
            on_cuda=False,
            add_emb=None,
            transform_adj=adj_transform,
            aggregate_adj=aggregate_function,
            use_gate=True,
            dropout=True,
            )

Doing drop-out


In [124]:
def cgn(dataset, trials, train_size, test_size, penalty=False, num_epochs=100):
    scores = []
    writer = Logger("./")
    labels = torch.LongTensor(dataset.labels)
    criterions = otim.get_criterion(dataset)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=0.00001)
    patience = 20
    max_valid = 0

    for trial in range(trials):
        train_set, valid_set, test_set = split_dataset(dataset, batch_size=10, random=True, train_ratio=0.33333, seed=trial, nb_samples=train_size + 2 * test_size, nb_per_class=None)
        for t in range(0, num_epochs):
            start_timer = time.time()
            for no_b, mini in enumerate(train_set):
                inputs, labels = mini['sample'], mini['labels']

                inputs = Variable(inputs, requires_grad=False).float()

                model.train()

                y_pred = model(inputs)

                # Compute and print loss
                crit_loss = otim.compute_loss(criterions, y_pred, labels)
                #model_regularization_loss = model.regularization(opt.model_reg_lambda)
                #l1_loss = setup_l1_loss(my_model, opt.l1_loss_lambda, l1_criterion, opt.cuda)
                #total_loss = crit_loss + model_regularization_loss + l1_loss
                total_loss = crit_loss

                # Zero gradients, perform a backward pass, and update the weights.
                optimizer.zero_grad()
                crit_loss.backward()
                optimizer.step()
                model.eval()
            time_this_epoch = time.time() - start_timer
            acc, auc = record_metrics_for_epoch(writer, crit_loss, total_loss, t, time_this_epoch, train_set, valid_set, test_set, model, dataset, cuda=False)

            summary = [
                t,
                crit_loss.data[0],
                acc['train'],
                acc['valid'],
                auc['train'],
                auc['valid'],
                time_this_epoch
            ]
            summary = "epoch {}, cross_loss: {:.03f}, acc_train: {:0.3f}, acc_valid: {:0.3f}, auc_train: {:0.3f}, auc_valid:{:0.3f}, time: {:.02f} sec".format(*summary)
            print summary

            patience = patience - 1
            if patience == 0:
                break
            if max_valid < auc['valid'] and t > 3:
                scores.append(auc['valid']) 
                break

    return np.round(np.mean(scores), 2),  np.round(np.std(scores), 2)


In [134]:
reload(data)
reload(models)
tcgatissue = data.gene_datasets.TCGATissue(data_dir='./genomics/TCGA/', data_file='TCGA_tissue_ppi.hdf5')
cgn_df = infer_gene(cgn, tcgatissue, "RPL5", train_size=100, test_size=100, trials=3, penalty=False)
lr_df = infer_gene(lr, tcgatissue, "RPL5", train_size=100, test_size=100, trials=3, penalty=True)
mlp_df = infer_gene(mlp, tcgatissue, "RPL5", train_size=100, test_size=100, trials=3, penalty=True)


Converting one-hot labels to integers
epoch 0, cross_loss: 0.575, acc_train: 0.556, acc_valid: 0.510, auc_train: 0.999, auc_valid:0.858, time: 0.89 sec
epoch 1, cross_loss: 0.311, acc_train: 0.556, acc_valid: 0.520, auc_train: 1.000, auc_valid:0.850, time: 0.84 sec
epoch 2, cross_loss: 0.637, acc_train: 0.990, acc_valid: 0.740, auc_train: 1.000, auc_valid:0.841, time: 0.86 sec
epoch 3, cross_loss: 0.435, acc_train: 0.838, acc_valid: 0.630, auc_train: 1.000, auc_valid:0.837, time: 0.86 sec
epoch 4, cross_loss: 0.069, acc_train: 0.990, acc_valid: 0.710, auc_train: 1.000, auc_valid:0.830, time: 0.86 sec
epoch 0, cross_loss: 0.126, acc_train: 0.970, acc_valid: 0.760, auc_train: 0.997, auc_valid:0.831, time: 0.86 sec
epoch 1, cross_loss: 0.395, acc_train: 0.929, acc_valid: 0.720, auc_train: 0.998, auc_valid:0.842, time: 0.85 sec
epoch 2, cross_loss: 0.239, acc_train: 0.980, acc_valid: 0.750, auc_train: 1.000, auc_valid:0.857, time: 0.86 sec
epoch 3, cross_loss: 0.186, acc_train: 1.000, acc_



In [None]:
# Predict a gene from a growing number of Nodes
lr_results = pd.DataFrame([])
mlp_results = pd.DataFrame([])
cgn_results = pd.DataFrame([])
gene = "RPL5"
num_samples = 100
max_genes = 100
reload(data)
reload(models)
tcgatissue = data.gene_datasets.TCGATissue(data_dir='./genomics/TCGA/', data_file='TCGA_tissue_ppi.hdf5')
cgn_df = infer_gene(cgn, tcgatissue, "RPL5", train_size=100, test_size=100, trials=3, penalty=False)
lr_df = infer_gene(lr, tcgatissue, "RPL5", train_size=100, test_size=100, trials=3, penalty=True)
mlp_df = infer_gene(mlp, tcgatissue, "RPL5", train_size=100, test_size=100, trials=3, penalty=True)

for num_genes in range(10, max_genes, 10):
    neighbors = sample_neighbors(g, "RPL5", num_genes)
    tcgatissue.df = tcgatissue.df.loc[:, neighbors]
    lr_row = infer_gene(lr, tcgatissue, "RPL5", train_size=100, test_size=100, trials=3, penalty=True)
    lr_results = lr_results.append(lr_row).reset_index(drop=True)
    lr_results.loc[lr_results.index[-1], 'num_genes'] = num_genes
    cgn_row = infer_gene(cgn, tcgatissue, "RPL5", train_size=100, test_size=100, trials=3, penalty=True)
    cgn_results = cgn_results.append(cgn_row).reset_index(drop=True)
    cgn_results.loc[lr_results.index[-1], 'num_genes'] = num_genes
    tcgatissue = data.gene_datasets.TCGATissue(data_dir='./genomics/TCGA/', data_file='TCGA_tissue_ppi.hdf5')
    print num_genes
    print cgn_results
    print lr_results

Converting one-hot labels to integers
epoch 0, cross_loss: 0.949, acc_train: 0.778, acc_valid: 0.630, auc_train: 1.000, auc_valid:0.850, time: 0.86 sec
epoch 1, cross_loss: 0.057, acc_train: 1.000, acc_valid: 0.740, auc_train: 1.000, auc_valid:0.843, time: 0.77 sec
epoch 2, cross_loss: 0.019, acc_train: 1.000, acc_valid: 0.740, auc_train: 1.000, auc_valid:0.846, time: 0.80 sec
epoch 3, cross_loss: 0.057, acc_train: 1.000, acc_valid: 0.770, auc_train: 1.000, auc_valid:0.846, time: 0.80 sec
epoch 4, cross_loss: 0.074, acc_train: 1.000, acc_valid: 0.740, auc_train: 1.000, auc_valid:0.844, time: 0.79 sec
epoch 0, cross_loss: 0.250, acc_train: 1.000, acc_valid: 0.780, auc_train: 1.000, auc_valid:0.854, time: 0.80 sec
epoch 1, cross_loss: 0.104, acc_train: 1.000, acc_valid: 0.770, auc_train: 1.000, auc_valid:0.861, time: 0.82 sec
epoch 2, cross_loss: 0.276, acc_train: 0.990, acc_valid: 0.770, auc_train: 1.000, auc_valid:0.864, time: 0.80 sec
epoch 3, cross_loss: 0.083, acc_train: 1.000, acc_

In [137]:
# results of adding Nodes
plt.figure()

#full_results.loc[full_results['samples'] == 100]

line1 = plt.errorbar(results.index, results['auc'], xerr=0, yerr=results['std'])
line2 = plt.errorbar(results.index, list(repeat(0.69, len(full_results))), xerr=0, yerr=list(repeat(0.06, len(full_results))))

width = 0.2
plt.xticks(list(results.iloc[::5, :].index), results.iloc[::5, :]['num_genes'], rotation=70)
plt.title("Gene Inference with varying numbers of nodes")
plt.ylabel("AUC")
plt.xlabel("number of nodes")
plt.legend((line1[0], line2[0]), ('Varying # of Nodes', "Baseline (Full Dataset)"), bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., fontsize=10)

Unnamed: 0,auc,gene_name,std
0,0.86,RPL5,0.02
