In [107]:
import sys
import time
import copy
from itertools import repeat
import data, data.gene_datasets
import sklearn, sklearn.model_selection, sklearn.metrics, sklearn.linear_model, sklearn.neural_network, sklearn.tree
import numpy as np
import matplotlib, matplotlib.pyplot as plt
import networkx as nx
import pandas as pd
import gene_inference, gene_inference.utils
from gene_inference.models import lr, mlp, decision_tree
from gene_inference.infer_genes import infer_gene, infer_all_genes, sample_neighbors
from data.graph import Graph
from models.models import CGN
from data.utils import split_dataset
import optimization as otim
from torch.autograd import Variable
import torch
import tensorflow
from analysis.metrics import record_metrics_for_epoch
from analysis.logger import Logger

%load_ext autoreload
%autoreload 2

# Goal: compare the gene-inference methods against CGN under different dropout settings.


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
class Object(object):
    """Empty attribute bag used as a lightweight options namespace."""


# Load the TCGATissue dataset from the local genomics directory.
tcgatissue = data.gene_datasets.TCGATissue(
    data_dir='./genomics/TCGA/',
    data_file='TCGA_tissue_ppi.hdf5',
)

# Experiment options; the size-related fields are left unset (None) here.
opt = Object()
opt.seed = 0
opt.nb_class = opt.nb_examples = opt.nb_nodes = None
opt.graph = "pathway"
opt.dataset = tcgatissue

# Load the graph whose adjacency matrix (graph.adj) is used by the later cells.
path = "./genomics/graph/pancan-tissue-graph.hdf5"
graph = Graph()
graph.load_graph(path)
# graph.intersection_with(tcgatissue) is left disabled here.

# networkx view of the adjacency matrix: integer node i is relabelled with
# the i-th column name of the dataset frame.
mapping = dict(enumerate(tcgatissue.df.columns))
g = nx.relabel_nodes(nx.from_numpy_matrix(graph.adj), mapping)

Converting one-hot labels to integers


In [3]:
# Import the graph-layer module and force a reload so on-disk edits are picked up.
# NOTE(review): `reload` is the Python 2 builtin, and `%autoreload 2` is already
# enabled in the first cell -- confirm this explicit reload is still needed.
import models, models.graphLayer
reload(models.graphLayer)
# Extra options set on `opt`; `opt` is passed to models.graphLayer.get_transform
# in the next cell. Their exact semantics are defined in the project code, not
# in this notebook.
opt.add_self = True
opt.norm_adj = True
opt.num_layer = 2
opt.cuda = False
opt.add_connectivity = True
opt.pool_graph = "ignore"




In [4]:
# Derive the adjacency transform and the aggregation function from the options
# in `opt` and the loaded adjacency matrix; both are handed to the CGN
# constructor (transform_adj / aggregate_adj) when the model is built.
adj_transform, aggregate_function = models.graphLayer.get_transform(opt, graph.adj)


In [113]:
# Model hyper-parameters: the `channels` list passed to CGN has `num_layer`
# entries, each equal to `num_channel`; `nb_class` is the output dimension.
num_channel, num_layer, nb_class = 4, 1, 2

# Notebook-level CGN instance (CPU, gated, dropout enabled) built on the
# loaded graph; it is the `model` that the training function below uses.
model = CGN(
    # problem size
    nb_nodes=len(tcgatissue.df.columns),
    input_dim=1,
    out_dim=nb_class,
    channels=[num_channel] * num_layer,
    # graph structure and how it is transformed / aggregated
    adj=graph.adj,
    transform_adj=adj_transform,
    aggregate_adj=aggregate_function,
    # behaviour switches
    add_emb=None,
    use_gate=True,
    dropout=True,
    on_cuda=False,
)

Doing drop-out


In [114]:
def infer_gene(method, dataset, gene_to_infer, train_size, test_size, trials, penalty=False):
    """Turn one gene into a binary target and score ``method`` on predicting it.

    The column ``gene_to_infer`` of ``dataset.df`` is binarised against its
    own mean (1 when strictly above the mean, 0 otherwise) and stored as a
    list in ``dataset.labels``; the column is then removed from
    ``dataset.df``.  Both attributes of ``dataset`` are overwritten, so the
    caller needs a fresh dataset object for every gene.

    Note: this definition shadows the ``infer_gene`` imported from
    ``gene_inference.infer_genes`` at the top of the notebook.

    Args:
        method: callable invoked as
            ``method(dataset, trials, train_size, test_size, penalty=penalty)``;
            element 0 of its result is reported as the AUC and element 1 as
            the standard deviation.
        dataset: object exposing a pandas ``df`` attribute.
        gene_to_infer: name of the column to predict.
        train_size, test_size, trials, penalty: forwarded to ``method``.

    Returns:
        A one-row ``pandas.DataFrame`` (index ``[0]``) with the columns
        ``gene_name``, ``auc`` and ``std``.
    """
    target_column = dataset.df[gene_to_infer]
    threshold = target_column.mean()

    # Strictly-above-mean -> 1, everything else -> 0, kept as a plain list.
    dataset.labels = (target_column > threshold).astype(int).tolist()
    # The target must not remain among the frame's columns.
    dataset.df = dataset.df.drop(gene_to_infer, axis=1)

    outcome = method(dataset, trials, train_size, test_size, penalty=penalty)

    row = {"gene_name": gene_to_infer,
           "auc": outcome[0],
           "std": outcome[1]}
    return pd.DataFrame(row, [0])

In [117]:
def cgn(dataset, trials, train_size, test_size, penalty=False, num_epochs=100, patience=20):
    """Train the notebook-level CGN ``model`` and report its validation AUC.

    For every trial a new train/valid/test split is drawn with
    ``split_dataset`` (``seed=trial``) and the model is trained with Adam for
    at most ``num_epochs`` epochs.  Training of a trial stops early once the
    validation AUC has failed to improve for ``patience`` consecutive epochs.
    The best validation AUC seen in each trial is recorded.

    Compared with the previous version of this function:
      * the patience counter and the best score are tracked per trial (they
        used to be shared by all trials, so a later trial could stop without
        recording any score);
      * the best validation AUC is actually updated (it used to stay at 0,
        which made every trial stop unconditionally at epoch 4);
      * every trial that ran at least one epoch contributes a score.
    As a consequence a trial now runs for more than five epochs.

    Args:
        dataset: dataset object handed to ``otim.get_criterion``,
            ``split_dataset`` and ``record_metrics_for_epoch``.
        trials: number of splits to train and evaluate on.
        train_size: with ``test_size`` determines the number of samples
            requested from ``split_dataset`` (``train_size + 2 * test_size``).
        test_size: see ``train_size``.
        penalty: unused here; accepted because ``infer_gene`` calls every
            method with ``penalty=...``.
        num_epochs: maximum number of epochs per trial.
        patience: number of consecutive epochs without validation-AUC
            improvement after which a trial is stopped.

    Returns:
        ``(mean, std)`` of the per-trial best validation AUC, each rounded to
        two decimals, or ``(nan, nan)`` when no epoch was run at all.
    """
    writer = Logger("./")
    criterions = otim.get_criterion(dataset)
    # NOTE(review): `model` is the notebook-level instance and, like this
    # optimizer, is shared by all trials, so each trial starts from the
    # weights left by the previous one. Re-create the model per trial if the
    # trials are meant to be independent.
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=0.00001)
    scores = []

    for trial in range(trials):
        train_set, valid_set, test_set = split_dataset(dataset, batch_size=10, random=True, train_ratio=0.33333, seed=trial, nb_samples=train_size + 2 * test_size, nb_per_class=None)

        # Early-stopping state is local to the trial.
        best_valid = None
        epochs_without_gain = 0

        for t in range(0, num_epochs):
            start_timer = time.time()

            # Training mode for the optimisation pass over the mini-batches.
            model.train()
            for mini in train_set:
                inputs = Variable(mini['sample'], requires_grad=False).float()
                targets = mini['labels']

                y_pred = model(inputs)

                # Only the criterion loss is optimised; no extra
                # regularisation terms are added to it.
                crit_loss = otim.compute_loss(criterions, y_pred, targets)
                total_loss = crit_loss

                # Zero gradients, perform a backward pass, and update the weights.
                optimizer.zero_grad()
                total_loss.backward()
                optimizer.step()
            # Evaluation mode while the metrics are computed.
            model.eval()

            time_this_epoch = time.time() - start_timer
            # NOTE(review): cuda=True is passed here although the model is
            # built with on_cuda=False -- confirm what this flag controls.
            acc, auc = record_metrics_for_epoch(writer, crit_loss, total_loss, t, time_this_epoch, train_set, valid_set, test_set, model, dataset, cuda=True)

            summary = [
                t,
                crit_loss.data[0],
                acc['train'],
                acc['valid'],
                auc['train'],
                auc['valid'],
                time_this_epoch
            ]
            summary = "epoch {}, cross_loss: {:.03f}, acc_train: {:0.3f}, acc_valid: {:0.3f}, auc_train: {:0.3f}, auc_valid:{:0.3f}, time: {:.02f} sec".format(*summary)
            # Parenthesised single argument: prints the same text under the
            # Python 2 print statement and the Python 3 print function.
            print(summary)

            if best_valid is None or auc['valid'] > best_valid:
                best_valid = auc['valid']
                epochs_without_gain = 0
            else:
                epochs_without_gain += 1
                if epochs_without_gain >= patience:
                    break

        if best_valid is not None:
            scores.append(best_valid)

    if not scores:
        # Nothing was trained (trials == 0 or num_epochs == 0): avoid the
        # RuntimeWarning that np.mean([]) would emit.
        return np.nan, np.nan
    return np.round(np.mean(scores), 2),  np.round(np.std(scores), 2)


In [None]:
# Reload the project packages, then rebuild the dataset: infer_gene() overwrites
# dataset.labels and removes the target column from dataset.df, so a fresh
# TCGATissue object is needed before every run.
reload(data)
reload(models)
tcgatissue = data.gene_datasets.TCGATissue(data_dir='./genomics/TCGA/', data_file='TCGA_tissue_ppi.hdf5')
# Score the CGN on predicting RPL5 over 3 trials. `infer_gene` here is the
# function defined in this notebook, which shadows the one imported from
# gene_inference.infer_genes.
# NOTE(review): `model` was built with nb_nodes taken from the full column set,
# while infer_gene drops one column from dataset.df -- confirm the inputs fed to
# the model still match its expected size and do not include the target gene.
infer_gene(cgn, tcgatissue, "RPL5", train_size=100, test_size=100, trials=3, penalty=False)

Converting one-hot labels to integers
epoch 0, cross_loss: 0.824, acc_train: 0.545, acc_valid: 0.500, auc_train: 0.962, auc_valid:0.711, time: 0.87 sec
epoch 1, cross_loss: 2.019, acc_train: 0.747, acc_valid: 0.590, auc_train: 0.977, auc_valid:0.723, time: 0.81 sec
epoch 2, cross_loss: 5.505, acc_train: 0.667, acc_valid: 0.580, auc_train: 0.979, auc_valid:0.728, time: 0.79 sec
epoch 3, cross_loss: 1.729, acc_train: 0.828, acc_valid: 0.650, auc_train: 0.983, auc_valid:0.723, time: 0.82 sec
epoch 4, cross_loss: 0.731, acc_train: 0.899, acc_valid: 0.660, auc_train: 0.983, auc_valid:0.721, time: 0.80 sec
