# Train model using Mouse Brain dataset

In this notebook, we're going to train our model using the Mouse Brain dataset (GSE60361). 

This assumes that you've made the graph using the ```Infer GRN.ipynb``` code.

In [1]:
import os

import numpy as np
import pandas as pd
import torch
import torch_geometric
from torch_geometric.data import Data, Dataset
from tqdm import tqdm
from datasets.datasetMouseBrain import MouseBrainDataset
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from tqdm import tqdm
from sklearn.metrics import (auc, precision_recall_curve, roc_auc_score,
                             roc_curve)
from statistics import mean
from scipy.special import softmax
from torch_spline_conv import spline_conv

Torch version: 1.8.0+cu111
Cuda available: True
Torch geometric version: 2.0.3


Load up the dataset. Read ```datasetMouseBrain.py``` on how the dataset was built. 

In [2]:
dataset = MouseBrainDataset("/gpfs/data/rsingh47/hzaki1/data")

100%|██████████| 243075/243075 [00:09<00:00, 24873.96it/s]


In [3]:
print()
print(f'Dataset: {dataset}:')
print('====================')
print(f'Number of graphs: {len(dataset)}')
print(f'Number of features: {dataset.num_features}')

data = dataset[0]  # Get the first graph object.

print()
print(data)
print('=============================================================')

# Gather some statistics about the first graph.
print(f'Number of nodes: {data.num_nodes}')
print(f'Number of edges: {data.num_edges}')
print(f'Average node degree: {data.num_edges / data.num_nodes:.2f}')
print(f'Contains isolated nodes: {data.contains_isolated_nodes()}')
print(f'Contains self-loops: {data.contains_self_loops()}')
print(f'Is undirected: {data.is_undirected()}')


Dataset: MouseBrainDataset(3005):
Number of graphs: 3005
Number of features: 1

Data(x=[19972], edge_index=[2, 243075], y=[1])
Number of nodes: 19972
Number of edges: 243075
Average node degree: 12.17
Contains isolated nodes: True
Contains self-loops: False
Is undirected: False




In [4]:
torch.manual_seed(12345)
#dataset = dataset.shuffle()


# shuffle_index = np.loadtxt('shuffle_indices/shuffleIndex_MouseBrain.txt')
# shuffle_index = shuffle_index.astype(np.int32)
# train_size, val_size = int(len(shuffle_index)* 0.8), int(len(shuffle_index)* 0.9)
# train_dataset = [dataset[i] for i in shuffle_index[0:train_size]]
# val_dataset = [dataset[i] for i in shuffle_index[train_size: val_size]]
# test_dataset =  [dataset[i] for i in shuffle_index[val_size:]]

# train_dataset = torch.load('trainDataset.pt')
# test_dataset = torch.load('testDataset.pt')

train_dataset = dataset[:2403]
test_dataset = dataset[2403:]
#test_dataset = dataset[-12:]


# train_dataset = dataset[:40]
# test_dataset = dataset[40:60]

print(f'Number of training graphs: {len(train_dataset)}')
print(f'Number of test graphs: {len(test_dataset)}')

Number of training graphs: 2403
Number of test graphs: 602


In [5]:
from torch_geometric.data import DataLoader

train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)
#val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)



In [6]:
from gcnmodel_sparseAttention import GCN_Sparse
from gcnmodel import GCN
from spgat import SPGAT

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = GCN_Sparse(hidden_channels=128, data=dataset, output_size=7).to(device)
print(model)

GCN_Sparse(
  (attention_0): SpGraphAttentionLayer (1 -> 64)
  (lin): Linear(in_features=64, out_features=7, bias=True)
)


In [7]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = torch.nn.CrossEntropyLoss()

    
def test(loader, size):
    model.eval()
    output = np.zeros((len(loader), size))
    actual = np.zeros((len(loader), size))
    accuracy = 0
    for ind, data in enumerate(loader):  # Iterate in batches over the training/test dataset.
        data.x = torch.reshape(data.x, (data.x.shape[0], 1))
        data.x = data.x.type(torch.FloatTensor)
        data = data.to(device)
        out = model(data.x, data.edge_index, data.batch)
        output[ind] = softmax(out.cpu().detach().numpy())
        actual[ind][data.y] = 1
        accuracy += int((out.argmax(dim=1) == data.y).sum())
    all_labels = list(dataset.cellToIndex.keys())
    actual = np.array(actual)
    precision = dict()
    recall = dict()
    averageAUROC = []
    averageAUPR = []
    for (idx, c_label) in enumerate(all_labels):
        
        fpr, tpr, thresholds = roc_curve(actual[:,idx].astype(int), output[:,idx])
        precision[idx], recall[idx], _ = precision_recall_curve(actual[:, idx],
                                                        output[:, idx])
        averageAUROC.append(auc(fpr, tpr))
        averageAUPR.append(round(auc(recall[idx], precision[idx]),4))

    return accuracy/len(loader.dataset), mean(averageAUROC), mean(averageAUPR)


def train():
    model.train()
    avgLoss = 0
    for data in tqdm(train_loader, total=81):  # Iterate in batches over the training dataset.
        data.x = torch.reshape(data.x, (data.x.shape[0], 1))
        data.x = data.x.type(torch.FloatTensor)
        data = data.to(device)
        out = model(data.x, data.edge_index, data.batch)# Perform a single forward pass.
        loss = criterion(out, data.y)  # Compute the loss.
        loss.backward()  # Derive gradients.
        optimizer.step()  # Update parameters based on gradients.
        optimizer.zero_grad()  # Clear gradients.
        avgLoss += loss
    return avgLoss / 81


for epoch in range(1, 250):
    loss = train()
    train_acc, trainAUC, trainAUPR = test(train_loader, 7)
    test_acc,testAUC, testAUPR = test(test_dataset, 7)
    print(f'Epoch: {epoch:03d}, Train Acc: {train_acc:.4f}, Train AUC: {trainAUC:.4f}, Train AUPR: {trainAUPR:.4f}, Test Acc: {test_acc:.4f}, Test Auc: {testAUC:.4f}, Test AUPR: {testAUPR:.4f},  Loss: {loss:.4f}')

2403it [00:43, 54.65it/s]                      
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]


RuntimeError: index.device().is_cuda() INTERNAL ASSERT FAILED at "csrc/cuda/scatter_cuda.cu":63, please report a bug to PyTorch. index must be CUDA tensor

In [None]:
torch.save(model.state_dict(), 'model_weightsFeb25_sparseattention.pth')

In [None]:
dataset[0].edge_index[0][83][2]

In [None]:
#from datasets.datasetMouseBrain import MouseBrainDataset
#dataset = MouseBrainDataset("/gpfs/data/rsingh47/hzaki1/data")

In [None]:
#data = dataset[0]
#data.edge_index.shape

In [None]:
torch.unique(dataset.adj[0])

In [None]:
dataset.adj.shape

In [None]:
def getAdj():
    """ 
    This will return a matrix / 2d array of the shape
    [Number of Nodes, Node Feature size]
    """
    adjacency = pd.read_csv(os.path.join(dataset.root,'raw/adjacencies.tsv'), sep='\t')

    dataset.filteredDF = adjacency[(adjacency['importance'] > (adjacency['importance'].mean() + adjacency['importance'].std()))]
    geneList = list(set(dataset.filteredDF['TF'].tolist() + dataset.filteredDF['target'].tolist()))
    geneToIndex = {}
    for i, gene in enumerate(geneList):
        geneToIndex[gene] = i

    counts = dataset.filteredDF.count().values[0]
    adjacencyMatrix = np.zeros((2, counts))
    print(adjacencyMatrix.shape)
    for index, row in tqdm(dataset.filteredDF.iterrows(), total=counts):
        adjacencyMatrix[0][index] = geneToIndex[row['TF']]
        adjacencyMatrix[1][index] = geneToIndex[row['target']]
    return torch.from_numpy(adjacencyMatrix).type(torch.LongTensor)

In [None]:
adj = getAdj()

In [None]:
dataset.filteredDF

In [None]:
dataset[0].edge_index[1645][7107]

In [None]:
adj.shape

In [None]:
print((dataset[0].edge_index == 0).nonzero(as_tuple=True))

In [None]:
data = dataset[0]

In [None]:
data.x = torch.reshape(data.x, (data.x.shape[0], 1))

In [None]:
data.x.t()