# Evaluate Atac Seq Interpretation

In this notebook, we're going to train our model using the Mouse Brain dataset (GSE60361). 

This assumes that you've made the graph using the ```Infer GRN.ipynb``` code.

In [1]:
import os

import numpy as np
import pandas as pd
import torch
import torch_geometric
from torch_geometric.data import Data, Dataset
from tqdm import tqdm
from datasets.datasetAtacSeqChromatin import AtacSeqChromatinDataset
from datasets.datasetAtacSeq import AtacSeqDataset
from scipy.special import softmax
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from tqdm import tqdm
from sklearn.metrics import (auc, precision_recall_curve, roc_auc_score,
                             roc_curve)
from statistics import mean

Torch version: 1.8.0+cu111
Cuda available: True
Torch geometric version: 2.0.3
Torch version: 1.8.0+cu111
Cuda available: True
Torch geometric version: 2.0.3


## Atac Seq

In [2]:
datasetChromatin = AtacSeqChromatinDataset("/gpfs/data/rsingh47/hzaki1/atacseqdataChromatin")
datasetReg = AtacSeqDataset("/gpfs/data/rsingh47/hzaki1/atacseqdata")

100%|██████████| 103480/103480 [00:04<00:00, 20884.11it/s]
100%|██████████| 103480/103480 [00:05<00:00, 20659.65it/s]


In [3]:
import pickle
def save_obj(obj, name):
    with open('/gpfs/data/rsingh47/hzaki1/atacseqdata/obj/' + name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)


def load_obj(name):
    with open('/gpfs/data/rsingh47/hzaki1/atacseqdata/obj/' + name + '.pkl', 'rb') as f:
        return pickle.load(f)

In [4]:
def sortDict(dictionary):
    return {k: v for k, v in sorted(dictionary.items(), key=lambda item: item[1], reverse=True)}

In [5]:
interpretationChromatin = load_obj('interpretationAtacSeq_datasetaugmentation')

# interpretationReg = {
#     'BJ' : load_obj('interpretationBJ'),
#     'GM' : load_obj('interpretationGM'),
#     'H1' : load_obj('interpretationH1'),
#     'K562' : load_obj('interpretationK562')
# }

interpretationReg = load_obj('interpretationWoAtacSeq_datasetaugmentation')

In [6]:
def convertToIndicies(lis,dataset):
    toReturn = []
    for ele in lis:
        toReturn.append(dataset.geneToIndex[ele])
    return toReturn

In [7]:
import numpy as np

In [8]:
matChr = np.zeros((1047,18666,2))
for count,data in enumerate(datasetChromatin):
    matChr[count] = data.x

In [9]:
matReg = np.zeros((1047,18666))
for count,data in enumerate(datasetReg):
    matReg[count] = data.x

In [10]:
labels = np.zeros((1047))
for count,data in enumerate(datasetReg):
    labels[count] = data.y.item()

In [11]:
matChrFil = np.zeros((1047,10,2))

In [12]:
for index, cell in enumerate(matChr):
    cellType = datasetChromatin.indexToCell[labels[index]]
    matChrFil[index] = np.take(cell, convertToIndicies(list(sortDict(interpretationChromatin[cellType]).keys())[0:10], datasetChromatin), axis=0)
    
    

In [13]:
matRegFil = np.zeros((1047,10))

In [14]:
for index, cell in enumerate(matReg):
    cellType = datasetReg.indexToCell[labels[index]]
    matRegFil[index] = np.take(cell, convertToIndicies(list(sortDict(interpretationReg[cellType]).keys())[0:10], datasetReg),axis=0) 

In [15]:
shuffle_index = np.loadtxt('shuffle_indices/atacseqShuffleIndex.txt')
shuffle_index = shuffle_index.astype(np.int32)
train_size, val_size = int(len(shuffle_index)* 0.8), int(len(shuffle_index)* 0.9)
train_indices = shuffle_index[0:train_size]
val_indices = shuffle_index[train_size: val_size]
test_indices =  shuffle_index[val_size:]

In [15]:
# indiciesTrainTest = np.arange(1047)
# np.random.shuffle(indiciesTrainTest)

In [16]:
# train = indiciesTrainTest[0:835]
# test = indiciesTrainTest[835:]

In [16]:
training = np.take(matRegFil, train_indices, axis=0)
testing = np.take(matRegFil, test_indices, axis=0)
trainLabels = np.take(labels, train_indices)
testLabels = np.take(labels, test_indices)

In [17]:
from sklearn.linear_model import LogisticRegression
logisticRegr = LogisticRegression()
logisticRegr.fit(training, trainLabels)

LogisticRegression()

In [18]:
score = logisticRegr.score(testing, testLabels)

In [19]:
score

0.9047619047619048

In [22]:
training = np.take(matChrFil, train_indices, axis=0)
testing = np.take(matChrFil, test_indices, axis=0)
trainLabels = np.take(labels, train_indices)
testLabels = np.take(labels, test_indices)

In [23]:
logisticRegr = LogisticRegression()
logisticRegr.fit(training[:,:,0], trainLabels)

LogisticRegression()

In [24]:
logisticRegr.score(testing[:,:,0], testLabels)

0.8095238095238095

In [25]:
logisticRegr = LogisticRegression()
logisticRegr.fit(training[:,:,1], trainLabels)

LogisticRegression()

In [26]:
logisticRegr.score(testing[:,:,1], testLabels)

0.9238095238095239

In [28]:
training = np.take(matReg, train_indices, axis=0)
testing = np.take(matReg, test_indices, axis=0)
trainLabels = np.take(labels, train_indices)
testLabels = np.take(labels, test_indices)

In [29]:
from sklearn.linear_model import LogisticRegression
logisticRegr = LogisticRegression()
logisticRegr.fit(training, trainLabels)
score = logisticRegr.score(testing, testLabels)

In [30]:
score # full expressiono

0.9904761904761905

## Mouse Brain Interpretation (to test)

In [29]:
from datasets.datasetMouseBrain import MouseBrainDataset
datasetMB = MouseBrainDataset("/gpfs/data/rsingh47/hzaki1/data")

Torch version: 1.8.0+cu111
Cuda available: True
Torch geometric version: 2.0.3


100%|██████████| 243075/243075 [00:11<00:00, 21095.54it/s]


In [30]:
import pickle
def save_obj(obj, name):
    with open('obj/' + name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)


def load_obj(name):
    with open('obj/' + name + '.pkl', 'rb') as f:
        return pickle.load(f)

In [31]:
mbInt = load_obj('InterpretationDictNov22_mouseBrain')

In [32]:
datasetMB.cellToIndex

{'interneurons': 0,
 'pyramidal SS': 1,
 'pyramidal CA1': 2,
 'oligodendrocytes': 3,
 'microglia': 4,
 'endothelial-mural': 5,
 'astrocytes_ependymal': 6}

In [33]:
labelsMB = np.zeros((3005))
for count,data in enumerate(datasetMB):
    labelsMB[count] = data.y.item()

In [34]:
matMB = np.zeros((3005,20))

In [35]:
matMBfull = np.zeros((3005,19972))
for count,data in enumerate(datasetMB):
    matMBfull[count] = data.x

In [36]:
for index, cell in enumerate(matMBfull):
    cellType = datasetMB.indexToCell[labelsMB[index]]
    matMB[index] = np.take(cell, convertToIndicies(list(sortDict(mbInt[cellType]).keys())[0:20], datasetMB) )

In [37]:
indiciesTrainTest = np.arange(3005)
np.random.shuffle(indiciesTrainTest)

In [38]:
train = indiciesTrainTest[0:2405]
test = indiciesTrainTest[2405:]

In [39]:
trainingMB = np.take(matMB, train, axis=0)
testingMB = np.take(matMB, test, axis=0)
trainLabels = np.take(labelsMB, train)
testLabels = np.take(labelsMB, test)

In [40]:
from sklearn.linear_model import LogisticRegression
logisticRegr = LogisticRegression()
logisticRegr.fit(trainingMB, trainLabels)

LogisticRegression()

In [41]:
score = logisticRegr.score(testingMB, testLabels)

In [42]:
score

0.9183333333333333

In [43]:
trainingFull = np.take(matMBfull, train, axis=0)
testingFull = np.take(matMBfull, test, axis=0)
trainLabels = np.take(labelsMB, train)
testLabels = np.take(labelsMB, test)

from sklearn.linear_model import LogisticRegression
logisticRegr = LogisticRegression()
logisticRegr.fit(trainingFull, trainLabels)

logisticRegr.score(testingFull, testLabels)

#without key genes

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9783333333333334

## Baron Human

In [44]:
from datasets.datasetbaronhuman import BaronHumanDataset

datasetBH = BaronHumanDataset("/gpfs/data/rsingh47/hzaki1/data-baron-human")

Torch version: 1.8.0+cu111
Cuda available: True
Torch geometric version: 2.0.3


100%|██████████| 181366/181366 [00:08<00:00, 20972.83it/s]


In [45]:
import pickle
def save_obj(obj, name):
    with open('obj/' + name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)


def load_obj(name):
    with open('obj/' + name + '.pkl', 'rb') as f:
        return pickle.load(f)

In [46]:
torch.manual_seed(12345)
datasetBH = datasetBH.shuffle()

matBHFull = np.zeros((8569,17499))
labelsBH = np.zeros((8569))
for count, data in enumerate(datasetBH):
    matBHFull[count] = data.x
    labelsBH[count] = data.y.item()

In [47]:
indiciesTrainTestBH = np.arange(8569)
np.random.shuffle(indiciesTrainTestBH)

In [48]:
trainBH = indiciesTrainTestBH[0:6855]
testBH = indiciesTrainTestBH[6855:]

In [63]:
# trainingBH = np.take(matBHFull, trainBH, axis=0)
# testingBH = np.take(matBHFull, testBH, axis=0)
trainingBH = matBHFull[0:6855]
testingBH = matBHFull[6855:]
trainLabelsBH = labelsBH[0:6855]
testLabelsBH = labelsBH[6855:]

from sklearn.linear_model import LogisticRegression
logisticRegr = LogisticRegression()
logisticRegr.fit(trainingBH, trainLabelsBH)

logisticRegr.score(testingBH, testLabelsBH)

ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.09918319719953325

In [64]:
matBHFull.shape

(8569, 17499)

In [65]:
bhInt = load_obj('InterpretationDictNov20_baronhuman')

In [66]:
matBH = np.zeros((8569,10))

In [67]:
for index, cell in enumerate(matBHFull):
    cellType = datasetBH.indexToCell[labelsBH[index]]
    matBH[index] = np.take(cell, convertToIndicies(list(sortDict(bhInt[cellType]).keys())[0:10], datasetBH) )

In [68]:
trainingBHFil = matBH[0:6855]
testingBHFil = matBH[6855:]

In [69]:
logisticRegr = LogisticRegression()
logisticRegr.fit(trainingBHFil, trainLabelsBH)

logisticRegr.score(testingBHFil, testLabelsBH)

ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.09918319719953325