In [1]:
import pandas as pd
import requests
from tqdm import tqdm
import random
import matplotlib.pyplot as plt

# Useful Links
Implementing a data object for neural networks:
https://pytorch-geometric.readthedocs.io/en/latest/notes/introduction.html

Writing a custom dataset class: https://pytorch.org/tutorials/beginner/data_loading_tutorial.html

Writing a GAT Class example: https://github.com/pyg-team/pytorch_geometric/blob/master/examples/gat.py

Dataloaders for Batch Training: https://pytorch.org/tutorials/beginner/basics/data_tutorial.html

Splitting datasets for training and testing: https://pytorch.org/docs/stable/data.html#torch.utils.data.random_split

Batching with GNNs: https://github.com/pyg-team/pytorch_geometric/issues/973, https://github.com/gordicaleksa/pytorch-GAT/blob/main/The%20Annotated%20GAT%20(PPI).ipynb

In [2]:
# Load the processed scRNA-seq table and key its rows by the 'Gene.names' column
# (the column itself is removed from the frame once it becomes the index).
scRNA_data = (
    pd.read_csv('GSE200981_scRNAseq_processed.tsv', sep='\t')
    .set_index('Gene.names')
)
# Number of rows loaded.
scRNA_data.shape[0]

26364

In [3]:
# Map every gene name in the expression table to a STRING identifier (and back)
# through the STRING `get_string_ids` endpoint.
string_api_url = "https://string-db.org/api"
output_format = "tsv-no-header"
method = "get_string_ids"

params = {
    "identifiers" : "\r".join(list(scRNA_data.index)), # gene names, carriage-return separated
    "limit": 1,  # at most one STRING match per queried name
    "echo_query": 1,  # ask STRING to echo the queried name in each result row
    "species" : 9606, # species NCBI identifier
    "caller_identity" : "www.awesome_app.org" # your app name
}

request_url = "/".join([string_api_url, output_format, method])

results = requests.post(request_url, data=params)
# Fail loudly on an HTTP error instead of trying to parse an error page as TSV.
results.raise_for_status()

protein_2_string = dict()
string_2_protein = dict()

for line in results.text.strip().split("\n"):
    fields = line.split("\t")
    # Column 0 is read as the queried name and column 2 as the STRING identifier;
    # skip blank or truncated rows rather than crashing on a missing column.
    if len(fields) < 3:
        continue
    protein_identifier, string_identifier = fields[0], fields[2]
    protein_2_string[protein_identifier] = string_identifier
    string_2_protein[string_identifier] = protein_identifier

In [4]:
# Keep only the genes that received a STRING identifier; rows are selected in the
# order of protein_2_string's keys.
mapped_genes = list(protein_2_string)
scRNA_data = scRNA_data.loc[mapped_genes]
scRNA_data

Unnamed: 0_level_0,V1_T0,V2_T0,V3_T0,V4_T0,V5_T0,V6_T0,V7_T0,V8_T0,V9_T0,V10_T0,...,V247_T7,V248_T7,V249_T7,V250_T7,V251_T7,V252_T7,V253_T7,V254_T7,V255_T7,V256_T7
Gene.names,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
OR4F5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
OR4F3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
OR4F29,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
OR4F16,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
SAMD11,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
DAZ1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
DAZ3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
DAZ2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CDY1B,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.data import random_split
import torch.nn as nn
from torch_geometric.data.batch import Batch
from torch_geometric.data import Data

In [6]:
# All tensors and models in this notebook are placed on the CPU.
# To use a GPU when one is available, swap in:
#   torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device("cpu")

In [7]:
%load_ext autoreload
from dataset import EMT_Dataset
dataset = EMT_Dataset(scRNA_data, string_2_protein)

Getting network tensor...


100%|██████████████████████████| 13715404/13715404 [00:05<00:00, 2330661.42it/s]


Getting node features tensor...


In [8]:
# 70/30 train/test split; the generator is seeded so the split is the same on every run.
generator = torch.Generator().manual_seed(42)
train_length = int(0.7*len(dataset))
test_length = len(dataset) - train_length
training_dataset, testing_dataset = random_split(
    dataset,
    lengths=[train_length, test_length],
    generator=generator,
)

In [9]:
def graph_collate_fn(batch):
    """Merge a list of graph samples into one large disconnected graph.

    Args:
        batch: sequence of ``(node_features, graphs, outputs)`` triples.
            ``node_features`` is 2-D with one row per node; ``graphs`` holds
            node indices and is concatenated along dim 1 (presumably a
            ``(2, num_edges)`` edge index -- TODO confirm against the dataset).

    Returns:
        Tuple ``(node_features, graphs, outputs)`` where node features of all
        samples are concatenated along dim 0, edge indices are concatenated
        along dim 1 after shifting each sample's indices past the nodes of the
        samples before it, and outputs are stacked along a new dim 0.
    """
    node_features_list = []
    graph_list = []
    outputs_list = []
    # Number of nodes already placed in the merged graph. The node count is
    # dim 0 of node_features (dim 1 is the per-node feature size), so the
    # offset must advance by shape[0] for the samples to stay disjoint.
    node_offset = 0

    for node_features, graphs, outputs in batch:
        node_features_list.append(node_features)
        graph_list.append(graphs + node_offset)
        outputs_list.append(outputs)
        node_offset += node_features.shape[0]

    node_features = torch.cat(node_features_list, dim=0)
    graphs = torch.cat(graph_list, dim=1)
    outputs = torch.stack(outputs_list, dim=0)

    return node_features, graphs, outputs

In [10]:
# Training batches are reshuffled on every pass. The evaluation loader keeps a
# fixed order so batch composition, and therefore the reported mean of per-batch
# losses, is the same on every run.
training_dataloader = DataLoader(training_dataset, batch_size=2, shuffle=True, collate_fn=graph_collate_fn)
testing_dataloader = DataLoader(testing_dataset, batch_size=2, shuffle=False, collate_fn=graph_collate_fn)

In [11]:
# Sanity check: pull a single collated batch and show the shape of its node-feature tensor.
batch = next(iter(training_dataloader), None)
if batch is not None:
    print(batch[0].size())

torch.Size([37680, 1])


In [28]:
def get_tensor_batch(batch_num, num_nodes):
    """Build the node-to-graph assignment vector for a merged batch.

    Args:
        batch_num: number of graphs in the batch.
        num_nodes: number of nodes in each graph.

    Returns:
        1-D integer tensor of length ``batch_num * num_nodes`` on ``device``:
        ``num_nodes`` zeros, then ``num_nodes`` ones, and so on.
    """
    # Vectorised construction; avoids materialising a Python list with one
    # element per node on every call.
    tensor_batch = torch.arange(batch_num).repeat_interleave(num_nodes).to(device)
    return tensor_batch

def GAT_train(model, dataloader, epochs, num_nodes, lr = 1e-8, weight_decay = 5e-4):
    """Train ``model`` on merged graph batches and plot the per-batch loss.

    Args:
        model: module called as ``model(node_features, graphs, tensor_batch)``.
            Its output is scored with ``BCEWithLogitsLoss``, i.e. treated as
            raw logits.
        dataloader: yields ``(node_features, graphs, outputs)`` batches.
        epochs: number of passes over ``dataloader``.
        num_nodes: nodes per graph; used to work out how many graphs each
            batch contains.
        lr: Adam learning rate. NOTE(review): the 1e-8 default is far below
            Adam's usual 1e-3 -- confirm it is intentional.
        weight_decay: Adam weight decay.

    Returns:
        None. Draws a loss-vs-batch curve covering every batch of every epoch.
    """
    torch.cuda.empty_cache()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    criterion = nn.BCEWithLogitsLoss(reduction='mean')
    # Losses are collected across all epochs so the plot shows the whole run
    # (and is still defined when epochs == 0).
    all_losses = []
    prev_batch_num = None
    tensor_batch = None

    for _ in tqdm(range(epochs)):
        model.train()

        for node_features, graphs, outputs in dataloader:
            batch_num = node_features.shape[0] // num_nodes

            # Rebuild the node-to-graph vector only when the number of graphs
            # in the batch changes (e.g. a smaller final batch).
            if prev_batch_num != batch_num:
                prev_batch_num = batch_num
                tensor_batch = get_tensor_batch(batch_num, num_nodes)

            optimizer.zero_grad()

            graphs = graphs.to(device)
            node_features = node_features.to(device)
            outputs = outputs.to(device)
            out = model(node_features, graphs, tensor_batch)
            loss = criterion(out, outputs)
            loss.backward()
            optimizer.step()
            all_losses.append(loss.item())

    plt.plot(list(range(1, len(all_losses) + 1)), all_losses)
    plt.xlabel('Number of Batches')
    plt.ylabel('Loss')
    plt.title('GATConv E/M classification Training')

In [29]:
%reload_ext autoreload
from GAT import GAT
model = GAT(dataset.num_features(), 2, dataset.num_classes(), 1).to(device)

In [None]:
# Train the GAT for 1000 epochs; each graph has one node per row of scRNA_data.
GAT_train(
    model,
    training_dataloader,
    epochs=1000,
    num_nodes=scRNA_data.shape[0],
)

 42%|████████████████▋                       | 416/1000 [28:18<36:39,  3.77s/it]

In [None]:
def GAT_test(model, dataloader):
    """Evaluate ``model`` on merged graph batches and return the mean batch loss.

    Each batch is fed to the model the same way ``GAT_train`` does it -- the
    whole merged graph plus a node-to-graph vector -- and scored with the same
    ``BCEWithLogitsLoss`` criterion, so training and test losses are comparable.

    Args:
        model: module called as ``model(node_features, graphs, tensor_batch)``.
        dataloader: yields ``(node_features, graphs, outputs)`` batches where
            ``outputs`` has one row per graph and ``node_features`` one row
            per node across all graphs of the batch.

    Returns:
        Mean of the per-batch losses as a float.
    """
    test_losses = []
    criterion = nn.BCEWithLogitsLoss(reduction='mean')

    # Evaluate in eval mode without building autograd graphs, then restore
    # whatever mode the model was in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for node_features, graphs, outputs in tqdm(dataloader):
                # outputs is stacked per graph, so dim 0 is the graph count;
                # the per-graph node count follows from the merged node tensor.
                batch_num = outputs.shape[0]
                num_nodes = node_features.shape[0] // batch_num
                tensor_batch = get_tensor_batch(batch_num, num_nodes)

                graphs = graphs.to(device)
                node_features = node_features.to(device)
                outputs = outputs.to(device)

                out = model(node_features, graphs, tensor_batch)
                loss = criterion(out, outputs)
                test_losses.append(loss.item())
    finally:
        model.train(was_training)

    return sum(test_losses)/len(test_losses)

In [None]:
# Mean test loss of the trained GAT on the held-out split.
GAT_test(model=model, dataloader=testing_dataloader)

In [None]:
class MLP(torch.nn.Module):
    """Two-layer perceptron: Linear -> ReLU -> Linear -> softmax.

    Args:
        in_channels: size of each input row.
        hidden_channels: width of the hidden layer.
        out_channels: number of output values per row.
    """

    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()

        self.linear1 = nn.Linear(in_channels, hidden_channels)
        self.linear2 = nn.Linear(hidden_channels, out_channels)

    def forward(self, x):
        """Return softmax-normalised scores; the last dimension sums to 1."""
        x = self.linear1(x)
        x = F.relu(x)
        x = self.linear2(x)
        # Name the softmax dimension explicitly: relying on the implicit
        # dimension is deprecated and emits a warning on every call.
        x = F.softmax(x, dim=-1)
        return x

In [None]:
# Baseline MLP with a 512-unit hidden layer, sized from the dataset's feature/class counts.
mlp = MLP(
    in_channels=dataset.num_features(),
    hidden_channels=512,
    out_channels=dataset.num_classes(),
).to(device)

In [None]:
def MLP_train(model, dataloader, epochs, lr = 1e-8, weight_decay = 5e-4):
    """Train ``model`` with Adam and MSE loss, then plot the per-batch loss.

    Args:
        model: module called as ``model(node_features)``.
        dataloader: yields ``(node_features, graphs, outputs)`` batches; the
            graph tensor is ignored.
        epochs: number of passes over ``dataloader``.
        lr: Adam learning rate. NOTE(review): the 1e-8 default is far below
            Adam's usual 1e-3 -- confirm it is intentional.
        weight_decay: Adam weight decay.

    Returns:
        None. Draws a loss-vs-batch curve covering every batch of every epoch.
    """
    torch.cuda.empty_cache()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    criterion = nn.MSELoss().to(device)
    losses = []

    for _ in tqdm(range(epochs)):
        model.train()

        for node_features, _graphs, outputs in dataloader:
            # NOTE(review): for an already 2-D tensor this reshape keeps the
            # shape unchanged; if one row per sample was intended, the target
            # shape needs revisiting -- confirm against the collate function.
            node_features = torch.reshape(node_features, (node_features.shape[0], node_features.shape[1])).to(device)
            outputs = outputs.to(device)

            # Clear gradients from the previous step; without this they
            # accumulate across batches and every update uses stale sums.
            optimizer.zero_grad()
            out = model(node_features)
            loss = criterion(out, outputs)
            loss.backward()
            optimizer.step()
            losses.append(loss.item())

    plt.plot(list(range(1, len(losses) + 1)), losses)
    plt.xlabel('Number of Batches')
    plt.ylabel('Loss')
    plt.title('MLP E/M classification Training')

In [None]:
# Train the MLP baseline for 1000 epochs on the training split.
MLP_train(mlp, training_dataloader, epochs=1000)

In [None]:
def MLP_test(model, dataloader):
    """Evaluate ``model`` with MSE loss and return the mean per-batch loss.

    Args:
        model: module called as ``model(node_features)``.
        dataloader: yields ``(node_features, graphs, outputs)`` batches; the
            graph tensor is ignored.

    Returns:
        Mean of the per-batch MSE losses as a float.
    """
    test_losses = []
    criterion = nn.MSELoss().to(device)

    # Evaluate in eval mode without tracking gradients, then restore the
    # model's previous train/eval mode.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for node_features, _graphs, outputs in tqdm(dataloader):
                node_features = torch.reshape(node_features, (node_features.shape[0], node_features.shape[1])).to(device)
                outputs = outputs.to(device)

                out = model(node_features)
                loss = criterion(out, outputs)
                test_losses.append(loss.item())
    finally:
        model.train(was_training)

    return sum(test_losses)/len(test_losses)

In [None]:
# Mean test loss of the trained MLP baseline on the held-out split.
MLP_test(model=mlp, dataloader=testing_dataloader)