### train

In [1]:
import torch
from torch.autograd import Function
import torch.nn as nn
import glob
import ntpath
import datatable as dt
import pandas as pd
import numpy as np
from functools import reduce
from torch.utils.data import Dataset
from tqdm import tqdm
from torch.utils.data import DataLoader
import torch.nn.functional as F
import argparse as ap
import pickle
from collections import Counter
import os
from tensorboardX import SummaryWriter
from sklearn import metrics

In [2]:
def read_expr(path):
    expr = dt.fread(path, header=True, sep='\t', nthreads=6)
    expr = expr.to_pandas()
    expr.index = expr.loc[:, 'Gene']
    del expr['Gene']
    expr = expr.astype(float)
    return expr

def label2dic(label):
    label_set = list(set(label))
    dic = {}
    for i in range(len(label_set)):
        dic[label_set[i]] = i
    return dic

In [3]:
class Datasets(Dataset):
    def __init__(self, data, celltypes):
        class_labels = [ct_dic[i] for i in celltypes]
        self.class_labels = torch.as_tensor(class_labels)
        self.expr = data.values

    def __getitem__(self, index):
        return torch.as_tensor(self.expr[:, index]), self.class_labels[index]

    def __len__(self):
        return len(self.class_labels)

In [4]:
class net(nn.Module):
    def __init__(self, nfeatures, nct):
        super(net, self).__init__()
        self.nct = nct
        self.class_classifier = nn.Sequential(
            nn.Linear(in_features=nfeatures, out_features=250), 
            nn.ReLU(),
            nn.Dropout(), 
            nn.Linear(in_features=250, out_features=nct))
    def forward(self, input_data):
        class_logits = self.class_classifier(input_data)
        class_predictions = F.softmax(class_logits, dim=1)
        return class_logits

In [5]:
def train(train_data, celltypes, nfeatures, nct, device, lr, n_epoch):
    network = net(nfeatures, nct).train()
    train_data = Datasets(train_data, celltypes)
    lr = lr
    n_epoch = n_epoch
    batch_size = 256
    optimizer = torch.optim.Adam(network.parameters(), lr=lr)
    loss_class = nn.CrossEntropyLoss()
    network = network.to(device)
    loss_class = loss_class.to(device)
    train_loader = DataLoader(dataset=train_data,
                              batch_size=batch_size,
                              shuffle=True,
                              drop_last=True)

    len_train_loader = len(train_loader) 
    print('Begin training')
    for epoch in tqdm(range(n_epoch)):
        loader_iter = iter(train_loader)
        output_temp = []
        label_temp = []
        for i in range(len_train_loader):
            expr, class_label = loader_iter.next()
            expr = expr.to(device)
            expr = expr.float()
            class_label = class_label.to(device)
            class_output = network(input_data=expr)
            err_class = loss_class(class_output, class_label)
            output_temp.append(class_output.argmax(dim=1).cpu().numpy().tolist())
            label_temp.append(class_label.cpu())
        output_temp = [i for j in output_temp for i in j ]
        label_temp = [i for j in label_temp for i in j]
        acc = metrics.accuracy_score(label_temp, output_temp)
        writer.add_scalar('Loss/' + 'nfeature_' + str(nfeature) + '_ncell_' + str(ncell) + '_lr_' + str(lr) + '_epoch_' + str(n_epoch), err_class, epoch)
        writer.add_scalar('Acc/' + 'nfeature_' + str(nfeature) + '_ncell_' + str(ncell) +'_lr_' + str(lr) + '_epoch_' + str(n_epoch), acc, epoch)
        optimizer.zero_grad()
        err_class.backward()
        optimizer.step()
    print(acc)
    writer.close()      
    print('Finish Training')
    return network

In [6]:
class datasets(Dataset):
    def __init__(self, data):
        self.expr = data.values

    def __getitem__(self, index):
        return torch.as_tensor(self.expr[:, index])

    def __len__(self):
        return self.expr.shape[1]

In [7]:
def test(test_df, network, ct_dic):
    test_dat = datasets(test_df)
    pred_prob = []
    ct_dic_rev = {v: k for k, v in ct_dic.items()}
    test_loader = DataLoader(dataset=test_dat,
                             batch_size=test_df.shape[1],
                             shuffle=False)
    with torch.no_grad():
        pred_labels = []
        disease_labels = []
        for batch in test_loader:
            expr = batch
            expr = expr.float()
            expr = expr.to(device)
            class_output = network(expr)
            pred_labels.append(
                class_output.argmax(dim=1).cpu().numpy().tolist())
            pred_prob.append(F.softmax(class_output,dim=1).cpu().numpy())
        pred_labels = [ct_dic_rev[i] for item in pred_labels for i in item]
        pred_prob = pd.DataFrame(reduce(pd.concat, pred_prob))
        pred_prob.index = test_df.columns
        pred_prob.columns = ct_dic.keys()
        return pred_labels, pred_prob

In [8]:
nfeatures = [1000]
ncells = [100,1000]
device = torch.device('cuda:1')
lr = 0.0005

for nfeature in nfeatures:
    for ncell in ncells:
        if ncell==1000:
            n_epoch = 4000
        else:
            n_epoch = 2000 
        train_data = read_expr('/home/renpf/HPV/res/doublet/nn/' + str(nfeature) + '_' + str(ncell) + '/train.txt')
        cts = pd.read_csv('/home/renpf/HPV/res/doublet/nn/' + str(nfeature) + '_' + str(ncell) + '/ct.txt', header=None)
        cts = cts.iloc[:,0].to_numpy()
        ct_dic = label2dic(cts)
        nfeatures, nct = train_data.shape[0], len(ct_dic)
        writer = SummaryWriter('/home/renpf/HPV/res/doublet/nn/' + str(nfeature) + '_' + str(ncell) + '/')
        network = train(train_data, cts, nfeatures, nct, device, lr, n_epoch)
        torch.save(network, '/home/renpf/HPV/res/doublet/nn/' + str(nfeature) + '_' + str(ncell) + '/' + 'lr_' + str(lr) + '_epoch_' + str(n_epoch) + '.pt')  
        test_data = read_expr('/home/renpf/HPV/res/doublet/nn/' + str(nfeature) + '_' + str(ncell) + '/expr.txt')
        pred_labels, pred_prob = test(test_data, network, ct_dic)
        pd.DataFrame(pred_labels).to_csv('/home/renpf/HPV/res/doublet/nn/' + str(nfeature) + '_' + str(ncell) + '/' +  'pred_labels.txt',index=False)
        pd.DataFrame(pred_prob).to_csv('/home/renpf/HPV/res/doublet/nn/' + str(nfeature) + '_' + str(ncell) + '/' + 'pred_prob.txt',index=False)

Begin training


100%|███████████████████████████████████████| 2000/2000 [14:44<00:00,  2.26it/s]


0.9819196428571428
Finish Training
Begin training


100%|█████████████████████████████████████| 4000/4000 [5:30:06<00:00,  4.95s/it]


0.9522792022792023
Finish Training


### train

In [1]:
import torch
from torch.autograd import Function
import torch.nn as nn
import glob
import ntpath
import datatable as dt
import pandas as pd
import numpy as np
from functools import reduce
from torch.utils.data import Dataset
from tqdm import tqdm
from torch.utils.data import DataLoader
import torch.nn.functional as F
import argparse as ap
import pickle
from collections import Counter
import os
from tensorboardX import SummaryWriter
from sklearn import metrics

In [2]:
def read_expr(path):
    expr = dt.fread(path, header=True, sep='\t', nthreads=6)
    expr = expr.to_pandas()
    expr.index = expr.loc[:, 'Gene']
    del expr['Gene']
    expr = expr.astype(float)
    return expr

def label2dic(label):
    label_set = list(set(label))
    dic = {}
    for i in range(len(label_set)):
        dic[label_set[i]] = i
    return dic

In [3]:
class Datasets(Dataset):
    def __init__(self, data, celltypes):
        class_labels = [ct_dic[i] for i in celltypes]
        self.class_labels = torch.as_tensor(class_labels)
        self.expr = data.values

    def __getitem__(self, index):
        return torch.as_tensor(self.expr[:, index]), self.class_labels[index]

    def __len__(self):
        return len(self.class_labels)

In [4]:
class net(nn.Module):
    def __init__(self, nfeatures, nct):
        super(net, self).__init__()
        self.nct = nct
        self.class_classifier = nn.Sequential(
            nn.Linear(in_features=nfeatures, out_features=1000), 
            nn.ReLU(),
            nn.Dropout(), 
            nn.Linear(in_features=1000, out_features=nct))
    def forward(self, input_data):
        class_logits = self.class_classifier(input_data)
        class_predictions = F.softmax(class_logits, dim=1)
        return class_logits

In [5]:
def train(train_data, celltypes, nfeatures, nct, device, lr, n_epoch):
    network = net(nfeatures, nct).train()
    train_data = Datasets(train_data, celltypes)
    lr = lr
    n_epoch = n_epoch
    batch_size = 256
    optimizer = torch.optim.Adam(network.parameters(), lr=lr)
    loss_class = nn.CrossEntropyLoss()
    network = network.to(device)
    loss_class = loss_class.to(device)
    train_loader = DataLoader(dataset=train_data,
                              batch_size=batch_size,
                              shuffle=True,
                              drop_last=True)

    len_train_loader = len(train_loader) 
    print('Begin training')
    for epoch in tqdm(range(n_epoch)):
        loader_iter = iter(train_loader)
        output_temp = []
        label_temp = []
        for i in range(len_train_loader):
            expr, class_label = loader_iter.next()
            expr = expr.to(device)
            expr = expr.float()
            class_label = class_label.to(device)
            class_output = network(input_data=expr)
            err_class = loss_class(class_output, class_label)
            output_temp.append(class_output.argmax(dim=1).cpu().numpy().tolist())
            label_temp.append(class_label.cpu())
        output_temp = [i for j in output_temp for i in j ]
        label_temp = [i for j in label_temp for i in j]
        acc = metrics.accuracy_score(label_temp, output_temp)
        writer.add_scalar('Loss/' + 'nfeature_' + str(nfeature) + '_ncell_' + str(ncell) + '_lr_' + str(lr) + '_epoch_' + str(n_epoch), err_class, epoch)
        writer.add_scalar('Acc/' + 'nfeature_' + str(nfeature) + '_ncell_' + str(ncell) +'_lr_' + str(lr) + '_epoch_' + str(n_epoch), acc, epoch)
        optimizer.zero_grad()
        err_class.backward()
        optimizer.step()
    print(acc)
    writer.close()      
    print('Finish Training')
    return network

In [6]:
class datasets(Dataset):
    def __init__(self, data):
        self.expr = data.values

    def __getitem__(self, index):
        return torch.as_tensor(self.expr[:, index])

    def __len__(self):
        return self.expr.shape[1]

In [7]:
def test(test_df, network, ct_dic):
    test_dat = datasets(test_df)
    pred_prob = []
    ct_dic_rev = {v: k for k, v in ct_dic.items()}
    test_loader = DataLoader(dataset=test_dat,
                             batch_size=test_df.shape[1],
                             shuffle=False)
    with torch.no_grad():
        pred_labels = []
        disease_labels = []
        for batch in test_loader:
            expr = batch
            expr = expr.float()
            expr = expr.to(device)
            class_output = network(expr)
            pred_labels.append(
                class_output.argmax(dim=1).cpu().numpy().tolist())
            pred_prob.append(F.softmax(class_output,dim=1).cpu().numpy())
        pred_labels = [ct_dic_rev[i] for item in pred_labels for i in item]
        pred_prob = pd.DataFrame(reduce(pd.concat, pred_prob))
        pred_prob.index = test_df.columns
        pred_prob.columns = ct_dic.keys()
        return pred_labels, pred_prob

In [8]:
nfeatures = [10000]
ncells = [100,1000]
device = torch.device('cuda:1')
lr = 0.0005

for nfeature in nfeatures:
    for ncell in ncells:
        if ncell == 100:
            n_epoch = 1000
        else:
            n_epoch = 3000
        train_data = read_expr('/home/renpf/HPV/res/doublet/nn/' + str(nfeature) + '_' + str(ncell) + '/train.txt')
        cts = pd.read_csv('/home/renpf/HPV/res/doublet/nn/' + str(nfeature) + '_' + str(ncell) + '/ct.txt', header=None)
        cts = cts.iloc[:,0].to_numpy()
        ct_dic = label2dic(cts)
        nfeatures, nct = train_data.shape[0], len(ct_dic)
        writer = SummaryWriter('/home/renpf/HPV/res/doublet/nn/' + str(nfeature) + '_' + str(ncell) + '/')
        network = train(train_data, cts, nfeatures, nct, device, lr, n_epoch)
        torch.save(network, '/home/renpf/HPV/res/doublet/nn/' + str(nfeature) + '_' + str(ncell) + '/' + 'lr_' + str(lr) + '_epoch_' + str(n_epoch) + '.pt')  
        test_data = read_expr('/home/renpf/HPV/res/doublet/nn/' + str(nfeature) + '_' + str(ncell) + '/expr.txt')
        pred_labels, pred_prob = test(test_data, network, ct_dic)
        pd.DataFrame(pred_labels).to_csv('/home/renpf/HPV/res/doublet/nn/' + str(nfeature) + '_' + str(ncell) + '/' +  'pred_labels.txt',index=False)
        pd.DataFrame(pred_prob).to_csv('/home/renpf/HPV/res/doublet/nn/' + str(nfeature) + '_' + str(ncell) + '/' + 'pred_prob.txt',index=False)

Begin training


100%|███████████████████████████████████████| 1000/1000 [16:55<00:00,  1.02s/it]


0.9986607142857142
Finish Training
Begin training


100%|█████████████████████████████████████| 3000/3000 [9:55:46<00:00, 11.92s/it]


0.9821269586894587
Finish Training
