In [34]:
import math
import argparse
import pandas as pd
import random

import matplotlib.pyplot as plt
import torch
import numpy as np

from torchvision import transforms, utils
from torch import nn 
import torch.optim as optim
import torch.backends.cudnn as cudnn
import torch.nn.functional as F

import import_ipynb
import metrics
import data_transformation
import train_validation
import model_architecture
import rank_model
import utils
import unittest
from sklearn.model_selection import train_test_split  

In [22]:
def test_modules(module_test):
    """Run unittest in imported files"""
    
    suite = unittest.TestLoader().loadTestsFromModule(module_test)
    unittest.TextTestRunner(verbosity=1).run(suite)

In [23]:
# Instantiate the test objects provided by the helper notebooks and run each one
testmodules = [
    data_transformation.data_class_test(),
    metrics.metrics_test(),
    utils.utils_test(),
]
for test in testmodules:
    test_modules(test)

..
----------------------------------------------------------------------
Ran 2 tests in 28.736s

OK
..
----------------------------------------------------------------------
Ran 2 tests in 0.090s

OK
..
----------------------------------------------------------------------
Ran 2 tests in 0.157s

OK


In [24]:
# Run configuration read by the data-loader, training and prediction cells
params = dict(
    batch_size=32,
    # checkpoint file name (commented-out alternative: 'resnet18_02dropout.chkpt')
    save_model_path="conv3x3_7layer.chkpt",
    epoch=1,
    # architecture key (commented-out alternative: 'ResNet18')
    model="conv3x3",
    dropout=0.7,
)
# TODO add model description to checkpoint

In [35]:
# --- Load and encode the training data for one allele -----------------------
data_class = data_transformation.data_transformation(path_data = "/data/data_curated_20180219/curated_training_data_no_mass_spec.csv",
                                                     path_mhc = "/data/aligned_mhc_dataset.csv",
                                                     allele_name = "HLA-A*02:01",
                                                     quant_data = True,
                                                     encoding = "one-hot")

pep, mhc, target = data_class.__getitem__()

# Add an axis at position 1 to both encodings and join them along that axis,
# so each sample holds the peptide and the MHC encoding side by side.
pep = np.expand_dims(pep, axis=1)
mhc = np.expand_dims(mhc, axis=1)
inp = np.hstack((pep, mhc))
print(inp.shape)

X_train, X_test, y_train, y_test = train_test_split(inp, target.T, test_size=0.2, random_state=42)

# --- Build the model from `params` -------------------------------------------
# The architecture and dropout come from `params` (same dispatch as in
# `predict_mhc`), so the checkpoint written during training always matches the
# network that is rebuilt for prediction.
if params['model'] == 'conv3x3':
    model = model_architecture.conv3x3(inputchannel = np.size(X_train, 3),
                                       L = np.size(X_train, 2),
                                       dropout = params['dropout'],
                                       dropoutearly = 0.2)
elif params['model'] == 'convnet':
    model = model_architecture.convnet(L = np.size(X_train, 2),
                                       I = np.size(X_train, 3),
                                       dropout = params['dropout'])
elif params['model'] == 'ResNet18':
    model = model_architecture.ResNet(inputchannel = np.size(X_train, 3),
                                      block = model_architecture.BasicBlock,
                                      num_blocks = [2,2,2,2],
                                      num_classes = 1,
                                      dropout = params['dropout'])
else:
    raise ValueError("Unknown model in params['model']: " + repr(params['model']))

pytorch_total_params = sum(p.numel() for p in model.parameters())
print("Model parameters: " + str(pytorch_total_params) )

# --- Select the device --------------------------------------------------------
if torch.cuda.device_count() > 0:
    # Query the GPU once so the printed index is the one that is actually used.
    gpu_index = utils.pick_gpu_lowest_memory()
    print('Using GPU' + str(gpu_index))
    device = torch.device('cuda:' + str(gpu_index))
else:
    print('Using CPU')
    device = torch.device('cpu')

model = model.to(device)
criterion = metrics.select_criterion('MSE')
# No learning rate is passed, so Adam's default is used.
optimizer = optim.Adam(
        filter(lambda x: x.requires_grad, model.parameters()),
        betas=(0.9, 0.98), eps=1e-09, weight_decay = 0.0)

# --- Wrap the split data in datasets / loaders --------------------------------
train_data = torch.utils.data.TensorDataset(torch.from_numpy(X_train).float(), torch.from_numpy(y_train))
eval_data = torch.utils.data.TensorDataset(torch.from_numpy(X_test).float(), torch.from_numpy(y_test))

train_dataloader = torch.utils.data.DataLoader(train_data,
                              batch_size = params['batch_size'],
                              shuffle = True,
                              drop_last = True)
# NOTE(review): shuffle/drop_last on the evaluation loader means the last
# incomplete batch is never evaluated and the dropped samples vary per epoch.
# Left unchanged because train_validation may rely on full batches - confirm.
eval_dataloader = torch.utils.data.DataLoader(eval_data,
                              batch_size = params['batch_size'],
                              shuffle = True,
                              drop_last = True)

(11705, 2, 34, 20)
Model parameters: 88640
Using CPU


In [None]:
# Run training with the configured checkpoint path and epoch count; the call
# returns six values that are unpacked into the names below.
(valid_losss, train_losses, valid_accus,
 train_accus, true_valid, pred_valid) = train_validation.start_training(
    params['save_model_path'],
    params['epoch'],
    model,
    train_dataloader,
    eval_dataloader,
    optimizer,
    device,
    criterion,
)

In [None]:
# Same arguments as the start_training cell, routed through continue_training.
# NOTE(review): presumably resumes from the checkpoint at
# params['save_model_path'] - confirm in train_validation.
(valid_losss, train_losses, valid_accus,
 train_accus, true_valid, pred_valid) = train_validation.continue_training(
    params['save_model_path'],
    params['epoch'],
    model,
    train_dataloader,
    eval_dataloader,
    optimizer,
    device,
    criterion,
)

## Predict new Values

In [26]:
def predict_mhc(allele_name,
                path_data = "/data/test_data_all_allele.csv",
                path_mhc = "/data/aligned_mhc_dataset.csv",
                flurry_data_path = "C:/Users/paul_/OneDrive/Desktop/master-thesis/data/test_data_all_allele.csv"):
    """Score the saved model and the mhcflurry baseline on one allele.

    Rebuilds the architecture selected by the notebook-level ``params`` dict,
    runs ``train_validation.prediction_only`` with the checkpoint at
    ``params['save_model_path']`` and passes both the model output and the
    mhcflurry output to ``rank_model.root_mean_squared``.

    Parameters
    ----------
    allele_name : str
        Allele identifier, e.g. ``"HLA-A*02:01"``.
    path_data : str, optional
        Test data location handed to ``data_transformation.data_transformation``.
    path_mhc : str, optional
        MHC dataset location handed to ``data_transformation.data_transformation``.
    flurry_data_path : str, optional
        CSV file read with pandas and handed to ``rank_model.mhcflurry_test``.
        The default is the original machine-specific location; pass another
        path to run the function elsewhere.

    Returns
    -------
    tuple
        ``(my, flurry)``: the results of ``rank_model.root_mean_squared`` for
        this notebook's model and for mhcflurry, in that order.

    Raises
    ------
    ValueError
        If ``params['model']`` is not one of ``'conv3x3'``, ``'convnet'`` or
        ``'ResNet18'``.
    """
    data_class = data_transformation.data_transformation(path_data = path_data,
                                                         path_mhc = path_mhc,
                                                         allele_name = allele_name,
                                                         quant_data = True,
                                                         encoding = "one-hot")

    pep, mhc, target = data_class.__getitem__()

    # Add an axis at position 1 to both encodings and join them along it.
    pep = np.expand_dims(pep, axis=1)
    mhc = np.expand_dims(mhc, axis=1)
    inp = np.hstack((pep, mhc))
    print(inp.shape)

    if params['model'] == 'conv3x3':
        model = model_architecture.conv3x3(inputchannel = np.size(inp, 3),
                                           L = np.size(inp, 2),
                                           dropout = params['dropout'],
                                           dropoutearly = 0.2)
    elif params['model'] == 'convnet':
        model = model_architecture.convnet(L = np.size(inp, 2),
                                           I = np.size(inp, 3),
                                           dropout = params['dropout'])
    elif params['model'] == 'ResNet18':
        model = model_architecture.ResNet(inputchannel = np.size(inp, 3),
                                          block = model_architecture.BasicBlock,
                                          num_blocks = [2,2,2,2],
                                          num_classes = 1,
                                          dropout = params['dropout'])
    else:
        # Without this branch an unknown key surfaced as an UnboundLocalError
        # on the first use of `model`.
        raise ValueError("Unknown model in params['model']: " + repr(params['model']))

    pytorch_total_params = sum(p.numel() for p in model.parameters())
    print("Model parameters: " + str(pytorch_total_params) )
    if torch.cuda.device_count() > 0:
        # Query the GPU once so the printed index is the one that is actually used.
        gpu_index = utils.pick_gpu_lowest_memory()
        print('Using GPU' + str(gpu_index))
        device = torch.device('cuda:' + str(gpu_index))
    else:
        print('Using CPU')
        device = torch.device('cpu')

    model = model.to(device)
    criterion = metrics.select_criterion('MSE')
    # prediction_only takes an optimizer argument, so one is built here as well.
    optimizer = optim.Adam(
            filter(lambda x: x.requires_grad, model.parameters()),
            betas=(0.9, 0.98), eps=1e-09, weight_decay = 0.0)

    test_data = torch.utils.data.TensorDataset(torch.from_numpy(inp).float(), torch.from_numpy(target.T))

    # One sample per batch, in file order.
    test_dataloader = torch.utils.data.DataLoader(test_data,
                                  batch_size = 1,
                                  shuffle = False,
                                  drop_last = True)

    a, b = train_validation.prediction_only(params['save_model_path'], model, test_dataloader, optimizer, device, criterion)
    out_pred = np.concatenate(a).ravel()
    out_true = np.concatenate(b).ravel()
    # Apply the reverse log transformation to both columns before scoring.
    mymodel_out = pd.DataFrame(list(zip(out_pred, out_true)), columns =['pred', 'true']).apply(rank_model.reverse_log_transformation)

    flurry_data = pd.read_csv(flurry_data_path)
    flurry_out = rank_model.mhcflurry_test(flurry_data, allele_name)

    my_score = rank_model.root_mean_squared(mymodel_out, allele_name, "my")
    flurry_score = rank_model.root_mean_squared(flurry_out, allele_name, "flurry")
    return my_score, flurry_score

In [27]:
# Evaluate each allele and merge both results into one dictionary
mhc_names = ["HLA-A*01:01", "HLA-A*02:01", "HLA-B*07:02"]
combined_dict = {}
for i in mhc_names:
    my, flurry = predict_mhc(i)
    # merge in the same order as before: own model first, then mhcflurry
    for scores in (my, flurry):
        combined_dict.update(scores)

(11, 2, 34, 20)
Model parameters: 88640
Using CPU
Checkpoint found and loaded - Predict values
[ Epoch 1 ]


                                                                                                                       

  - (Test) Elapse: 0.006236271063486735 min
(63, 2, 34, 20)
Model parameters: 88640
Using CPU
Checkpoint found and loaded - Predict values
[ Epoch 1 ]


                                                                                                                       

  - (Test) Elapse: 0.024543023109436034 min
(12, 2, 34, 20)
Model parameters: 88640
Using CPU
Checkpoint found and loaded - Predict values
[ Epoch 1 ]


                                                                                                                       

  - (Test) Elapse: 0.005466647942860921 min


In [28]:
# Display the collected scores; keys are the allele name suffixed with "my" or "flurry"
combined_dict

{'HLA-A*01:01my': 9116.350263629933,
 'HLA-A*01:01flurry': 10153.670854486432,
 'HLA-A*02:01my': 26951.397619389187,
 'HLA-A*02:01flurry': 26411.98437729448,
 'HLA-B*07:02my': 8443.380269885078,
 'HLA-B*07:02flurry': 9349.510131293062}

In [29]:
# Mean over all entries whose key contains "my" (this notebook's model)
my_scores = [score for label, score in combined_dict.items() if 'my' in label.lower()]
a = np.mean(my_scores)
a

14837.042717634731

In [30]:
# Mean over all entries whose key contains "flurry" (mhcflurry baseline)
flurry_scores = [score for label, score in combined_dict.items() if 'flurry' in label.lower()]
b = np.mean(flurry_scores)
b

15305.055121024658

In [33]:
# Relative difference of a versus b in percent (negative when a is smaller than b)
100 * (a / b - 1)

-3.05789426884856

In [20]:
# NOTE(review): same expression as the cell before it, but with an older execution
# count (In [20]) and a different stored output - presumably left over from an
# earlier run; re-run or remove so the notebook executes top to bottom.
(a/b - 1) * 100

-4.276630118220703