In [3]:
import math
import argparse
import pandas as pd
import random

import matplotlib.pyplot as plt
import torch
import numpy as np

from torchvision import transforms, utils
from torch import nn 
import torch.optim as optim
import torch.backends.cudnn as cudnn
import torch.nn.functional as F

import import_ipynb
import metrics
import data_transformation
import train_validation
import model_architecture
import rank_model
import utils
import unittest
from sklearn.model_selection import train_test_split  

In [None]:
def test_modules(module_test):
    """Run unittest in imported files"""
    
    suite = unittest.TestLoader().loadTestsFromModule(module_test)
    unittest.TextTestRunner(verbosity=1).run(suite)

In [None]:
# One object per imported project module that ships tests; each is handed
# to test_modules() in turn.
testmodules = [
    data_transformation.data_class_test(),
    metrics.metrics_test(),
    utils.utils_test(),
]
for test in testmodules:
    test_modules(test)

In [2]:
# Run configuration read by the training and prediction cells.
params = dict(
    batch_size=32,                               # batch size of the train/eval DataLoaders
    save_model_path="resnet18_02dropout.chkpt",  # checkpoint file handed to train_validation
    epoch=1,                                     # epoch argument handed to train_validation
    model="ResNet18",                            # descriptive label; not read by the visible cells
    dropout=0.2,                                 # dropout used when predict_mhc builds the ResNet
)
# TODO add model description to checkpoint

In [None]:
# Load and encode the training data for one allele.
data_class = data_transformation.data_transformation(path_data = "/data/data_curated_20180219/curated_training_data_no_mass_spec.csv",
                                                     path_mhc = "/data/aligned_mhc_dataset.csv",
                                                     allele_name = "HLA-A*02:01",
                                                     quant_data = True,
                                                     encoding = "one-hot")

pep, mhc, target = data_class.__getitem__()

# Give peptide and MHC encodings a new axis 1 and stack them along it, so
# every sample carries both encodings side by side.
pep = np.expand_dims(pep, axis=1)
mhc = np.expand_dims(mhc, axis=1)
inp = np.hstack((pep, mhc))
print(inp.shape)

# Fixed random_state keeps the 80/20 split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(inp, target.T, test_size=0.2, random_state=42)

# NOTE(review): this builds `convnet` with dropout 0.45, while `params`
# describes a ResNet18 with dropout 0.2 and predict_mhc() loads
# params['save_model_path'] into model_architecture.ResNet. Confirm which
# architecture is meant to be trained and saved under that checkpoint path.
model = model_architecture.convnet(I = np.size(X_train, 3),
                                   L = np.size(X_train, 2),
                                   dropout = 0.45)

pytorch_total_params = sum(p.numel() for p in model.parameters())
print("Model parameters: " + str(pytorch_total_params) )
if torch.cuda.device_count() > 0:
    # Query the GPU helper once so the printed id is the device actually used.
    gpu_id = utils.pick_gpu_lowest_memory()
    print('Using GPU' + str(gpu_id))
    device = torch.device('cuda:' + str(gpu_id))
else:
    print('Using CPU')
    device = torch.device('cpu')

model = model.to(device)
criterion = metrics.select_criterion('MSE')
# Only parameters that require gradients are optimised; the learning rate is
# left at Adam's default.
optimizer = optim.Adam(
        filter(lambda x: x.requires_grad, model.parameters()),
        betas=(0.9, 0.98), eps=1e-09, weight_decay = 0.0)


train_data = torch.utils.data.TensorDataset(torch.from_numpy(X_train).float(), torch.from_numpy(y_train))
eval_data = torch.utils.data.TensorDataset(torch.from_numpy(X_test).float(), torch.from_numpy(y_test))

train_dataloader = torch.utils.data.DataLoader(train_data,
                              batch_size = params['batch_size'],
                              shuffle = True,
                              drop_last = True)
# Validation must see every held-out sample in a stable order: no shuffling,
# and the final partial batch is kept instead of being dropped.
eval_dataloader = torch.utils.data.DataLoader(eval_data,
                              batch_size = params['batch_size'],
                              shuffle = False,
                              drop_last = False)

In [None]:
# Run train_validation.start_training with the objects prepared in the setup
# cell and unpack its six return values (losses, accuracies, and the
# validation targets/predictions, going by the names they are bound to).
(valid_losss, train_losses,
 valid_accus, train_accus,
 true_valid, pred_valid) = train_validation.start_training(
    params['save_model_path'],
    params['epoch'],
    model,
    train_dataloader,
    eval_dataloader,
    optimizer,
    device,
    criterion,
)

In [None]:
# Same arguments as the start_training cell, but routed through
# train_validation.continue_training (presumably resuming from the checkpoint
# at params['save_model_path'] - confirm against train_validation).
(valid_losss, train_losses,
 valid_accus, train_accus,
 true_valid, pred_valid) = train_validation.continue_training(
    params['save_model_path'],
    params['epoch'],
    model,
    train_dataloader,
    eval_dataloader,
    optimizer,
    device,
    criterion,
)

## Predict new values

In [4]:
def predict_mhc(allele_name,
                path_data="/data/test_data_all_allele.csv",
                path_mhc="/data/aligned_mhc_dataset.csv"):
    """Score the saved model and the MHCflurry baseline on one allele.

    The test set for ``allele_name`` is encoded, run through a ResNet whose
    weights come from ``params['save_model_path']``, and the resulting
    predictions are compared with the values that ``rank_model.mhcflurry_test``
    produces for the same CSV.

    Parameters
    ----------
    allele_name : str
        Allele to evaluate, e.g. ``"HLA-A*02:01"``.
    path_data : str, optional
        CSV with the test measurements. It is used both for the model input
        and for the MHCflurry comparison, so the two always read the same file.
    path_mhc : str, optional
        CSV with the aligned MHC sequences.

    Returns
    -------
    tuple
        ``(my_scores, flurry_scores)`` as returned by
        ``rank_model.mean_squared`` with the tags ``"my"`` and ``"flurry"``;
        the notebook merges both into a dict with ``dict.update``.

    Notes
    -----
    Reads the notebook-level ``params`` dict (``'dropout'`` and
    ``'save_model_path'``).
    """
    data_class = data_transformation.data_transformation(path_data = path_data,
                                                         path_mhc = path_mhc,
                                                         allele_name = allele_name,
                                                         quant_data = True,
                                                         encoding = "one-hot")

    pep, mhc, target = data_class.__getitem__()

    # Add a new axis 1 to both encodings and stack them along it.
    pep = np.expand_dims(pep, axis=1)
    mhc = np.expand_dims(mhc, axis=1)
    inp = np.hstack((pep, mhc))
    print(inp.shape)

    model = model_architecture.ResNet(inputchannel = np.size(inp, 3),
                                      block = model_architecture.BasicBlock,
                                      num_blocks = [2,2,2,2],
                                      num_classes = 1,
                                      dropout = params['dropout'])

    pytorch_total_params = sum(p.numel() for p in model.parameters())
    print("Model parameters: " + str(pytorch_total_params) )
    if torch.cuda.device_count() > 0:
        # Query the GPU helper once so the printed id is the device actually used.
        gpu_id = utils.pick_gpu_lowest_memory()
        print('Using GPU' + str(gpu_id))
        device = torch.device('cuda:' + str(gpu_id))
    else:
        print('Using CPU')
        device = torch.device('cpu')

    model = model.to(device)
    criterion = metrics.select_criterion('MSE')
    # prediction_only takes an optimizer argument, so one is built even though
    # this function itself never steps it.
    optimizer = optim.Adam(
            filter(lambda x: x.requires_grad, model.parameters()),
            betas=(0.9, 0.98), eps=1e-09, weight_decay = 0.0)

    test_data = torch.utils.data.TensorDataset(torch.from_numpy(inp).float(), torch.from_numpy(target.T))

    # batch_size=1 means every batch is full, so nothing is ever dropped;
    # order is preserved so predictions line up with the input rows.
    test_dataloader = torch.utils.data.DataLoader(test_data,
                                  batch_size = 1,
                                  shuffle = False,
                                  drop_last = False)

    predictions, targets = train_validation.prediction_only(params['save_model_path'], model, test_dataloader,
                                                            optimizer, device, criterion)
    out_pred = np.concatenate(predictions).ravel()
    out_true = np.concatenate(targets).ravel()
    mymodel_out = pd.DataFrame(list(zip(out_pred, out_true)),
                               columns =['pred', 'true']).apply(rank_model.reverse_log_transformation)

    # Read the baseline input from the same CSV as above rather than from a
    # machine-specific absolute path.
    flurry_data = pd.read_csv(path_data)
    flurry_out = rank_model.mhcflurry_test(flurry_data, allele_name)

    return (rank_model.mean_squared(mymodel_out, allele_name, "my"),
            rank_model.mean_squared(flurry_out, allele_name, "flurry"))

In [5]:
# Evaluate every allele and merge the two score mappings returned by
# predict_mhc (own model first, then MHCflurry) into a single dict.
mhc_names = ["HLA-A*01:01", "HLA-A*02:01", "HLA-B*07:02"]
combined_dict = {}
for allele in mhc_names:
    for scores in predict_mhc(allele):
        combined_dict.update(scores)

(11, 2, 34, 20)
Model parameters: 11190657
Using CPU
Checkpoint found and loaded - Predict values
[ Epoch 1 ]


                                                                                                                       

  - (Test) Elapse: 0.01226200262705485 min


Using TensorFlow backend.
Instructions for updating:
If using Keras pass *_constraint arguments to layers.



(63, 2, 34, 20)
Model parameters: 11190657
Using CPU
Checkpoint found and loaded - Predict values
[ Epoch 1 ]


                                                                                                                       

  - (Test) Elapse: 0.0748319427172343 min
(12, 2, 34, 20)
Model parameters: 11190657
Using CPU
Checkpoint found and loaded - Predict values
[ Epoch 1 ]


                                                                                                                       

  - (Test) Elapse: 0.013825551668802897 min


In [6]:
# Show the collected scores; keys are the allele name followed by "my" or "flurry".
combined_dict

{'HLA-A*01:01my': 153330671.59871185,
 'HLA-A*01:01flurry': 103097031.82124723,
 'HLA-A*02:01my': 639035615.0296553,
 'HLA-A*02:01flurry': 697592918.7464478,
 'HLA-B*07:02my': 93661146.81511058,
 'HLA-B*07:02flurry': 87413339.6951516}

In [9]:
# Square root of the unweighted mean over every score whose key contains
# 'my' (case-insensitive), i.e. the entries of this notebook's own model.
np.sqrt(np.mean([
    score
    for name, score in combined_dict.items()
    if 'my' in name.lower()
]))

17185.531060007794

In [10]:
# Square root of the unweighted mean over every score whose key contains
# 'flurry' (case-insensitive), i.e. the MHCflurry baseline entries.
np.sqrt(np.mean([
    score
    for name, score in combined_dict.items()
    if 'flurry' in name.lower()
]))

17205.651109086677