In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib  inline
# Load the autoreload extension and reload all modules before each execution,
# so edits to the imported project modules are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [None]:
import os
from copy import deepcopy
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from modelv1 import Model
from proteomics_preprocessing import *
from utils.proteomics_utils import *
from mhc_analysis_utils import *

# Data Requirements

### Swiss-Prot
Download and extract the desired Swiss-Prot release (we use 2018_10) from the UniProt ftp server. Save the contained uniprot_sprot.fasta in the ./data directory

### MHCFlurry18 [ODonnell2018]
Download curated MHC I dataset by [ODonnell2018] from https://data.mendeley.com/datasets/8pz43nvvxh/1/files/e1916ecf-b544-40e6-b1fe-e0024bea76a7/data_curated.20180219.tar.bz2?dl=1 and extract the archive in ./data

### IEDB16_I [Zhao2018]
Download the curated MHC I test data set by [Zhao2018] from https://doi.org/10.1371/journal.pcbi.1006457.s009 and extract the archive as well as the contained archives in the ./data directory

### HPV [Bonsack2019]
For the user's convenience the data from table S2 from Supplementary Material of [Bonsack2019] is provided as HPV_data.csv in ./git_data

### Kim14 [Kim2014]
Download the curated MHC I dataset by [Kim2014] from http://tools.iedb.org/static/main/benchmark_mhci_reliability.tar.gz and extract the archive in ./data

### Wang10 [Wang2010]
Download the MHC II dataset prepared by [Wang2010] from http://tools.iedb.org/static/download/classII_binding_data_Nov_16_2009.tar.gz and extract the archive in ./data

###  IEDB16_II [Zhao2018]
Download the curated MHC II test data set by [Zhao2018] from https://doi.org/10.1371/journal.pcbi.1006457.s010 and extract the archive in the ./data directory


**References**
- [ODonnell2018] T. J. O’Donnell, A. Rubinsteyn, M. Bonsack, A. B. Riemer, U. Laserson, and J. Hammerbacher, “MHCflurry: Open-Source Class I MHC Binding Affinity Prediction,” Cell Systems, vol. 7, no. 1, pp. 129–132.e4, Jul. 2018. [Online].
Available: https://doi.org/10.1016/j.cels.2018.05.014
- [Zhao2018] W. Zhao and X. Sher, “Systematically benchmarking peptide-MHC binding predictors: From synthetic to naturally processed epitopes,” PLOS Computational Biology, vol. 14, no. 11, p. e1006457, Nov. 2018. [Online]. Available: https://doi.org/10.1371/journal.pcbi.1006457
- [Kim2014] Y. Kim, J. Sidney, S. Buus, A. Sette, M. Nielsen, and B. Peters, “Dataset size and composition impact the reliability of performance benchmarks for peptide-MHC binding predictions,” BMC Bioinformatics, vol. 15, no. 1, p. 241, 2014. [Online]. Available: https://doi.org/10.1186/1471-2105-15-241
- [Bonsack2019] M. Bonsack, S. Hoppe, J. Winter, D. Tichy, C. Zeller, M. D. Küpper, E. C. Schitter, R. Blatnik, and A. B. Riemer, “Performance Evaluation of MHC Class-I Binding Prediction Tools Based on an Experimentally Validated MHC–Peptide Binding Data Set,” Cancer Immunology Research, vol. 7, no. 5, pp. 719–736, Mar. 2019. [Online]. Available: https://doi.org/10.1158/2326-6066.cir-18-0584
- [Wang2010] P. Wang, J. Sidney, Y. Kim, A. Sette, O. Lund, M. Nielsen, and B. Peters, “Peptide binding predictions for HLA DR, DP and DQ molecules,” BMC Bioinformatics, vol. 11, no. 1, p. 568, 2010. [Online]. Available: https://doi.org/10.1186/1471-2105-11-568

# Data Preprocessing

## Binding affinity datasets
Create a directory for each dataset with subdirectories for each allele. The output of the preprocessing saved in each allele subdirectory is structured as follows:

- *tok.npy* sequences as list of numerical indices (mapping is provided by *tok_itos.npy*)
- *label.npy* label as list of binding affinity values (mapping is provided by *label_itos.npy*)
- *train_IDs.npy/val_IDs.npy/test_IDs.npy* numerical indices identifying training/validation/test set by specifying rows in tok.npy
- *train_IDs_prev.npy/val_IDs_prev.npy/test_IDs_prev.npy* original non-numerical IDs for all entries that were ever assigned to the respective sets (used to obtain consistent splits for downstream tasks)
- *ID.npy* original non-numerical IDs for all entries in tok.npy

To ease the handling of a multitude of different alleles, they are ranked by the number of peptides present in the respective training dataset. Below, the allele subdirectories are therefore named "allelex" for the xth allele in the ranking, except for the HPV dataset where the subdirectories are named after the alleles directly.

### IEDB16_I


In [None]:
# Preprocess the IEDB16_I benchmark into one "allele<rank>" subdirectory per allele.
dataset_path = Path("./reg_IEDB16_I")
dataset_path.mkdir(exist_ok=True)

prep = Preprocess()

# Allele ranks 0..33, kept as numpy integers exactly as they are passed on.
allele_ranks = np.arange(0, 34).astype(int)
for allele in allele_ranks:
    allele_folder = dataset_path / "allele{}".format(allele)
    prep.clas_mhc_i_zhao(
        allele,
        working_folder=allele_folder,
        pretrained_folder="../git_data/lm_netchop_peptides",
        train_set="MHCFlurry18",
    )

### HPV

In [None]:
# Preprocess the HPV benchmark; subdirectories are named after the alleles themselves.
dataset_path = Path("./reg_HPV")
dataset_path.mkdir(exist_ok=True)

prep = Preprocess()

# Each allele is listed exactly once. The original list repeated 'HLAA1', which
# only re-ran the same preprocessing into the same working folder.
hpv_alleles = ['HLAA1', 'HLAA2', 'HLAA3', 'HLAA11', 'HLAA24', 'HLAB7', 'HLAB15']
for allele in hpv_alleles:
    prep.clas_mhc_i_hpv(
        allele,
        working_folder=dataset_path / allele,
        pretrained_folder="../git_data/lm_netchop_peptides",
        train_set="MHCFlurry18",
    )

### Kim14

In [None]:
# Preprocess the Kim14 benchmark into one "allele<rank>" subdirectory per allele.
dataset_path = Path("./reg_Kim14")
dataset_path.mkdir(exist_ok=True)

prep = Preprocess()

# Allele ranks 0..52, kept as numpy integers exactly as they are passed on.
allele_ranks = np.arange(0, 53).astype(int)
for allele in allele_ranks:
    allele_folder = dataset_path / "allele{}".format(allele)
    prep.clas_mhc_kim(
        allele,
        working_folder=allele_folder,
        pretrained_folder="../git_data/lm_netchop_peptides",
    )

### IEDB16_II

In [None]:
# Preprocess the IEDB16_II (MHC class II) benchmark, one "allele<rank>" folder per allele.
dataset_path = Path("./reg_IEDB16_II")
dataset_path.mkdir(exist_ok=True)

prep = Preprocess()

# Allele ranks 0..23.
for allele in range(24):
    allele_folder = dataset_path / "allele{}".format(allele)
    prep.clas_mhc_ii(
        allele,
        working_folder=allele_folder,
        pretrained_folder="../git_data/lm_netchop_peptides",
    )

## NetChop sliced peptides for LM pretraining (optional)
To obtain proteasome sliced sequences for language model pretraining, the following steps are taken:

1. Use NetChop from http://www.cbs.dtu.dk/services/NetChop/ to obtain cleavage sites for proteins from a *.fasta file
    - **This can take a long time**
2. Slice proteins with the cleavage probability provided by NetChop
3. Tokenize the peptides
4. Perform train-validation-test set split

In [None]:
# Input: the full Swiss-Prot release; output: the human subset as a new FASTA file.
path_sprot_fasta = Path('../data/uniprot_sprot.fasta')
# Write next to the input, in '../data'. The NetChop cell below reads
# '../data/uniprot_sprot_human.fasta', so writing to './data/...' (as before)
# would leave that cell without its input file.
path_human_fasta = '../data/uniprot_sprot_human.fasta'

# filter for human proteome
df_sprot = parse_uniprot_fasta(path_sprot_fasta)
human_sprot = filter_human_proteome(df_sprot)
df_to_fasta(human_sprot, path_human_fasta)

In [None]:
# Location of the NetChop executable (tcsh script)
netchop_path = '../../../software/NetChop/netchop-3.1/netchop'
# Human proteome *.fasta file that NetChop is run on
protein_fasta_file = '../data/uniprot_sprot_human.fasta'

# NetChop-related settings forwarded unchanged to Preprocess.lm_netchop
netchop_settings = dict(
    netchop_min_length=8,
    netchop_max_length=20,
    netchop_repeats=30,
)

# Load and clean proteins, run NetChop as subprocess, slice peptides according
# to the NetChop output and save the tokenized peptides in the working folder.
load = Preprocess()
load.lm_netchop(
    working_folder="./lm_netchop_peptides",
    existing_netchop_peptides=None,
    netchop_path=netchop_path,
    protein_fasta_file=protein_fasta_file,
    ignore_clusters=True,
    **netchop_settings
)

# Downstream Training and Evaluation

from_scratch is set to True for training from scratch and set to False for using a language model. By default we will use the provided language model that was pretrained on Netchop peptides (../git_data/lm_netchop_peptides)

The output is logged in logfile.log in the working directory. If eval_on_test and export_preds are set to True, the individual predictions on the test set are exported as preds_valid.npz.
If eval_on_val_test and export_preds are set to True, the validation predictions are stored as preds_valid.npz and the test predictions are stored as preds_test.npz.

**CHOOSE A DATASET**

Choices: 
- IEDB16_I
- HPV
- Kim14
- IEDB16_II

In [None]:
# Pick the benchmark to train and evaluate on (see the list of choices above).
dataset = "HPV"

# Guard against typos before any path is derived from the name.
valid_datasets = ["IEDB16_I", "HPV", "Kim14", "IEDB16_II"]
assert dataset in valid_datasets, 'dataset not in ["IEDB16_I", "HPV", "Kim14", "IEDB16_II"]'

# Preprocessed data for the chosen dataset lives in ./reg_<dataset>
data_dir = "./reg_" + dataset

### Single Model for one allele

From scratch

In [None]:
# Train on a single allele: the top-ranked one, or HLAA1 for the HPV dataset.
if dataset != "HPV":
    working_folder = data_dir + "/allele0"
else:
    working_folder = data_dir + "/HLAA1"

modelv1 = Model()

# Train from scratch (no pretrained language model), evaluate on the test set
# and export the predictions.
modelv1.generic_model(
    working_folder=working_folder,
    model_filename_prefix="fs_single",
    from_scratch=True,
    eval_on_test=True,
    export_preds=True,
    train=True,
    clas=True,
    regression=True,
    concat_train_val=True,
    emb_sz=50,
    nh=64,
    nl=1,
    bs=32,
    epochs=10,
    lr=0.05,
    wd=1e-7,
    dropout=0.1,
    interactive=False,
    metrics=[],
)

with language model

In [None]:
# Train on a single allele: the top-ranked one, or HLAA1 for the HPV dataset.
if dataset != "HPV":
    working_folder = data_dir + "/allele0"
else:
    working_folder = data_dir + "/HLAA1"

modelv1 = Model()

# Start from the pretrained language model encoder instead of training from scratch.
modelv1.generic_model(
    working_folder=working_folder,
    model_filename_prefix="lm_single",
    from_scratch=False,
    pretrained_folder="../git_data/lm_netchop_peptides",
    pretrained_model_filename="lm_1l_3_enc",
    eval_on_test=True,
    export_preds=True,
    train=True,
    clas=True,
    regression=True,
    concat_train_val=True,
    emb_sz=50,
    nh=64,
    nl=1,
    bs=32,
    epochs=10,
    lr=0.05,
    wd=1e-7,
    dropout=0.1,
    interactive=False,
    metrics=[],
)

### Ensemble
Train an ensemble of predictors for each allele.

To evaluate the ensembles, the exported predictions are collected for all alleles and stored as fs_ens_i.npz after each ensemble member has been trained. 

From Scratch

In [None]:
modelv1 = Model()

# Every subdirectory of data_dir is treated as one allele's working folder.
allele_dir_list = []
for entry in os.listdir(data_dir):
    candidate = os.path.join(data_dir, entry)
    if os.path.isdir(candidate):
        allele_dir_list.append(candidate)
n_alleles = len(allele_dir_list)

for ensemble_i in range(10):
    # Train one ensemble member per allele.
    for clas_folder in allele_dir_list:
        # model_filename_prefix should end on the ensemble index ensemble_i
        member_prefix = "fs_ens_{}".format(ensemble_i)
        modelv1.generic_model(
            working_folder=clas_folder,
            model_filename_prefix=member_prefix,
            from_scratch=True,
            eval_on_test=True,
            export_preds=True,
            train=True,
            clas=True,
            regression=True,
            concat_train_val=True,
            emb_sz=50,
            nh=64,
            nl=1,
            bs=32,
            epochs=10,
            lr=0.05,
            wd=1e-7,
            dropout=0.1,
            interactive=False,
            metrics=[],
        )

    # Collect this member's exported predictions across all alleles.
    # Be careful to give the correct preds_filename: with eval_on_test=True the
    # test predictions are saved as "preds_valid.npz"; with eval_on_val_test=True
    # they are saved as "preds_test.npz".
    if dataset != "HPV":
        collect_preds_npz(data_dir, n_alleles, subfoldername="allele",
                          ensemble_i=ensemble_i, preds_filename='preds_valid.npz')
    else:
        # HPV folders are named after the alleles, not by rank.
        collect_preds_npz(data_dir, n_alleles, subfoldername="",
                          ensemble_i=ensemble_i, preds_filename='preds_valid.npz',
                          ranked_alleles=False,
                          allele_list=['HLAA1', 'HLAA24', 'HLAB7', 'HLAA3', 'HLAA11', 'HLAA2', 'HLAB15'])

with Language Model

In [None]:
modelv1 = Model()

# Every subdirectory of data_dir is treated as one allele's working folder.
allele_dir_list = []
for entry in os.listdir(data_dir):
    candidate = os.path.join(data_dir, entry)
    if os.path.isdir(candidate):
        allele_dir_list.append(candidate)
n_alleles = len(allele_dir_list)

for ensemble_i in range(10):
    # Train one ensemble member per allele, starting from the pretrained LM encoder.
    for clas_folder in allele_dir_list:
        # model_filename_prefix should end on the ensemble index ensemble_i
        member_prefix = "lm_ens_{}".format(ensemble_i)
        modelv1.generic_model(
            working_folder=clas_folder,
            model_filename_prefix=member_prefix,
            from_scratch=False,
            pretrained_folder="../git_data/lm_netchop_peptides",
            pretrained_model_filename="lm_1l_3_enc",
            eval_on_test=True,
            export_preds=True,
            train=True,
            clas=True,
            regression=True,
            concat_train_val=True,
            emb_sz=50,
            nh=64,
            nl=1,
            bs=32,
            epochs=10,
            lr=0.05,
            wd=1e-7,
            dropout=0.1,
            interactive=False,
            metrics=[],
        )

    # Collect this member's exported predictions across all alleles.
    # Be careful to give the correct preds_filename: with eval_on_test=True the
    # test predictions are saved as "preds_valid.npz"; with eval_on_val_test=True
    # they are saved as "preds_test.npz".
    if dataset != "HPV":
        collect_preds_npz(data_dir, n_alleles, subfoldername="allele",
                          ensemble_i=ensemble_i, preds_filename='preds_valid.npz')
    else:
        # HPV folders are named after the alleles, not by rank.
        collect_preds_npz(data_dir, n_alleles, subfoldername="",
                          ensemble_i=ensemble_i, preds_filename='preds_valid.npz',
                          ranked_alleles=False,
                          allele_list=['HLAA1', 'HLAA24', 'HLAB7', 'HLAA3', 'HLAA11', 'HLAA2', 'HLAB15'])

# Evaluate

To evaluate the predictions after an ensemble of models has been trained, load the *.npz files and compile model names, predictions, targets, sequences and allele names in a DataFrame.

In [None]:
# Load predictions for all alleles, one *.npz file per ensemble member.
# The list gets its own name so that `ensemble` only ever refers to a DataFrame.
ensemble_frames = []
for f in Path(data_dir).glob("*.npz"):
    print(f)
    ensemble_frames.append(read_npz(f, ensemble=True))

# Aggregate ensembles: stack the per-member frames into one table.
ensemble = pd.concat(ensemble_frames, ignore_index=True, sort=True)

# The HPV data has no 'inequality' column.
columns = ['rank', 'ID', 'targs', 'inequality', 'model'] if dataset!="HPV" else ['rank', 'ID', 'targs', 'model']


def first_arg(x):
    """Return the first element of a group (used for columns constant within a group)."""
    return x.iloc[0]


# Metadata columns keep their first value per group; predictions are averaged.
agg_dic = {c: first_arg for c in columns}
agg_dic['preds'] = np.mean

# Keep single model predictions
single = ensemble.copy()[columns + ['preds']]
single = single.set_index(["model", "rank", "ID"])

# Simply average predictions for an ensemble predictor.
# NOTE(review): dropping the last two characters assumes model names end in
# "_<single digit>" (e.g. "fs_ens_3"); this would break for 11+ ensemble members.
ensemble["model"] = ensemble["model"].apply(lambda x: x[:-2])
ensemble = ensemble.groupby(["model", "rank", "ID"]).agg(agg_dic)
ensemble = ensemble.set_index(["model", "rank", "ID"])

result = pd.concat([ensemble, single], sort=True)

# Load tokenized sequences to merge with predictions
allele_dir_list = [os.path.join(data_dir, o) for o in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir,o))]
df_seq = []
for allele_dir in allele_dir_list:
    # os.path.basename is separator-agnostic, unlike split("/"): the paths were
    # built with os.path.join, which does not use "/" on every platform.
    allele_name = os.path.basename(allele_dir)
    # Ranked datasets use "allele<rank>" folders; HPV folders carry the allele name.
    allele = int(allele_name[len("allele"):]) if dataset!="HPV" else allele_name
    df_seq.append(load_tokenized_sequences(allele_dir, allele))
df_seq = pd.concat(df_seq, ignore_index=True)

result = result.reset_index()
if "rank" in result.columns and dataset!="HPV":
    # Align the dtype of the merge key with the integer ranks parsed above.
    result = result.astype({"rank": int})
result = result.merge(df_seq, how="inner", on=["rank","ID"])

# Get ranking and merge to restore allele names
if dataset!="HPV":
    ranking = load_ranking(dataset)
    # Shapes before/after the inner merge make dropped rows visible.
    print(result.shape)
    result = result.merge(ranking, how="inner", on="rank")
    print(result.shape)
result = result.set_index(["model", "rank", "ID"]).sort_index()

result.head()

In [None]:
print(dataset)

if dataset in ("IEDB16_I", "Kim14", "IEDB16_II"):
    # IC50-based benchmarks share the same metrics; IEDB16_I is restricted to 9-mers.
    if dataset == "IEDB16_I":
        result["mer"] = result["sequence"].apply(len)
        result_9mer = result[result["mer"] == 9]
        scored = result_9mer
    else:
        scored = result

    # overall: pooled over all alleles, one value per model (index level 0)
    print("overall AUC ROC", scored.groupby(level=0).apply(aucroc_ic50))
    print("\noverall Spearman r", scored.groupby(level=0).apply(spearmanr_eval))
    # mean: computed per (model, rank) first, then averaged per model
    print("\nmean AUC ROC", scored.groupby(level=[0, 1]).apply(aucroc_ic50).groupby(level=0).mean())
    print("\nmean Spearman r", scored.groupby(level=[0, 1]).apply(spearmanr_eval).groupby(level=0).mean())
elif dataset == "HPV":
    # overall: pooled over all alleles, one value per model
    print("overall AUC ROC", result.groupby(level=0).apply(aucroc_hpv))
    # mean: per (model, allele) first, then averaged per model
    print("\nmean AUC ROC", result.groupby(level=[0, 1]).apply(aucroc_hpv).groupby(level=0).mean())

# Language Model Training (optional- uses pretrained model by default)

Train a language model on the netchop sliced proteins generated above.

When interactive is set to True, the learning rate can be determined by a plot of the loss against the learning rate after training for one epoch (https://docs.fast.ai/callbacks.lr_finder.html).

In [None]:
# Folder holding the tokenized NetChop peptides produced by the slicing step.
lm_folder = "./lm_netchop_peptides"

modelv1 = Model()

# clas=False selects language model training rather than a downstream predictor.
modelv1.generic_model(
    clas=False,
    working_folder=lm_folder,
    model_filename_prefix="lm",
    emb_sz=50,
    nh=128,
    nl=1,
    bs=128,
    epochs=20,
    lr=0.007,
    dropout=0.5,
    early_stopping="accuracy",
    interactive=False,
)