In [None]:
# import libraries
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import csv
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score, f1_score
import pickle
import zipfile
import io
import os.path

import tensorflow as tf
print("tf.__version__ =", tf.__version__)
from tensorflow.keras.layers import Masking, Embedding, Bidirectional, LSTM
from tensorflow.keras.layers import Input, Flatten, Dense, TimeDistributed, AveragePooling1D, Activation, Dropout, Concatenate
from tensorflow.keras.models import Model, Sequential, load_model
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

import matplotlib
from matplotlib import pyplot
%matplotlib inline
from IPython import display
display.set_matplotlib_formats('svg')

# Residue vocabulary: '_PAD' sits at index 0, followed by the 20 amino-acid
# one-letter codes in a fixed order.
aa_list = ['_PAD'] + list('ARNDCEQGHILKMFPSTWYV')
vocab_size = len(aa_list)
# Two-way lookup between residue symbols and their integer codes.
aa2index = {aa: index for index, aa in enumerate(aa_list)}
index2aa = {index: aa for index, aa in enumerate(aa_list)}

# Binary label for each IEDB qualitative measure: every positive grade is 1.
IEDB_response_code = dict.fromkeys(
    ['Positive', 'Positive-High', 'Positive-Intermediate', 'Positive-Low'], 1)
IEDB_response_code['Negative'] = 0
# Length that encoded peptides are padded to.
MAX_LEN = 15

In [None]:
# define patient, hla_alleles, running mode, T cell tables
# To select the dataset of interest, set `patient` to one of the keys of
# PATIENT_HLA_ALLELES; the matching alleles are looked up from the table, so
# the patient name and its allele list can never get out of sync.
# the default dataset is mel15

tcell_table_file = "tcell_table_export_1665561938.csv"
hla_table_file = "mhc_ligand_full.csv.hla_1.host_human.csv"

mode = 'evaluation' # this mode is for the datasets in the DeepImmun manuscript

# Alleles of every available dataset, keyed by patient identifier.
PATIENT_HLA_ALLELES = {
    "mel15": ["HLA-A*03:01", "HLA-A*68:01", "HLA-B*27:05", "HLA-B*35:03", "HLA-C*02:02", "HLA-C*04:01"],
    "mel0D5P": ["HLA-A*01:01", "HLA-A*23:01", "HLA-B*07:02", "HLA-B*15:01", "HLA-C*12:03", "HLA-C*14:02"],
    "mel51": ["HLA-A*01:01", "HLA-A*02:01", "HLA-B*14:02", "HLA-B*15:01", "HLA-C*03:04", "HLA-C*08:02"],
    "iSPC2_IFN": ["HLA-A*01:01", "HLA-A*02:01", "HLA-B*15:03", "HLA-B*53:01", "HLA-C*02:10", "HLA-C*04:01"],
    "mouseEL4": ["H2-Db", "H2-Kb"],
    "L011": ['HLA-A*11:01', 'HLA-A*24:02', 'HLA-B*35:01', 'HLA-B*49:01'],
    "L012": ['HLA-A*11:01', 'HLA-A*24:02', 'HLA-B*07:02'],
    "L013": ['HLA-A*02:01', 'HLA-A*32:01', 'HLA-B*15:01', 'HLA-B*44:02'],
    "CTE0010": ['HLA-A*02:01', 'HLA-A*31:01', 'HLA-B*15:01', 'HLA-B*35:02', 'HLA-C*03:03', 'HLA-C*04:01'],
    "CTE0015": ['HLA-A*11:01', 'HLA-A*02:14', 'HLA-B*07:02', 'HLA-B*27:05', 'HLA-C*02:02', 'HLA-C*07:02'],
    "1024002": ['HLA-A*68:01', 'HLA-B*40:02', 'HLA-B*40:27', 'HLA-C*03:04'],
    "CU04": ['HLA-A*24:26', 'HLA-A*26:01', 'HLA-B*18:01', 'HLA-B*38:01', 'HLA-C*12:03'],
    "ott1": ['HLA-A*02:01', 'HLA-A*24:02', 'HLA-B*44:02', 'HLA-B*15:01'],
    "ott2": ['HLA-A*01:01', 'HLA-B*38:01', 'HLA-B*56:01'],
    "ott3": ['HLA-A*02:01', 'HLA-A*03:01', 'HLA-B*47:01', 'HLA-B*27:05'],
    "ott4": ['HLA-A*02:01', 'HLA-A*25:01', 'HLA-B*18:01', 'HLA-B*27:02'],
    "ott5": ['HLA-A*66:01', 'HLA-A*23:01', 'HLA-B*41:02', 'HLA-B*35:01'],
    "ott6": ['HLA-A*66:01', 'HLA-A*01:03', 'HLA-B*08:01'],
    "rajasagi1": ['HLA-A*33:01', 'HLA-A*68:12', 'HLA-B*35:01', 'HLA-B*14:01'],
}

patient = "mel15"
# Copy the list so that editing hla_alleles later cannot alter the table.
hla_alleles = list(PATIENT_HLA_ALLELES[patient])

print("tcell_table_file =", tcell_table_file)
print("hla_table_file =", hla_table_file)
print("mode =", mode)
print("patient =", patient)
print("hla_alleles =", hla_alleles)
print()


In [None]:
##########################################################
##########################################################
##########################################################
# The code below is organized in this order:
#   TRAINING, PREDICTION, EVALUATION
# If you only want to do prediction or evaluation,
# skip the TRAINING and jump to PREDICTION or EVALUATION
##########################################################
##########################################################
##########################################################

In [None]:
##########################################################
##########################################################
##########################################################
# TRAINING
##########################################################
##########################################################
##########################################################

In [None]:
# read IEDB tables; select HLA alleles, linear peptides; and prepare training data
print("Read IEDB T cell assay table")
print("tcell_table_file =", tcell_table_file)

def prepare_training_data():
    """Load the IEDB T cell assay export and build labelled peptide pairs.

    Reads the notebook-level ``tcell_table_file``, a CSV whose first two rows
    are joined pairwise into ``"<row 1>:<row 2>"`` column names. Assays are
    keyed by ``'Reference:T Cell ID'`` (a repeated id keeps the last row) and
    kept only when the allele is in ``hla_alleles``, the epitope type is
    ``"Linear peptide"`` and every symbol of the peptide is in ``aa_list``.

    Returns:
        list: ``[peptide, response]`` pairs, ``response`` being the
        ``IEDB_response_code`` value of the assay's qualitative measure.

    Raises:
        KeyError: if a *selected* assay carries a qualitative measure that
            is missing from ``IEDB_response_code``.
    """

    # read tcell assays table
    assay_dict = {}
    # newline='' lets the csv module itself handle line breaks inside quoted fields
    with open(tcell_table_file, 'r', newline='') as input_handle:
        csv_reader = csv.reader(input_handle, delimiter=',')
        header_1 = next(csv_reader)
        header_2 = next(csv_reader)
        header_list = [':'.join([x, y]) for x, y in zip(header_1, header_2)]
        for row in csv_reader:
            assert len(row) == len(header_list)
            assay = dict(zip(header_list, row))
            assay_dict[assay['Reference:T Cell ID']] = assay
    print("len(assay_dict) =", len(assay_dict))

    print("patient =", patient)
    print("hla_alleles =", hla_alleles)
    assay_filtered = []
    allele_freq_dict = {}
    iedb_pair_list = []
    for assay in assay_dict.values():
        allele = assay['MHC:Allele Name']
        epitope_type = assay['Epitope:Object Type']
        peptide = assay['Epitope:Description']
        if not (allele in hla_alleles and epitope_type == "Linear peptide" and all(aa in aa_list for aa in peptide)):
            continue
        # Translate the label only for selected assays, so an unexpected
        # qualitative measure on a row that is discarded anyway cannot abort
        # the whole load.
        response = IEDB_response_code[assay['Assay:Qualitative Measure']]
        assay_filtered.append(assay)
        allele_freq_dict[allele] = allele_freq_dict.get(allele, 0) + 1
        iedb_pair_list.append([peptide, response])

    print("len(assay_filtered) =", len(assay_filtered))
    print("allele_freq_dict =", allele_freq_dict)
    print("len(iedb_pair_list) = ", len(iedb_pair_list))
    print("  positive = ", len([y for x, y in iedb_pair_list if y == 1]))
    print("  negative = ", len([y for x, y in iedb_pair_list if y == 0]))
    # nothing to preview when no assay matched the selected alleles
    if iedb_pair_list:
        print("iedb_pair_list[0] = ", iedb_pair_list[0])
    print()

    return iedb_pair_list

iedb_pair_list = prepare_training_data()


In [None]:
# add self peptides and/or allele peptides; select personalized model; prepare training_peptides
iedb_pair_list_pos = [[x, y] for x, y in iedb_pair_list if y == 1]
print("len(iedb_pair_list_pos) ", len(iedb_pair_list_pos))
iedb_pair_list_neg = [[x, y] for x, y in iedb_pair_list if y == 0]
print("len(iedb_pair_list_neg) ", len(iedb_pair_list_neg))
print()

# Self peptides are only configured for the 'evaluation' mode. Any other mode
# has no self-peptide file, so the name is still defined (as None) and the
# cell falls through to the allele-matched negatives below.
if mode == 'evaluation':
    normal_hla_file = 'self peptides/' + patient + "_normal_hla.txt" # "mel15_normal_hla.txt"
else:
    normal_hla_file = None
print("normal_hla_file =", normal_hla_file)
if normal_hla_file and os.path.isfile(normal_hla_file):
    with open(normal_hla_file, 'r') as input_handle:
        normal_hla = [x.strip() for x in input_handle.readlines()]
    normal_hla_neg = [[x, 0] for x in normal_hla]
else:
    normal_hla_neg = []
print("len(normal_hla_neg) =", len(normal_hla_neg))
print()

print("Read IEDB HLA assay table")
print("hla_table_file =", hla_table_file)
peptides_by_allele = {}
# newline='' lets the csv module itself handle line breaks inside quoted fields
with open(hla_table_file, newline='') as f:
    csv_reader = csv.DictReader(f)
    for row in csv_reader:
        peptide = row['Epitope:Description']
        allele = row['MHC:Allele Name']
        if not (allele in hla_alleles and all(aa in aa_list for aa in peptide)):
            continue
        # Add the peptide as one element. set(peptide) would iterate over the
        # string and store its individual characters, losing the first
        # peptide seen for every allele.
        peptides_by_allele.setdefault(allele, set()).add(peptide)
for allele in peptides_by_allele:
    print(allele, len(peptides_by_allele[allele]))
allele_peptides = [x for l in peptides_by_allele.values() for x in l]
print("len(allele_peptides) =", len(allele_peptides))
allele_peptides_unique = sorted(set(allele_peptides))
print("len(allele_peptides_unique) =", len(allele_peptides_unique))
allele_peptides_neg = [[x, 0] for x in allele_peptides_unique]
print("len(allele_peptides_neg) =", len(allele_peptides_neg))
print()

# use self peptides as negative, if they are available
if normal_hla_neg:
    training_pair_list = iedb_pair_list_pos + normal_hla_neg
    print("training_pair_list = iedb_pair_list_pos + normal_hla_neg")
else: # use allele-matched hla peptides from IEDB as negative
    training_pair_list = iedb_pair_list_pos + allele_peptides_neg
    print("training_pair_list = iedb_pair_list_pos + allele_peptides_neg")
# combine self and allele peptides as negative
# training_pair_list = iedb_pair_list_pos + normal_hla_neg + allele_peptides_neg
# print("training_pair_list = iedb_pair_list_pos + normal_hla_neg + allele_peptides_neg")
print("len(training_pair_list) ", len(training_pair_list))
print()

# reduce training data to peptides rather than assays
print("Reduce training_pair_list to unique peptides")
peptide_set = set([x for x, y in training_pair_list])
peptide_set_pos = set([x for x, y in training_pair_list if y == 1])
peptide_set_neg = set([x for x, y in training_pair_list if y == 0])
peptide_overlap = peptide_set_pos.intersection(peptide_set_neg)
print("len(peptide_set) = ", len(peptide_set))
print("  len(peptide_set_pos) = ", len(peptide_set_pos))
print("  len(peptide_set_neg) = ", len(peptide_set_neg))
print("  len(peptide_overlap) = ", len(peptide_overlap))
# convert the sets to sorted lists so the order is alphabetical rather than the arbitrary order of a set
peptide_set_pos = sorted(peptide_set_pos)
peptide_set_neg = sorted(peptide_set_neg)
# remove overlap peptides from the negative set
training_peptides = [[x, 1] for x in peptide_set_pos] + [[x, 0] for x in peptide_set_neg if x not in peptide_overlap]
print("Remove overlap peptides from the negative set")
print("len(training_peptides) = ", len(training_peptides))
print("  positive = ", len([y for x, y in training_peptides if y == 1]))
print("  negative = ", len([y for x, y in training_peptides if y == 0]))
print()


In [None]:
# exclude the evaluation data from the training data
print("mode =", mode)
if mode == 'evaluation':
    print("len(training_peptides) =", len(training_peptides))
    mutated_test_file = "test_csv/test." + patient + ".csv" # "test.mel15.csv"
    print("mutated_test_file =", mutated_test_file)
    # stays empty when the file name ends in neither 'csv' nor 'txt'
    mutated_test_list = []
    if mutated_test_file[-3:] == 'csv':
        with open(mutated_test_file, 'r', newline='') as csv_handle:
            csv_reader = csv.DictReader(csv_handle)
            mutated_test_list = [row['peptide'] for row in csv_reader]
    elif mutated_test_file[-3:] == 'txt':
        with open(mutated_test_file, 'r') as file:
            mutated_test_list = [x.strip() for x in file.readlines()]
    print("len(mutated_test_list) =", len(mutated_test_list))
    # A set gives constant-time membership tests; scanning the list for every
    # training peptide would be quadratic.
    mutated_test_set = set(mutated_test_list)
    training_overlap_mutated_test = [x for x, y in training_peptides if x in mutated_test_set]
    print("len(training_overlap_mutated_test) =", len(training_overlap_mutated_test))
    print("training_overlap_mutated_test =", training_overlap_mutated_test)
    print("Exclude training_overlap_mutated_test")
    # a training peptide is in the overlap exactly when it is in the test set
    training_peptides = [[x, y] for x, y in training_peptides if x not in mutated_test_set]
    print("len(training_peptides) =", len(training_peptides))
    print("  positive = ", len([y for x, y in training_peptides if y == 1]))
    print("  negative = ", len([y for x, y in training_peptides if y == 0]))
    # nothing to preview if every training peptide was excluded
    if training_peptides:
        print("training_peptides[0] = ", training_peptides[0])
    print()


In [None]:
# filter length_8_14

print("len(training_peptides) = ", len(training_peptides))
print()

print("Filter length_8_14")
# keep only peptides of 8 to 14 residues (both bounds included)
training_peptides = [[peptide, label] for peptide, label in training_peptides
                     if 8 <= len(peptide) <= 14]
positive_count = sum(1 for _, label in training_peptides if label == 1)
negative_count = sum(1 for _, label in training_peptides if label == 0)
print("len(training_peptides) = ", len(training_peptides))
print("  positive = ", positive_count)
print("  negative = ", negative_count)
print("training_peptides[0] = ", training_peptides[0])
print()


In [None]:
# split train/valid/test and prepare tensors
print("len(training_peptides) = ", len(training_peptides))
# Two successive splits with the same fixed seed: 10% is held out as the test
# set, then 10% of the remainder as the validation set.
train_valid_set, test_set = train_test_split(training_peptides, test_size=0.1, random_state=99)
train_set, valid_set = train_test_split(train_valid_set, test_size=0.1, random_state=99)
# Separate the training pairs by label; the training loop samples from these.
train_set_neg = []
train_set_pos = []
for x, y in train_set:
    if y == 0:
        train_set_neg.append([x, y])
    if y == 1:
        train_set_pos.append([x, y])
print("len(train_set) = ", len(train_set))
print("  positive = ", len(train_set_pos))
print("  negative = ", len(train_set_neg))
print("train_set[0] = ", train_set[0])
print("train_set[-1] = ", train_set[-1])
print("len(valid_set) = ", len(valid_set))
print("len(test_set) = ", len(test_set))
print()

def prepare_tensor(training_set):
    """Encode ``[peptide, label]`` pairs as model inputs.

    Each peptide becomes the list of its ``aa2index`` codes, which
    ``pad_sequences`` then pads at the end with 0 to ``MAX_LEN`` columns.

    Returns:
        tuple: ``(x_tensor, y_tensor)`` - the padded int32 code matrix and a
        numpy array of the labels, in the order of ``training_set``.
    """
    encoded_peptides = []
    labels = []
    for peptide, label in training_set:
        encoded_peptides.append([aa2index[aa] for aa in peptide])
        labels.append(label)
    x_tensor = tf.keras.preprocessing.sequence.pad_sequences(
        encoded_peptides, maxlen=MAX_LEN, dtype='int32', padding='post', value=0)
    y_tensor = np.array(labels)
    return x_tensor, y_tensor

# the validation and test tensors are the same for every trained model
x_valid, y_valid = prepare_tensor(valid_set)
x_test, y_test = prepare_tensor(test_set)


In [None]:
# model training 
# This may take a long time, so it is better to start with 10 models instead of the 100 used in the manuscript.
# Each model takes about 1-3 minutes to train, depending on the size of the training data of a patient.

def train_model(x_train, y_train, x_valid, y_valid, model_path, num_epochs):
    """Fit one embedding + bidirectional-LSTM binary classifier.

    The checkpoint callback writes the model to ``model_path`` whenever the
    validation loss improves (``save_best_only=True``). The object returned
    here is the model as it stands after the last epoch.

    Args:
        x_train, y_train: training inputs and labels.
        x_valid, y_valid: validation inputs and labels, monitored each epoch.
        model_path: file the checkpoint callback saves to.
        num_epochs: number of epochs to fit.

    Returns:
        The fitted Keras model.
    """
    # Index 0 is the padding code, so mask_zero=True masks the padded positions.
    network_layers = [
        Embedding(input_dim=vocab_size, output_dim=8, mask_zero=True),
        Bidirectional(LSTM(8, recurrent_initializer='glorot_uniform')),
        Dense(1, kernel_regularizer=regularizers.l2(0.01)),
        # dropout causes fluctuations ??
        # Dropout(0.5),
        # Dense(1),
        Activation('sigmoid'),
    ]
    model = Sequential(network_layers)
    # print(model.summary())
    # alternative metric: metrics=['binary_accuracy']
    auc_metric = tf.keras.metrics.AUC(name='auc')
    model.compile(optimizer='Adam', loss='binary_crossentropy', metrics=[auc_metric])

    model_checkpoint = ModelCheckpoint(model_path, monitor='val_loss', save_best_only=True)
    history = model.fit(
        x_train,
        y_train,
        batch_size=32,
        epochs=num_epochs,
        validation_data=(x_valid, y_valid),
        verbose=0,
        callbacks=[model_checkpoint],
    )

    # Optional diagnostics: loss and AUC curves per epoch.
#     fig, ax = pyplot.subplots(1, 2)
#     ax[0].plot(history.history['loss'])
#     ax[0].plot(history.history['val_loss'])
#     ax[0].set_ylabel('loss')
#     ax[0].set_xlabel('epoch')
#     ax[0].legend(['loss', 'val_loss'], loc='upper left')
#     ax[1].plot(history.history['auc'])
#     ax[1].plot(history.history['val_auc'])
#     ax[1].set_ylabel('auc')
#     ax[1].set_xlabel('epoch')
#     ax[1].legend(['auc', 'val_auc'], loc='upper left')

    return model

# model training
print("model training")
model_name = "models/" + patient + "_copy"
model_number = 10
model_paths = [model_name + "/" + "model" + "_" + patient + "_copy" + "_" + str(m) + ".h5" for m in range(model_number)]
model_score_path = model_name + "/" + "model_score" + ".pkl"
print("len(model_paths) =", len(model_paths))
print("model_paths[0] =", model_paths[0])
print("model_score_path =", model_score_path)
print()
num_epochs = 100
for i, path in enumerate(model_paths):

    # a model file that is already on disk is kept and not retrained
    if os.path.isfile(path):
        continue

    print("path =", path)

    # One seed per model. random.seed() fixes the negative sampling and the
    # shuffle below; tf.random.set_seed() additionally seeds TensorFlow's
    # global generator, which random.seed() alone leaves unseeded.
    random.seed(99 + i)
    tf.random.set_seed(99 + i)

    # The per-model sample gets its own name (train_subset) so that the
    # train_set produced by the split cell is not overwritten.

#     # no sampling
#     train_subset = train_set_neg + train_set_pos

    # downsampling negative samples
    train_set_neg_down = random.sample(train_set_neg, k=min(len(train_set_pos), len(train_set_neg)))
    train_subset = train_set_neg_down + train_set_pos

#     # downsampling positive samples
#     train_set_pos_down = random.sample(train_set_pos, k=len(train_set_neg))
#     train_subset = train_set_pos_down + train_set_neg

    random.shuffle(train_subset)
    x_train, y_train = prepare_tensor(train_subset)

    # model training
    model = train_model(x_train, y_train, x_valid, y_valid, path, num_epochs)


In [None]:
# make predictions on valid set and save the results for model selection
# (skipped when a score file already exists)
if not os.path.isfile(model_score_path):
    print("make predictions on valid set and save the results for model selection")
    x_testing, y_testing = prepare_tensor(valid_set)
#     print("combine valid + test for more accurate model selection")
#     x_testing, y_testing = prepare_tensor(valid_set + test_set)
    print("len(y_testing) =", len(y_testing))
    print()
    # one flattened prediction vector per saved model, in model_paths order
    y_pred_list = [load_model(saved_path).predict(x_testing).flatten()
                   for saved_path in model_paths]
    model_score = [y_testing, y_pred_list]
    with open(model_score_path, 'wb') as score_handle:
        pickle.dump(model_score, score_handle)


In [None]:
##########################################################
##########################################################
##########################################################
# PREDICTION
##########################################################
##########################################################
##########################################################

In [None]:
# prediction functions

def model_selection(model_name):
    """Rank the trained models of ``model_name`` by their stored AUC.

    Loads ``<model_name>/model_score.pkl`` (a ``[y_testing, y_pred_list]``
    pair), computes one ROC AUC per prediction vector and prints the AUC of
    the averaged predictions of the top-k models for several k.

    Args:
        model_name: directory that holds ``model_score.pkl``.

    Returns:
        list: model indices ordered from highest to lowest AUC.
    """

    print("model selection")
    model_score_path = model_name + "/" + "model_score" + ".pkl"
    print("model_score_path =", model_score_path)
    # NOTE: pickle.load can execute arbitrary code - only open score files
    # that were written by this notebook.
    with open(model_score_path, 'rb') as file:
        model_score = pickle.load(file) # y_testing, y_pred_list
    y_testing, y_pred_list = model_score
    print("len(y_pred_list) =", len(y_pred_list))
    print()

    print("AUC of the average of the best models")
    testing_auc_list = [roc_auc_score(y_testing, y_pred) for y_pred in y_pred_list]
#     pyplot.boxplot(testing_auc_list)
    sorted_auc_indices = sorted(range(len(testing_auc_list)), key=lambda k: -testing_auc_list[k])
    # Report only ensemble sizes that exist. With e.g. 10 models, a row
    # labelled "20 best models" would merely repeat the 10-model average
    # under a misleading label.
    model_count = len(testing_auc_list)
    ensemble_sizes = {size for size in (1, 10, 20, 40, 60, 80) if size < model_count}
    if model_count:
        ensemble_sizes.add(model_count)
    for best in sorted(ensemble_sizes):
        y_pred_avg = np.mean([y_pred_list[i] for i in sorted_auc_indices[:best]], axis=0)
        y_pred_avg_auc = roc_auc_score(y_testing, y_pred_avg)
        print(best, "best models", y_pred_avg_auc)
    print()

#     # plot the score distribution of the average of the best models
#     print("plot the score distribution of the average of the best models")
#     best = 10
#     print(best, "best models")
#     y_score =  np.mean([y_pred_list[i] for i in sorted_auc_indices[:best]], axis=0)
#     fig, ax = pyplot.subplots(1, 2)
#     fpr, tpr, _ = roc_curve(y_testing, y_score)
#     auc = roc_auc_score(y_testing, y_score)
#     ax[0].plot(fpr,tpr,label="data 1, auc={:.2f}".format(auc))
#     ax[0].set_ylabel('True Positive Rate')
#     ax[0].set_xlabel('False Positive Rate')
#     ax[0].legend(loc=4)
#     y_score_0 = [b for a, b in zip(y_testing, y_score) if a == 0]
#     y_score_1 = [b for a, b in zip(y_testing, y_score) if a == 1]
#     my_dict = {'neg': y_score_0, 'pos': y_score_1}
#     ax[1].boxplot(my_dict.values())
#     ax[1].set_xticklabels(my_dict.keys())
#     pd_0 = pd.DataFrame({'neg': y_score_0})
#     pd_1 = pd.DataFrame({'pos': y_score_1})
#     pd.concat([pd_0, pd_1], ignore_index=True, axis=1).describe()

    return sorted_auc_indices


def predict(model_list, model_score, input_file, output_file):
    """Score the peptides of ``input_file`` and write them to ``output_file``.

    Every peptide is encoded with ``aa2index``, padded to ``MAX_LEN`` and
    scored by each model; the per-model scores are averaged. The output CSV
    holds the input columns plus ``dpImmun`` (the averaged score),
    ``dpImmun_neg_pct`` / ``dpImmun_pos_pct`` (fraction of the negative /
    positive reference scores that the score reaches or exceeds) and
    ``dpImmun rank`` (``1 - dpImmun_neg_pct``).

    Args:
        model_list: models whose ``predict`` outputs are averaged.
        model_score: pair ``(labels, scores)`` of reference predictions;
            labels 0 and 1 select the negative and positive references.
        input_file: path ending in ``csv`` (needs a ``peptide`` column) or
            ``txt`` (one peptide per line).
        output_file: path of the CSV file to write.

    Raises:
        ValueError: if ``input_file`` ends in neither ``csv`` nor ``txt``.
        KeyError: if a peptide contains a symbol missing from ``aa2index``.
    """
    # newline='' is the csv module's documented way to open CSV files
    if input_file[-3:] == 'csv':
        with open(input_file, 'r', newline='') as input_handle:
            csv_reader = csv.DictReader(input_handle, delimiter=',')
            csv_fieldnames = list(csv_reader.fieldnames or [])
            csv_records = list(csv_reader)
    elif input_file[-3:] == 'txt':
        with open(input_file, 'r') as file:
            csv_fieldnames = ['peptide']
            csv_records = [{'peptide': x.strip()} for x in file.readlines()]
    else:
        # fail with a clear message rather than a NameError further down
        raise ValueError("input_file must end with 'csv' or 'txt': " + str(input_file))
    print("number of input peptides =", len(csv_records))
    print()

    x_peptide = [record['peptide'] for record in csv_records]
    x_tensor = [[aa2index[aa] for aa in peptide] for peptide in x_peptide]
    x_tensor = tf.keras.preprocessing.sequence.pad_sequences(
      x_tensor,
      maxlen=MAX_LEN,
      dtype='int32',
      padding='post',
      value=0)
    y_pred_list = []
    for model in model_list:
        y_pred = model.predict(x_tensor).flatten()
        y_pred_list.append(y_pred)
    y_pred = np.mean(y_pred_list, axis=0)

    # split the reference scores by their true label
    model_y_test, model_y_pred = model_score
    model_y_pred_0 = np.array([b for (a, b) in zip(model_y_test, model_y_pred) if a == 0])
    model_y_pred_1 = np.array([b for (a, b) in zip(model_y_test, model_y_pred) if a == 1])

    def _fraction_reached(score, reference):
        # share of reference scores that `score` reaches or exceeds;
        # nan when there are no reference scores of that class
        if len(reference) == 0:
            return float('nan')
        return np.sum(score >= reference) / len(reference)

    # append the score columns once, even if the input already has them
    score_columns = ['dpImmun', 'dpImmun_neg_pct', 'dpImmun_pos_pct', 'dpImmun rank']
    csv_fieldnames = csv_fieldnames + [c for c in score_columns if c not in csv_fieldnames]
    with open(output_file, 'w', newline='') as output_handle:
        csv_writer = csv.DictWriter(output_handle, csv_fieldnames)
        csv_writer.writeheader()
        for record, pred in zip(csv_records, list(y_pred)):
            neg_pct = _fraction_reached(pred, model_y_pred_0)
            record.update({'dpImmun': pred,
                           'dpImmun_neg_pct': neg_pct,
                           'dpImmun_pos_pct': _fraction_reached(pred, model_y_pred_1),
                           'dpImmun rank': 1 - neg_pct,
                          })
            csv_writer.writerow(record)

            
def patient_prediction(input_file, output_file, model_name, best=10):
    """Predict ``input_file`` with the average of the best models.

    Models are ranked with ``model_selection``; the ``best`` top-ranked ones
    are loaded and handed to ``predict`` together with their averaged stored
    predictions, which serve as the reference scores.

    Model files are looked up as
    ``<model_name>/model_<patient>_copy_<index>.h5``, where ``patient`` is
    the notebook-level variable.

    Args:
        input_file: peptide file passed on to ``predict``.
        output_file: CSV path passed on to ``predict``.
        model_name: directory with the model files and ``model_score.pkl``.
        best: number of top-ranked models to average (default 10, the value
            that used to be hard-coded).
    """

    print("patient prediction")
    print("input_file = ", input_file)
    print("output_file = ", output_file)
    print()

    sorted_auc_indices = model_selection(model_name)
    model_number = len(sorted_auc_indices)
    model_paths = [model_name + "/" + "model" + "_" + patient + "_copy" + "_" + str(m) + ".h5" for m in range(model_number)]
    model_score_path = model_name + "/" + "model_score" + ".pkl"
    print("len(model_paths) =", len(model_paths))
    print("model_paths[0] =", model_paths[0])
    print("model_score_path =", model_score_path)
    print()

    # use average of the best models for predictions
    print(best, "best models")
    print()
    best_indices = sorted_auc_indices[:best]
    model_list = [load_model(model_paths[i]) for i in best_indices]
    # NOTE: pickle.load can execute arbitrary code - only open score files
    # that were written by this notebook.
    with open(model_score_path, 'rb') as file:
        model_score = pickle.load(file) # y_testing, y_pred_list
    y_testing, y_pred_list = model_score
    y_pred_avg = np.mean([y_pred_list[i] for i in best_indices], axis=0)
    predict(model_list, [y_testing, y_pred_avg], input_file, output_file)


input_file = "test_csv/test100." + patient + ".csv"
# The evaluation cell reads "<input_file>.dpimmun_copy.csv", so write the
# predictions under that name; with ".dpimmun.csv" a fresh predict ->
# evaluate run would not find its own output.
output_file = input_file + ".dpimmun_copy.csv"
model_name = "models/" + patient + "_copy"
patient_prediction(input_file, output_file, model_name)



In [None]:
##########################################################
##########################################################
##########################################################
# EVALUATION
##########################################################
##########################################################
##########################################################

In [None]:
# evaluation functions

def evaluate_sub(y_test, y_score):
    """Return the ROC AUC of ``y_score`` against the labels ``y_test``."""
    return roc_auc_score(y_test, y_score)


def evaluate(input_file, tool_name):
    """Read labels and one tool's scores from a CSV file and compute the AUC.

    Each row needs the columns ``peptide``, ``response`` (``"positive"`` or
    ``"negative"``) and ``tool_name``. When ``tool_name`` contains ``"rank"``
    the value is replaced by ``100 - value``.

    Returns:
        tuple: ``(y_test, y_score, auc)`` - label array, score array and the
        result of ``evaluate_sub`` on them.
    """
    label_of = {"positive": 1, "negative": 0}
    is_rank_column = "rank" in tool_name
    labels = []
    scores = []
    with open(input_file, 'r') as input_handle:
        for row in csv.DictReader(input_handle, delimiter=','):
            # the peptide itself is not used, but the column must be present
            _ = row['peptide']
            response = row['response']
            assert response in label_of
            labels.append(label_of[response])
            value = float(row[tool_name])
            scores.append(100 - value if is_rank_column else value)
    y_test = np.array(labels)
    y_score = np.array(scores)
    return y_test, y_score, evaluate_sub(y_test, y_score)


def patient_evaluation(eval_file):
    """Compare the tools scored in ``eval_file`` by ROC curve and AUC.

    For every tool this draws its ROC curve on the current pyplot figure and
    prints ``name;AUC;ranks``, where ``ranks`` are the percentile positions
    of the positive peptides after sorting by descending score.
    """

    print("auc evaluation")
    print("eval_file =", eval_file)
    print()
    # (column in eval_file, display name, curve colour)
    tool_specs = [
        ('dpImmun', 'DeepImmun', 'red'),
        ('PRIME2 %rank', 'PRIME', 'orange'),
        ('NetMHCpan %rank', 'NetMHCpan', 'green'),
        ('IEDB', 'IEDB', 'blue'),
    ]
    tool_results = [evaluate(eval_file, spec[0]) for spec in tool_specs]

    print("pos_ranks")
    pyplot.title('Receiver Operating Characteristic Curves')
    for (labels, scores, auc), (_column, name, color) in zip(tool_results, tool_specs):
        fpr, tpr, _thresholds = roc_curve(labels, scores)
        pyplot.plot(fpr, tpr, color=color, label="{0:s}, AUC={1:0.2f}".format(name, auc))
        # highest score first; the rank is expressed as a percentage of all peptides
        descending = np.argsort(-scores)
        total = len(labels)
        pos_ranks = [100*(rank+1)/total
                     for rank, index in enumerate(descending)
                     if labels[index] == 1]
        pos_ranks = ','.join(['{0:.1f}'.format(x) for x in pos_ranks])
        print(name, '{0:.2f}'.format(auc), pos_ranks, sep=';')
    pyplot.legend(loc = 'lower right')
    pyplot.xlim([0, 1])
    pyplot.ylim([0, 1.05])
    pyplot.ylabel('True Positive Rate')
    pyplot.xlabel('False Positive Rate')
#     pyplot.savefig('test100.18patients.csv.dpimmun_copy.csv.png', dpi=600)
    print()


eval_file = "test_csv/test100." + patient + ".csv.dpimmun_copy.csv"
patient_evaluation(eval_file)

