In [None]:
import numpy
import pandas
import seaborn
import logging
import time
import collections
import os
from os import environ
from matplotlib import pyplot

import sklearn
import sklearn.pipeline
import sklearn.preprocessing
import sklearn.linear_model

from six import string_types

import mhcflurry
import mhcnames

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Configure the root logger to emit records at DEBUG level and above.
logging.basicConfig(level="DEBUG")

# Show up to 60 columns when a wide DataFrame is displayed.
pandas.set_option('display.max_columns', 60)

from mhc2flurry.downloads import get_path
from mhcflurry.common import random_peptides
import mhc2flurry

from copy import deepcopy
from mhcflurry.regression_target import from_ic50, to_ic50
import shutil
from Bio import SeqIO
import bz2

import tensorflow as tf
#config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=True)
#session = tf.Session(config=config)
#K.set_session(session)

def ppv(y_true, predictions):
    """
    Positive predictive value among the top-ranked predictions.

    Rows are ranked by `predictions` (highest first) and the top `n` rows are
    kept, where `n` is the number of positives (the sum of `y_true`, truncated
    to an int). The result is the mean of `y_true` over those rows.

    Parameters
    ----------
    y_true : array-like
        Labels, 1/True for a positive and 0/False otherwise. Plain Python
        sequences are accepted as well as numpy arrays and pandas Series.
    predictions : array-like
        Scores, same length as `y_true`; a higher score ranks earlier.

    Returns
    -------
    float
        Fraction of positives among the top `n` rows. NaN when there are no
        positives (the mean of an empty selection).
    """
    df = pandas.DataFrame({"prediction": predictions, "y_true": y_true})
    # Count positives from the DataFrame column instead of calling
    # y_true.sum() directly, so inputs without a .sum() method (lists,
    # tuples) work too.
    num_positives = int(df["y_true"].sum())
    ranked = df.sort_values("prediction", ascending=False)
    return ranked[:num_positives].y_true.mean()

import Bio.SeqIO
import traceback
from gzip import GzipFile
import Bio
import Bio.SeqUtils
from glob import glob
import json
from scipy.stats import pearsonr

import tqdm

# Update the "notebook" front-end config section so that code-cell editors
# (the "cm_config" options) do not auto-close brackets.
# NOTE(review): `notebook.services.config` looks specific to the classic
# Notebook server package; confirm this import still resolves on the Notebook
# version in use, otherwise the whole setup cell fails.
from notebook.services.config import ConfigManager
c = ConfigManager()
c.update('notebook', {"CodeCell": {"cm_config": {"autoCloseBrackets": False}}})

import mhcgnomes

import mhc2flurry.allele_encoding_pair
import mhc2flurry.allele_encoding

In [None]:
# List the directory printed by `mhc2flurry-downloads path data_curated`.
!ls "$(mhc2flurry-downloads path data_curated)"

In [None]:
# Load the curated training data from the "data_curated" download.
train_df = pandas.read_csv(
    get_path("data_curated", "curated_training_data.csv.bz2"),
)
train_df

In [None]:
# Load the mass-spec data from the "data_curated" download and keep only the
# rows whose mhc_class is "II".
train_ms_df = pandas.read_csv(
    get_path("data_curated", "ms.by_pmid.csv.bz2"),
)
train_ms_df = train_ms_df[train_ms_df["mhc_class"] == "II"]
train_ms_df

In [None]:
# Affinity measurements whose peptide does not appear in the class II
# mass-spec data.
is_affinity = train_df["measurement_kind"] == "affinity"
seen_in_ms = train_df["peptide"].isin(train_ms_df["peptide"])
affinity_df = train_df[~seen_in_ms & is_affinity]
affinity_df

In [None]:
def load_allele_sequences(filename):
    """
    Load one file from the "allele_sequences" download as a Series of strings.

    The CSV is read with its first column as the index; the remaining columns
    of each row are concatenated, in column order, into a single string.

    Parameters
    ----------
    filename : str
        File name inside the "allele_sequences" download, e.g. "alpha.csv".

    Returns
    -------
    pandas.Series
        Joined string per row, indexed by the CSV's first column.
    """
    columns_df = pandas.read_csv(
        get_path("allele_sequences", filename), index_col=0)
    # "".join requires every cell to be a string.
    # NOTE(review): assumes the files contain no missing cells -- confirm.
    return pandas.Series(
        ["".join(row) for (_, row) in columns_df.iterrows()],
        index=columns_df.index)


# Both chains go through the same loader instead of two copy-pasted blocks.
allele_sequences_alpha = load_allele_sequences("alpha.csv")
allele_sequences_beta = load_allele_sequences("beta.csv")
allele_sequences_beta

In [None]:
# Inspect how mhcgnomes parses a lone Patr DRB1 allele name when
# infer_class2_pairing is enabled.
mhcgnomes.parse("Patr-DRB1*03:08", infer_class2_pairing=True)

In [None]:
# Same check for the HLA-prefixed DRB1 allele name, for comparison.
mhcgnomes.parse("HLA-DRB1*03:08", infer_class2_pairing=True)

In [None]:
# Training rows whose peptide is absent from the class II mass-spec data.
train_df[~train_df["peptide"].isin(train_ms_df["peptide"])]

In [None]:
def _parse_allele(name):
    """Parse one allele name with mhcgnomes, inferring the class II pairing."""
    return mhcgnomes.parse(name, infer_class2_pairing=True)


# Attach the parsed allele object to every training row.
train_df["parsed_allele"] = train_df["allele"].map(_parse_allele)
train_df

In [None]:
# Keep only the rows whose allele was parsed as a mhcgnomes.Class2Pair, and
# record the string form of each chain.
is_class2_pair = train_df.parsed_allele.map(
    lambda p: isinstance(p, mhcgnomes.Class2Pair))
use_train_df = train_df.loc[is_class2_pair].copy()
use_train_df["alpha_allele"] = use_train_df.parsed_allele.map(lambda p: p.alpha.to_string())
use_train_df["beta_allele"] = use_train_df.parsed_allele.map(lambda p: p.beta.to_string())

# Restrict to affinity measurements for which both chains have a known
# sequence (one combined filter instead of two filter-and-copy passes).
use_train_df = use_train_df.loc[
    (use_train_df.alpha_allele.isin(allele_sequences_alpha.index)) &
    (use_train_df.beta_allele.isin(allele_sequences_beta.index)) &
    (use_train_df.measurement_kind == "affinity")
].copy()

# Replace the raw allele name with the parsed object's string form. The
# parsed_allele column already holds parse(allele, infer_class2_pairing=True)
# for each row, so reuse it rather than parsing every row a second time.
use_train_df["allele"] = use_train_df.parsed_allele.map(lambda p: p.to_string())
use_train_df

In [None]:
# Number of training rows per allele.
use_train_df["allele"].value_counts()

In [None]:
# Validation set: MONOALLELIC mass-spec rows whose peptide is not among the
# training peptides.
is_monoallelic = train_ms_df["format"] == "MONOALLELIC"
in_training = train_ms_df["peptide"].isin(use_train_df["peptide"])
validation_df = train_ms_df[is_monoallelic & ~in_training].copy()
validation_df["parsed_allele"] = validation_df["hla"].map(
    lambda name: mhcgnomes.parse(name, infer_class2_pairing=True))

# Keep only rows parsed as a mhcgnomes.Class2Pair and record the string form
# of each chain.
parsed_as_pair = validation_df["parsed_allele"].map(
    lambda parsed: isinstance(parsed, mhcgnomes.Class2Pair))
validation_df = validation_df[parsed_as_pair].copy()
validation_df["alpha_allele"] = validation_df["parsed_allele"].map(
    lambda parsed: parsed.alpha.to_string())
validation_df["beta_allele"] = validation_df["parsed_allele"].map(
    lambda parsed: parsed.beta.to_string())

validation_df

In [None]:
# Number of validation rows per hla value.
validation_df["hla"].value_counts()

In [None]:
# Compare how many validation rows and training rows exist for one allele.
allele = "HLA-DRA*01:01-DRB1*01:01"
(
    validation_df[validation_df["hla"] == allele].shape,
    use_train_df[use_train_df["allele"] == allele].shape,
)

In [None]:
# Show the alpha-chain Series as a plain dict (index label -> joined sequence).
allele_sequences_alpha.to_dict()

In [None]:
import mhc2flurry.allele_encoding
import mhc2flurry.allele_encoding_pair

# Encode each chain of the training rows against its allele -> sequence map,
# then bundle the two encodings into a pair.
alpha_encoding = mhc2flurry.allele_encoding.AlleleEncoding(
    use_train_df["alpha_allele"].values,
    allele_to_sequence=allele_sequences_alpha.to_dict(),
)
beta_encoding = mhc2flurry.allele_encoding.AlleleEncoding(
    use_train_df["beta_allele"].values,
    allele_to_sequence=allele_sequences_beta.to_dict(),
)
allele_encoding_pair = mhc2flurry.allele_encoding_pair.AlleleEncodingPair(
    alpha_encoding,
    beta_encoding,
)
allele_encoding_pair

In [None]:
# Reload the model modules so that edits made on disk take effect without
# restarting the kernel.
# importlib.reload replaces imp.reload: the `imp` module has been deprecated
# since Python 3.4 and was removed in Python 3.12.
import importlib

import mhc2flurry.condconv
importlib.reload(mhc2flurry.condconv)

# condconv is reloaded first, so class2_neural_network is re-executed after it.
import mhc2flurry.class2_neural_network
importlib.reload(mhc2flurry.class2_neural_network)


# Hyperparameters for this run, collected in one place so they are easy to
# tweak; they are passed to the constructor as keyword arguments.
network_hyperparameters = {
    "random_negative_rate": 1.0,
    "layer_sizes": [8],
    "patience": 5,
    "peptide_convolutions": [
        {'kernel_size': 9, 'filters': 64, 'activation': "relu"},
        {'kernel_size': 1, 'filters': 16, 'activation': "relu"},
        {'kernel_size': 16, 'filters': 16, 'activation': "relu"},
    ],
}
model = mhc2flurry.class2_neural_network.Class2NeuralNetwork(
    **network_hyperparameters)
print(model.hyperparameters)

# Fit on the training peptides, their measured values and inequalities,
# using the alpha/beta allele encodings built for the same rows.
model.fit(
    use_train_df["peptide"].values,
    affinities=use_train_df["measurement_value"].values,
    inequalities=use_train_df["measurement_inequality"].values,
    allele_encoding_pair=allele_encoding_pair,
)

In [None]:
# Encode each chain of the validation rows against the alpha / beta
# allele -> sequence maps.
# NOTE(review): the training rows were restricted to alleles present in
# allele_sequences_alpha / allele_sequences_beta, but the validation rows were
# not -- confirm AlleleEncoding copes with alleles missing from the map.
validation_alpha_encoding = mhc2flurry.allele_encoding.AlleleEncoding(
    validation_df["alpha_allele"].values,
    allele_to_sequence=allele_sequences_alpha.to_dict(),
)
validation_beta_encoding = mhc2flurry.allele_encoding.AlleleEncoding(
    validation_df["beta_allele"].values,
    allele_to_sequence=allele_sequences_beta.to_dict(),
)
validation_allele_encoding_pair = mhc2flurry.allele_encoding_pair.AlleleEncodingPair(
    validation_alpha_encoding,
    validation_beta_encoding,
)

# Predict every validation peptide against its own row's allele pair.
validation_df["prediction"] = model.predict(
    validation_df["peptide"].values,
    allele_encoding_pair=validation_allele_encoding_pair,
)
validation_df

In [None]:
# sklearn.metrics is a submodule: import it explicitly rather than relying on
# some other sklearn import having loaded it as a side effect.
import sklearn.metrics

# One AUC per hla group: that group's rows are the positives (hit = 1) and
# every other validation row is a negative. Predictions are negated before
# scoring, so a lower prediction ranks higher.
# NOTE(review): each row's prediction was made against its own allele pair,
# so this compares predictions across alleles -- confirm that is the intended
# metric. Also presumes lower predictions mean stronger binders -- confirm.
auc_records = []
to_score = validation_df.copy()
for allele, sub_validation_df in validation_df.groupby("hla"):
    # Reset the labels, then mark the current group's rows as hits.
    to_score["hit"] = 0
    to_score.loc[sub_validation_df.index, "hit"] = 1
    auc_records.append((
        allele,
        sklearn.metrics.roc_auc_score(to_score.hit, -1 * to_score.prediction),
    ))

# Build the results table from the collected records (kept under a separate
# name so scores_df is only ever a DataFrame), ordered by ascending AUC.
scores_df = pandas.DataFrame(auc_records, columns=["allele", "auc"])
scores_df = scores_df.sort_values("auc")

seaborn.barplot(data=scores_df, y="allele", x="auc", color='black')
#pyplot.xlim(xmin=0.5)
pyplot.ylabel("Allele")
seaborn.despine()
scores_df

In [None]:
#!pip install --upgrade tensorflow