In [1]:
import numpy
import pandas
import seaborn
import logging
import time
import collections
import os
from os import environ
from matplotlib import pyplot

import sklearn
import sklearn.pipeline
import sklearn.preprocessing
import sklearn.linear_model

%matplotlib inline
logging.basicConfig(level="DEBUG")
pandas.set_option('display.max_columns', 60)

from copy import deepcopy
import shutil
import bz2

import tensorflow as tf

def ppv(y_true, predictions):
    df = pandas.DataFrame({"prediction": predictions, "y_true": y_true})
    return df.sort_values("prediction", ascending=False)[:int(y_true.sum())].y_true.mean()

import logomaker

import traceback
import json
from scipy.stats import pearsonr

import tqdm

from notebook.services.config import ConfigManager
c = ConfigManager()
c.update('notebook', {"CodeCell": {"cm_config": {"autoCloseBrackets": False}}})

import mhcgnomes

import mhc2flurry
from mhc2flurry.downloads import get_path
import mhc2flurry.allele_encoding_pair
import mhc2flurry.allele_encoding
import mhc2flurry.fasta
import mhc2flurry.common
import mhc2flurry.encodable_sequences

from mhcflurry.regression_target import from_ic50, to_ic50

import tensorflow as tf ; print("GPU AVAILABLE" if tf.test.is_gpu_available() else "GPU NOT AVAILABLE")

DEBUG:tensorflow:Falling back to TensorFlow client; we recommended you install the Cloud TPU client directly with pip install cloud-tpu-client.
DEBUG:matplotlib.font_manager:font search path [PosixPath('/hpc/users/odonnt02/.conda/envs/py36b/lib/python3.6/site-packages/matplotlib/mpl-data/fonts/ttf'), PosixPath('/hpc/users/odonnt02/.conda/envs/py36b/lib/python3.6/site-packages/matplotlib/mpl-data/fonts/afm'), PosixPath('/hpc/users/odonnt02/.conda/envs/py36b/lib/python3.6/site-packages/matplotlib/mpl-data/fonts/pdfcorefonts')]
DEBUG:root:Configured MHC2FLURRY_DOWNLOADS_DIR: /hpc/users/odonnt02/.local/share/mhc2flurry/1/0.0.1
DEBUG:root:Configured MHCFLURRY_DOWNLOADS_DIR: /hpc/users/odonnt02/.local/share/mhcflurry/4/2.1.0
Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.


RuntimeError: CUDA runtime implicit initialization on GPU:0 failed. Status: all CUDA-capable devices are busy or unavailable

DEBUG:matplotlib.pyplot:Loaded backend module://ipykernel.pylab.backend_inline version unknown.


In [None]:
!hostname

In [None]:
!ls "$(mhc2flurry-downloads path data_curated)"

In [None]:
!cat "$(mhc2flurry-downloads path data_curated)/DOWNLOAD_INFO.csv"

In [None]:
!mhc2flurry-downloads info

In [None]:
protein_column_converters = {}
for col in ["proteins_human", "proteins_mouse", "proteins_viral"]:
    protein_column_converters[col] = str.split


In [None]:
curated_df = pandas.read_csv(
    get_path("data_curated", "curated_training_data.csv.bz2"),
    converters=protein_column_converters)
curated_df

In [None]:
curated_df.measurement_type.value_counts()
curated_df.measurement_source.value_counts()
curated_df.measurement_kind.value_counts()
curated_df.loc[curated_df.measurement_kind == "mass_spec"].measurement_source.value_counts()


In [None]:
curated_df.loc[curated_df.measurement_source.str.startswith("MS:pmid")].measurement_source.value_counts()

In [None]:
curated_df.loc[curated_df.measurement_kind == "mass_spec"].measurement_inequality.value_counts()

In [None]:
ms_df = pandas.read_csv(
    get_path("data_curated", "ms.by_pmid.csv.bz2"),
    converters=protein_column_converters)
ms_df = ms_df.loc[
    ms_df.mhc_class == "II"
]
ms_df["hla"] = ms_df.hla.str.split().map(tuple)
ms_df["allele_pairs"] = ms_df.hla.map(mhc2flurry.common.make_allele_pairs).map(tuple)
ms_df

In [None]:
# TODO: how to make pairs from e.g. HLA-DPA1*01:03, HLA-DPB1*04:01 ? Need to take all combinations?

In [None]:
ms_df.original_pmid.value_counts()

In [None]:
ms_df.proteins_human.isnull().mean()

In [None]:
usable_ms_df = ms_df.loc[ms_df.proteins_human.str.len() > 0]
len(usable_ms_df)

In [None]:
# Train on monoallelic, validate on multiallelic
train_df = usable_ms_df.loc[usable_ms_df.format == "MONOALLELIC"].copy()
assert (train_df.allele_pairs.str.len() == 1).all()
train_df["allele"] = train_df.allele_pairs.str.get(0)
train_df["parsed_allele"] = train_df.allele.map(lambda s: mhcgnomes.parse(s, infer_class2_pairing=True))

print(len(train_df))
print(train_df.pmid.value_counts())
train_df.allele.value_counts()

In [None]:
validation_df = usable_ms_df.loc[~usable_ms_df.peptide.isin(train_df.peptide)]
print(len(validation_df))
print(validation_df.pmid.value_counts())
validation_df.allele_pairs.value_counts()

In [None]:
proteins_df = mhc2flurry.fasta.read_fasta_to_dataframe(
    get_path("data_proteomes", "human.uniprot.isoforms.fasta.gz")).set_index("sequence_id")
proteins_df

In [None]:
# Need to match lengths of hits and decoys. I do not expect there is important information we want to learn in
# the hit lengths.

import random

def add_decoys(hits_df, protein_to_sequence, protein_column="proteins_human", decoys_per_hit=10):
    """
    protein_to_sequence : dict like, str -> str
        Map from protein names to full amino acid sequences
        
    protein_sequences_df : pandas.DataFrame
        Should have columns: peptide, and the column specified in protein_column.
        All other columns will be copied 
    """
    hits_df = hits_df.loc[hits_df[protein_column].str.len() > 0].copy()
    hits_df["protein"] = hits_df[protein_column].str.get(0) # For now just taking first. Later can use expression info.
    hits_df["hit"] = 1
    hits_df["peptide_length"] = hits_df.peptide.str.len()

    # List of lists. Total number of lists is decoys_per_hit (e.g. 100).
    # The i'th decoy peptide in each list is generated based on the 
    # i'th hit peptide.
    decoy_peptides = [[] for _ in range(decoys_per_hit)]
    
    for protein, peptide_length in tqdm.tqdm(hits_df[["protein", "peptide_length"]].itertuples(index=False), total=len(hits_df)):
        sequence = protein_to_sequence[protein]
        for decoy_set in decoy_peptides:
            start = random.randrange(0, len(sequence) - peptide_length + 1)
            decoy_set.append(sequence[start : start + peptide_length])
        
    decoy_dfs = []
    for i in tqdm.tqdm(range(decoys_per_hit), total=decoys_per_hit):
        df = hits_df.copy()
        df["hit"] = 0
        df["peptide"] = decoy_peptides.pop(0)
        decoy_dfs.append(df)  
        
    result_df = pandas.concat([hits_df] + decoy_dfs, ignore_index=True)
    return result_df


validation_with_decoys_df = add_decoys(
    validation_df,
    proteins_df.sequence.to_dict(),
    protein_column="proteins_human",
    decoys_per_hit=1)
validation_with_decoys_df

In [None]:
train_with_decoys_df = add_decoys(
    train_df,
    proteins_df.sequence.to_dict(),
    protein_column="proteins_human",
    decoys_per_hit=1)
train_with_decoys_df

In [None]:
assert (train_with_decoys_df.allele_pairs.str.len() == 1).all()
train_with_decoys_df.allele.value_counts()


In [None]:
allele_sequences_df = pandas.read_csv(get_path("allele_sequences", "allele_sequences.csv"), index_col=0)
allele_sequences_variant = allele_sequences_df.columns[0]
print("using variant", allele_sequences_variant)
allele_sequences_alpha = allele_sequences_df.loc[allele_sequences_df.kind == "alpha", allele_sequences_variant]
allele_sequences_beta = allele_sequences_df.loc[allele_sequences_df.kind == "beta", allele_sequences_variant]
allele_sequences_alpha, allele_sequences_beta

In [None]:
from mhc2flurry.amino_acid import COMMON_AMINO_ACIDS
COMMON_AMINO_ACIDS

aa_regex = "^[%s]+$" % "".join(sorted(COMMON_AMINO_ACIDS))
aa_regex
train_with_decoys_df.peptide.str.match(aa_regex).mean()

In [None]:
use_train_df = train_with_decoys_df.loc[
    train_with_decoys_df.parsed_allele.map(
        lambda p: isinstance(p, mhcgnomes.Class2Pair))
].copy()
print("Excluding", train_with_decoys_df.loc[~train_with_decoys_df.index.isin(use_train_df.index)].allele.unique())

use_train_df = use_train_df.loc[use_train_df.peptide.str.match(aa_regex)]

use_train_df["alpha_allele"] = train_with_decoys_df.parsed_allele.map(lambda p: p.alpha.to_string())
use_train_df["beta_allele"] = train_with_decoys_df.parsed_allele.map(lambda p: p.beta.to_string())

use_train_df = use_train_df.loc[
    (use_train_df.alpha_allele.isin(allele_sequences_alpha.index)) &
    (use_train_df.beta_allele.isin(allele_sequences_beta.index))
].copy()

use_train_df

In [None]:
show_df = use_train_df.loc[use_train_df.hit == 1].allele.value_counts().sort_index().to_frame()
show_df.index.name = "allele"
show_df.columns = ['peptides']
show_df

In [None]:
show_df.shape, show_df.peptides.sum()

In [None]:
validation_with_decoys_df.pmid.value_counts()

In [None]:
def flatten_multiallelic(df, hla_column="allele_pairs"):
    result = []
    for allele_pairs, sub_df in tqdm.tqdm(df.groupby("allele_pairs"), total=df[hla_column].nunique()):
        sub_df = sub_df.copy()
        sub_df["original_index"] = sub_df.index
            
        for allele in allele_pairs:
            sub_df = sub_df.copy()
            parsed_allele = mhcgnomes.parse(allele, infer_class2_pairing=True)
            #print(allele, parsed_allele)
            sub_df["allele"] = parsed_allele.to_string()
            sub_df["parsed_allele"] = parsed_allele
            sub_df["alpha_allele"] =  parsed_allele.alpha.to_string()
            sub_df["beta_allele"] =  parsed_allele.beta.to_string()
            result.append(sub_df)
    result = pandas.concat(result, ignore_index=True)
    return result

use_validation_df = flatten_multiallelic(validation_with_decoys_df)

use_validation_df = use_validation_df.loc[
    (use_validation_df.alpha_allele.isin(allele_sequences_alpha.index)) &
    (use_validation_df.beta_allele.isin(allele_sequences_beta.index)) &
    (use_validation_df.peptide.str.match(aa_regex))
].copy()

use_validation_df

In [None]:
import mhc2flurry.allele_encoding_pair
import mhc2flurry.allele_encoding

train_allele_encoding_pair = mhc2flurry.allele_encoding_pair.AlleleEncodingPair(
    mhc2flurry.allele_encoding.AlleleEncoding(
        use_train_df.alpha_allele.values,
        allele_to_sequence=allele_sequences_alpha.to_dict()),
    mhc2flurry.allele_encoding.AlleleEncoding(
        use_train_df.beta_allele.values,
        allele_to_sequence=allele_sequences_beta.to_dict()))
train_allele_encoding_pair

In [None]:
validation_allele_encoding_pair = mhc2flurry.allele_encoding_pair.AlleleEncodingPair(
    mhc2flurry.allele_encoding.AlleleEncoding(
        use_validation_df.alpha_allele.values,
        allele_to_sequence=allele_sequences_alpha.to_dict()),
    mhc2flurry.allele_encoding.AlleleEncoding(
        use_validation_df.beta_allele.values,
        allele_to_sequence=allele_sequences_beta.to_dict()))
validation_allele_encoding_pair

In [None]:
train_peptides = mhc2flurry.encodable_sequences.EncodableSequences.create(use_train_df.peptide.values)
validation_peptides = mhc2flurry.encodable_sequences.EncodableSequences.create(use_validation_df.peptide.values)

In [None]:
import imp
import mhc2flurry.condconv
imp.reload(mhc2flurry.condconv)

import mhc2flurry.class2_neural_network
imp.reload(mhc2flurry.class2_neural_network)
import mhc2flurry.class2_neural_network


model = mhc2flurry.class2_neural_network.Class2NeuralNetwork(
    minibatch_size=1024,
    random_negative_rate=0.0,
    random_negative_binder_threshold=2000,
    layer_sizes=[],
    patience=5,
    dense_layer_l1_regularization=0.0,
    peptide_convolutions=[
        {'kernel_size': 9, 'filters': 64, 'activation': "relu"},
        {'kernel_size': 1, 'filters': 16, 'activation': "relu"},
        #{'kernel_size': 16, 'filters': 16, 'activation': "relu"},
    ],
    allele_dense_layer_sizes=[],
    allele_positionwise_embedding_size=32,
    activation="tanh",
    
    
)
print(model.hyperparameters)

#train_peptides = mhc2flurry.encodable_sequences.EncodableSequences.create(use_train_df.peptide.values)
#validation_peptides = mhc2flurry.encodable_sequences.EncodableSequences.create(use_validation_df.peptide.values)
fit_history = []
epoch = 0
def progress_callback(model=model):
    global epoch
    if epoch % 5 == 0:
        start = time.time()
        train_predictions = model.predict(
            train_peptides,
            allele_encoding_pair=train_allele_encoding_pair)
        train_auc = sklearn.metrics.roc_auc_score(use_train_df.hit, train_predictions)
        print("Train AUC [%0.3f sec]: %0.5f" % (time.time() - start, train_auc))

        start = time.time()
        use_validation_df["prediction"] = model.predict(
            validation_peptides,
            allele_encoding_pair=validation_allele_encoding_pair)
        grouped = use_validation_df.groupby("original_index")[["prediction", "hit"]].max()
        validation_auc = sklearn.metrics.roc_auc_score(grouped.hit.values, grouped.prediction.values)
        print("Validation AUC [%0.3f sec]: %0.5f" % (time.time() - start, validation_auc))
        
        validation_max_peptide_std = use_validation_df.groupby("peptide").prediction.std().max()
        print("Validation max peptide std", validation_max_peptide_std)

        fit_history.append((epoch, train_auc, validation_auc, validation_max_peptide_std))
    epoch += 1

model.fit(
    use_train_df.peptide.values,
    affinities=use_train_df.hit.values,
    allele_encoding_pair=train_allele_encoding_pair,
    progress_callback=progress_callback
)

fit_history = pandas.DataFrame(
    fit_history, columns=["epoch", "train_auc", "validation_auc", "validation_max_peptide_std"])
fit_history

In [None]:
seaborn.set_context('talk')
pyplot.figure(figsize=(6,2))
fit_history.set_index("epoch")[["train_auc", "validation_auc"]].rename(columns={"train_auc": "Train", "validation_auc": "Validation"}).plot(kind='line')
seaborn.despine()
pyplot.ylabel("Accuracy (AUC)")
pyplot.xlabel("Training Epoch")
pyplot.axhline(0.9013242464065097, label="Validation, NetMHCIIpan 4.0 EL", ls="--", color="green")
pyplot.legend()
pyplot.ylim(ymin=0.5, ymax=1)

In [None]:
model.network().summary()

In [None]:
fit_history.sort_values("validation_auc")

In [None]:
use_validation_df["prediction"] = model.predict(
    validation_peptides,
    allele_encoding_pair=validation_allele_encoding_pair)
grouped = use_validation_df.groupby("original_index")[["prediction", "hit"]].max()
auc = sklearn.metrics.roc_auc_score(grouped.hit.values, grouped.prediction.values)
print("Validation AUC: %0.5f" % auc)

In [None]:
use_validation_df.columns

In [None]:
write_df = use_validation_df[["pmid", "original_index", "sample_id", "peptide", "hit", "allele", "prediction"]]
write_df.to_csv("validation.csv", index=False)
!ls -lh validation.csv
!bzip2 -f validation.csv
!ls -lh validation.csv.bz2

In [None]:
# Visualization of motifs
peptide_length = 9
all_proteome_peptides = set()
for seq in tqdm.tqdm(proteins_df.sequence.values):
    for i in range(len(seq) - peptide_length):
        all_proteome_peptides.add(seq[i : i + peptide_length])

all_proteome_peptides = pandas.Series(sorted(all_proteome_peptides))
all_proteome_peptides = all_proteome_peptides[all_proteome_peptides.str.match(aa_regex)]
all_proteome_peptides = all_proteome_peptides.sample(frac=0.1)
        
proteome_predictions_df = pandas.DataFrame(index=all_proteome_peptides.values).sample(frac=0.1)
proteome_predictions_df

In [None]:
# Do predictions for motif visualization

alleles_of_interest = list(train_df.allele.unique())
alleles_of_interest

proteome_peptides = mhc2flurry.encodable_sequences.EncodableSequences.create(proteome_predictions_df.index)
for allele in tqdm.tqdm(alleles_of_interest):
    parsed = mhcgnomes.parse(allele)
    
    proteome_predictions_df[allele] = model.predict(
        proteome_peptides,
        allele_encoding_pair=mhc2flurry.allele_encoding_pair.AlleleEncodingPair(
            mhc2flurry.allele_encoding.AlleleEncoding(
                [parsed.alpha.to_string()] * len(proteome_predictions_df),
                allele_to_sequence=allele_sequences_alpha.to_dict()),
            mhc2flurry.allele_encoding.AlleleEncoding(
                [parsed.beta.to_string()] * len(proteome_predictions_df),
                allele_to_sequence=allele_sequences_beta.to_dict())))
proteome_predictions_df    
    
        

In [None]:
plot_df = proteome_predictions_df.sample(frac=0.01)
plot_df.columns = plot_df.columns.str.replace("HLA-", "").str.replace("DRA*01:01-", "", regex=False).str.replace("-", "\n")

seaborn.set_context('talk')
pyplot.figure(figsize=(20,16))
seaborn.pairplot(plot_df, kind='reg')

In [None]:
%time background_counts = logomaker.alignment_to_matrix(proteome_predictions_df.index.to_series().sample(frac=0.1))
background_counts

In [None]:
def make_logo(allele):
    top_peptides = proteome_predictions_df[allele].nlargest(int(len(proteome_predictions_df) * 0.01))
    top_peptide_counts = logomaker.alignment_to_matrix(top_peptides.index.to_series())
    pwm = logomaker.transform_matrix(
        top_peptide_counts,
        background=background_counts,
        from_type='counts',
        to_type='weight')
    pwm.index += 1
    adjusted_pwm = pwm.applymap(lambda value: value if value < 0 else value * 10)
    
    logomaker.Logo(adjusted_pwm)
    pyplot.title(allele)
    seaborn.despine()

for allele in proteome_predictions_df.columns:
    make_logo(allele)

In [None]:
# Next step:
# - Make a sequence logo of learned motifs
# - Look at accuracy on individual alleles

In [None]:
"""
scores_df = []
to_score = validation_df.copy()
for allele, sub_validation_df in validation_df.groupby("hla"):
    to_score["hit"] = 0
    to_score.loc[sub_validation_df.index, "hit"] = 1
    scores_df.append((
        allele,
        sklearn.metrics.roc_auc_score(to_score.hit, -1 * to_score.prediction),
    ))

scores_df = pandas.DataFrame(scores_df, columns=["allele", "auc"])
scores_df = scores_df.sort_values("auc")

seaborn.barplot(data=scores_df, y="allele", x="auc", color='black')
#pyplot.xlim(xmin=0.5)
pyplot.ylabel("Allele")
seaborn.despine()
scores_df
"""