In [1]:
import numpy
import pandas
import seaborn
import logging
import time
import collections
import os
from os import environ
from matplotlib import pyplot

import sklearn
import sklearn.pipeline
import sklearn.preprocessing
import sklearn.linear_model

from six import string_types

%matplotlib inline
logging.basicConfig(level="DEBUG")

pandas.set_option('display.max_columns', 60)

from mhc2flurry.downloads import get_path
import mhc2flurry

from copy import deepcopy
import shutil
from Bio import SeqIO
import bz2

import tensorflow as tf
#config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=True)
#session = tf.Session(config=config)
#K.set_session(session)

def ppv(y_true, predictions):
    """
    Positive predictive value among the top-ranked predictions.

    Items are ranked by descending prediction and the mean of y_true is
    taken over the top n of them, where n is the number of positives
    (the sum of y_true).

    Parameters
    ----------
    y_true : array-like of 0/1 (or bool)
        True labels. May be a list, numpy array or pandas Series.

    predictions : array-like of float
        Predicted scores, one per entry of y_true; higher means more
        likely positive.

    Returns
    -------
    float
        Fraction of true positives among the n highest predictions. NaN
        when y_true contains no positives (mean over an empty selection).
    """
    df = pandas.DataFrame({"prediction": predictions, "y_true": y_true})
    # numpy.sum (rather than y_true.sum()) so plain Python lists work too.
    num_positives = int(numpy.sum(y_true))
    ranked = df.sort_values("prediction", ascending=False)
    return ranked[:num_positives].y_true.mean()

import Bio.SeqIO
import traceback
from gzip import GzipFile
import Bio
import Bio.SeqUtils
from glob import glob
import json
from scipy.stats import pearsonr

import tqdm

from notebook.services.config import ConfigManager
c = ConfigManager()
c.update('notebook', {"CodeCell": {"cm_config": {"autoCloseBrackets": False}}})

import mhcgnomes

import mhc2flurry.allele_encoding_pair
import mhc2flurry.allele_encoding
import mhc2flurry.fasta


from mhcflurry.regression_target import from_ic50, to_ic50

DEBUG:root:Configured MHC2FLURRY_DOWNLOADS_DIR: /home/odonnt02/.local/share/mhc2flurry/1/0.0.1
DEBUG:tensorflow:Falling back to TensorFlow client; we recommended you install the Cloud TPU client directly with pip install cloud-tpu-client.
INFO:numexpr.utils:Note: NumExpr detected 24 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
INFO:numexpr.utils:NumExpr defaulting to 8 threads.
DEBUG:root:Configured MHCFLURRY_DOWNLOADS_DIR: /home/odonnt02/.local/share/mhcflurry/4/2.0.0


In [2]:
!ls "$(mhc2flurry-downloads path data_curated)"

annotate_proteins.py				DOWNLOAD_INFO.csv
curated_training_data.affinity.csv.bz2		expression
curated_training_data.csv.bz2			GENERATE.sh
curated_training_data.mass_spec.csv.bz2		LOG.txt.bz2
curated_training_data.no_additional_ms.csv.bz2	ms.by_pmid.csv.bz2
curate_ms_by_pmid.py				rna_expression.csv.bz2
curate.py					rna_expression.metadata.csv.bz2


In [3]:
!cat "$(mhc2flurry-downloads path data_curated)/DOWNLOAD_INFO.csv"

url
https://github.com/openvax/mhc2flurry/releases/download/pre-0.0.1/data_curated.20210305.tar.bz2


In [4]:
!mhc2flurry-downloads info

Environment variables
  MHC2FLURRY_DATA_DIR                 [unset or empty]
  MHC2FLURRY_DOWNLOADS_CURRENT_RELEASE [unset or empty]
  MHC2FLURRY_DOWNLOADS_DIR            [unset or empty]
  MHC2FLURRY_DEFAULT_MODELS_DIR       [unset or empty]
  MHC2FLURRY_DOWNLOADS_GITHUB_AUTH_TOKEN [unset or empty]

Configuration
  current release                     = 0.0.1                
  downloads dir                       = /home/odonnt02/.local/share/mhc2flurry/1/0.0.1 [exists]

DOWNLOAD NAME                             DOWNLOADED?   UP TO DATE?   URL                  
allele_sequences                          YES           YES           https://github.com/openvax/mhc2flurry/releases/download/pre-0.0.1/allele_sequences.20210222.tar.bz2 
data_curated                              YES           YES           https://github.com/openvax/mhc2flurry/releases/download/pre-0.0.1/data_curated.20210305.tar.bz2 
data_published                            YES           YES           https://git

In [31]:
# The protein columns are stored as whitespace-separated identifiers; have
# pandas split each cell into a list of identifiers while reading the CSV.
protein_column_converters = dict.fromkeys(
    ["proteins_human", "proteins_mouse", "proteins_viral"], str.split)


In [32]:
curated_df = pandas.read_csv(
    get_path("data_curated", "curated_training_data.csv.bz2"),
    converters=protein_column_converters)
curated_df

Unnamed: 0,allele,peptide,measurement_value,measurement_inequality,measurement_type,measurement_kind,measurement_source,original_allele,proteins_human,proteins_mouse,proteins_viral
0,BoLA-DRB3*001:01,AYAAQGYKVLVLNPSVAA,1541.0,=,quantitative,affinity,Walker - purified MHC/competitive/radioactivity,BoLA-DRB3*001:01,[],[],"[sp|O92972|POLG_HCVJ4, sp|O92532|POLG_HCVVP, s..."
1,BoLA-DRB3*001:01,CGKYLFNWAVRTKLKLTPIA,8776.0,=,quantitative,affinity,Walker - purified MHC/competitive/radioactivity,BoLA-DRB3*001:01,[],[],"[sp|Q03463|POLG_HCVJ1, sp|P26664|POLG_HCV1]"
2,BoLA-DRB3*001:01,ENLPYLVAYQATVCARAQAP,36805.0,=,quantitative,affinity,Walker - purified MHC/competitive/radioactivity,BoLA-DRB3*001:01,[],[],[sp|P26664|POLG_HCV1]
3,BoLA-DRB3*001:01,GIQYLAGLSTLPGNPAIASL,100000.0,>,quantitative,affinity,Walker - purified MHC/competitive/radioactivity,BoLA-DRB3*001:01,[],[],"[sp|O92972|POLG_HCVJ4, sp|O92532|POLG_HCVVP, s..."
4,BoLA-DRB3*001:01,KGGRKPARLIVFPDLGVRVC,3336.0,=,quantitative,affinity,Walker - purified MHC/competitive/radioactivity,BoLA-DRB3*001:01,[],[],"[sp|O92972|POLG_HCVJ4, sp|Q9WMX2|POLG_HCVCO, s..."
...,...,...,...,...,...,...,...,...,...,...,...
311559,SLA-DRB1*10:01,THVLTNLPL,100.0,<,qualitative,affinity,Meng - cellular MHC/direct/fluorescence,SLA-DRB1*10:01,[],[],[]
311560,SLA-DRB1*10:01,VSHLPVFFSHLFKSDSGYS,100.0,<,qualitative,affinity,Meng - cellular MHC/direct/fluorescence,SLA-DRB1*10:01,[],[],[]
311561,SLA-DRB1*10:01,WGFAAFTLF,100.0,<,qualitative,affinity,Meng - cellular MHC/direct/fluorescence,SLA-DRB1*10:01,[],[],[]
311562,SLA-DRB1*10:01,YEASPLAPL,100.0,<,qualitative,affinity,Meng - cellular MHC/direct/fluorescence,SLA-DRB1*10:01,[],[],[]


In [33]:
curated_df.measurement_type.value_counts()
curated_df.measurement_source.value_counts()
curated_df.measurement_kind.value_counts()
curated_df.loc[curated_df.measurement_kind == "mass_spec"].measurement_source.value_counts()


Falkenburg - cellular MHC/mass spectrometry             81031
Neidert - cellular MHC/mass spectrometry                45387
MS:pmid:31495665                                        43332
Sollid - cellular MHC/mass spectrometry                 18926
Alizadeh - cellular MHC/mass spectrometry                4058
                                                        ...  
Sollid - secreted MHC/mass spectrometry                     4
Urbaniak - secreted MHC/mass spectrometry                   4
LeibundGut-Landmann - secreted MHC/mass spectrometry        2
Poland - secreted MHC/mass spectrometry                     2
Purcell - cellular MHC/mass spectrometry                    1
Name: measurement_source, Length: 61, dtype: int64

In [34]:
curated_df.loc[curated_df.measurement_source.str.startswith("MS:pmid")].measurement_source.value_counts()

MS:pmid:31495665    43332
Name: measurement_source, dtype: int64

In [8]:
curated_df.loc[curated_df.measurement_kind == "mass_spec"].measurement_inequality.value_counts()

<    211651
Name: measurement_inequality, dtype: int64

In [36]:
# Load the per-publication mass-spec table and keep only the MHC class II rows.
ms_df = pandas.read_csv(
    get_path("data_curated", "ms.by_pmid.csv.bz2"),
    converters=protein_column_converters,
).loc[lambda df: df.mhc_class == "II"]
ms_df

Unnamed: 0,pmid,sample_id,peptide,format,mhc_class,hla,expression_dataset,cell_line,original_pmid,pulldown_antibody,sample_type,proteins_human,proteins_mouse,proteins_viral
0,29314611,MAVER-1_DR,TPEQWKSHKSYSCQVTHEGSTVEK,DR-SPECIFIC,II,HLA-DRB1*01:01 HLA-DRB1*13:01 HLA-DRB3*02:02 H...,sample_type:B-CELL,MAVER-1,29314611,L243 (HLA-DR),B-CELL,[],[],[]
1,29314611,MAVER-1_DR,KPISKAVIVLNEGIKVQTKE,DR-SPECIFIC,II,HLA-DRB1*01:01 HLA-DRB1*13:01 HLA-DRB3*02:02 H...,sample_type:B-CELL,MAVER-1,29314611,L243 (HLA-DR),B-CELL,[sp|O75976-2|CBPD-2_HUMAN],[],[]
2,29314611,MAVER-1_DR,DPSAVAKHFVALSTNTTKVKE,DR-SPECIFIC,II,HLA-DRB1*01:01 HLA-DRB1*13:01 HLA-DRB3*02:02 H...,sample_type:B-CELL,MAVER-1,29314611,L243 (HLA-DR),B-CELL,"[tr|A0A2R8Y6C7|A0A2R8Y6C7_HUMAN, sp|P06744-2|G...",[],[]
3,29314611,MAVER-1_DR,VSKALHKAALTIDEKGTEAVGSTFLE,DR-SPECIFIC,II,HLA-DRB1*01:01 HLA-DRB1*13:01 HLA-DRB3*02:02 H...,sample_type:B-CELL,MAVER-1,29314611,L243 (HLA-DR),B-CELL,[],[],[]
4,29314611,MAVER-1_DR,TPEQWKSHKSYSCQVTHEGSTVEKTVAPTE,DR-SPECIFIC,II,HLA-DRB1*01:01 HLA-DRB1*13:01 HLA-DRB3*02:02 H...,sample_type:B-CELL,MAVER-1,29314611,L243 (HLA-DR),B-CELL,[],[],[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
337967,31611696,3912_BAM-HLA-DR-DEPLETED,YYYIQQDTKGDYQKA,DR-DEPLETED,II,HLA-DRB1*03:01 HLA-DRB1*04:01 HLA-DRB3*01:01 H...,sample_type:MENINGIOMA,,31611696,,MENINGIOMA,[sp|P07355-2|ANXA2-2_HUMAN],[tr|B0V2N5|B0V2N5_MOUSE],[]
337968,31611696,4052_BA-HLA-DR,YYYIQQDTKGDYQKA,DR-SPECIFIC,II,HLA-DRB1*03:01 HLA-DRB1*11:04 HLA-DRB3*01:01 H...,sample_type:MENINGIOMA,,31611696,HB298,MENINGIOMA,[sp|P07355-2|ANXA2-2_HUMAN],[tr|B0V2N5|B0V2N5_MOUSE],[]
337969,31611696,4052_BA-HLA-DR-DEPLETED,YYYIQQDTKGDYQKA,DR-DEPLETED,II,HLA-DRB1*03:01 HLA-DRB1*11:04 HLA-DRB3*01:01 H...,sample_type:MENINGIOMA,,31611696,,MENINGIOMA,[sp|P07355-2|ANXA2-2_HUMAN],[tr|B0V2N5|B0V2N5_MOUSE],[]
337970,31611696,3912_BAM-HLA-DR,YYYIQQDTKGDYQKAL,DR-SPECIFIC,II,HLA-DRB1*03:01 HLA-DRB1*04:01 HLA-DRB3*01:01 H...,sample_type:MENINGIOMA,,31611696,HB298,MENINGIOMA,[sp|P07355-2|ANXA2-2_HUMAN],[tr|B0V2N5|B0V2N5_MOUSE],[]


In [37]:
ms_df.original_pmid.value_counts()

31611696    243301
31495665     66607
29314611     28064
Name: original_pmid, dtype: int64

In [38]:
ms_df.proteins_human.isnull().mean()

0.0

In [40]:
usable_ms_df = ms_df.loc[ms_df.proteins_human.str.len() > 0]
len(usable_ms_df)

256434

In [41]:
usable_ms_df.groupby("pmid").hla.nunique()

pmid
29314611     2
31495665    19
31611696    36
Name: hla, dtype: int64

In [42]:
# Train on monoallelic, validate on multiallelic
train_df = usable_ms_df.loc[usable_ms_df.format == "MONOALLELIC"]
print(len(train_df))
print(train_df.pmid.value_counts())
train_df.hla.value_counts()

33806
31495665    33806
Name: pmid, dtype: int64


HLA-DRA*01:01-DRB1*11:01     7085
HLA-DRA*01:01-DRB1*01:01     6036
HLA-DRA*01:01-DRB1*07:01     4329
HLA-DPA1*01:03-DPB1*06:01    4319
HLA-DRA*01:01-DRB1*15:01     2677
HLA-DRA*01:01-DRB1*03:01     1948
HLA-DRA*01:01-DRB1*04:01     1925
HLA-DRA*01:01-DRB3*01:01     1886
HLA-DRA*01:01-DRB1*12:01     1842
HLA-DQA1*01:02-DQB1*06:04    1759
Name: hla, dtype: int64

In [43]:
validation_df = usable_ms_df.loc[~usable_ms_df.peptide.isin(train_df.peptide)]
print(len(validation_df))
print(validation_df.pmid.value_counts())
validation_df.hla.value_counts()

192862
31611696    165262
29314611     14891
31495665     12709
Name: pmid, dtype: int64


HLA-DRB1*11:01 HLA-DRB3*02:02 HLA-DPA1*01:03 HLA-DPB1*04:01 HLA-DPB1*04:02 HLA-DQA1*05:05 HLA-DQB1*03:01                                                                               17213
HLA-DRB1*04:01 HLA-DRB1*08:01 HLA-DRB4*01:03 HLA-DPA1*01:03 HLA-DPB1*04:01 HLA-DPB1*04:02 HLA-DQA1*03:03 HLA-DQA1*04:01 HLA-DQB1*03:01 HLA-DQB1*04:02                                  13234
HLA-DRB1*07:01 HLA-DRB1*16:01 HLA-DRB4*01:03 HLA-DRB5*02:02 HLA-DPA1*01:03 HLA-DPB1*02:01 HLA-DPB1*23:01 HLA-DQA1*01:02 HLA-DQA1*02:01 HLA-DQB1*02:02 HLA-DQB1*05:02                   10605
HLA-DRB1*01:01 HLA-DRB1*13:01 HLA-DRB3*02:02 HLA-DQA1*01:01-DQB1*05:01 HLA-DQA1*01:03-DQB1*05:01 HLA-DQA1*01:01-DQB1*06:03 HLA-DQA1*01:03-DQB1*06:03                                   10316
HLA-DRB1*04:04 HLA-DRB1*11:01 HLA-DRB3*02:02 HLA-DRB4*01:03 HLA-DPA1*01:03 HLA-DPB1*02:01 HLA-DPB1*06:01 HLA-DQA1*03:01 HLA-DQA1*05:05 HLA-DQB1*03:01 HLA-DQB1*03:02                    8322
HLA-DRB1*11:01 HLA-DRB1*04:05 HLA-DRB3*02:02 HLA-DRB4*0

In [44]:
proteins_df = mhc2flurry.fasta.read_fasta_to_dataframe(
    get_path("data_proteomes", "human.uniprot.isoforms.fasta.gz")).set_index("sequence_id")
proteins_df

Unnamed: 0_level_0,sequence
sequence_id,Unnamed: 1_level_1
tr|A0A3F2YPB8|A0A3F2YPB8_HUMAN,MAATDLSYGLYRDPVCLQEKTEVERVVADCLTNCYQDSVTFDDLAV...
tr|A6NDF3|A6NDF3_HUMAN,MAATSGTDEPVSGELVSVAHALSLPAESYGNDPDIEMAWAMRAMQH...
tr|A6NDI8|A6NDI8_HUMAN,MTSTGQDSTTTRQRRSRQNPQSPPQDSSVTSKRNIKKGAVPRSIPN...
tr|A6NDJ3|A6NDJ3_HUMAN,MRPEGSLTYWVPERLRQGFCGVGRAAQALVELEPVNAQARKAFSRQ...
tr|A8MVT4|A8MVT4_HUMAN,MARNVVYPLYRLGGPQLRVFRTNFFIQLVRPGVAQPEDTVQFRIPM...
...,...
tr|U3KQA8|U3KQA8_HUMAN,MGTSSIFLCVLFLCGALGLTMSPARGRLRCYICGFTKPCHPVPTEC...
tr|U3KQK5|U3KQK5_HUMAN,MSVLTPLLLRGLTGSARRLPVPRAKIHSLPPEGKLGIMGKENGIGS...
tr|V9GXZ4|V9GXZ4_HUMAN,MRAVRRGLREGGAMAAARDPPEVSLREATQRKLRRFSELRGKLVAR...
tr|V9GYQ6|V9GYQ6_HUMAN,MIIMVIIFLVLLFWENEVNDEAVMSTLEHLHVDYPQNDVPVPARRD...


In [57]:
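# Decoys are length-matched to hits: each decoy has the same length as the hit it is
# paired with. We do not expect the hit lengths to carry information we want the
# model to learn, so length should not be able to separate hits from decoys.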

import random

def add_decoys(
        hits_df,
        protein_to_sequence,
        protein_column="proteins_human",
        decoys_per_hit=10,
        seed=None):
    """
    Augment a table of hit peptides with length-matched decoy peptides.

    For every hit, `decoys_per_hit` decoys are sampled uniformly at random
    from the sequence of the first protein listed for that hit. Each decoy
    has the same length as its hit. No attempt is made to avoid decoys that
    overlap (or are identical to) the hit itself.

    Parameters
    ----------
    hits_df : pandas.DataFrame
        Should have columns: peptide, and the column specified in
        protein_column. All other columns will be copied to the decoy rows.
        Rows whose protein_column entry is empty are dropped.

    protein_to_sequence : dict like, str -> str
        Map from protein names to full amino acid sequences

    protein_column : str
        Column of hits_df holding, for each row, a list of protein names.

    decoys_per_hit : int
        Number of decoy rows to generate for each hit row.

    seed : int, optional
        If given, decoys are sampled from a private random.Random(seed), so
        the result is reproducible. If None (default), the global `random`
        module state is used.

    Returns
    -------
    pandas.DataFrame
        The retained hit rows (hit=1) followed by decoys_per_hit blocks of
        decoy rows (hit=0), each block in the same order as the hits, with a
        fresh integer index. Adds the columns: protein, hit, peptide_length.

    Raises
    ------
    KeyError
        If a hit's first protein is missing from protein_to_sequence.
    ValueError
        If a protein sequence is shorter than the hit peptide, so no decoy
        of matching length can be drawn from it.
    """
    hits_df = hits_df.loc[hits_df[protein_column].str.len() > 0].copy()
    # For now just taking first. Later can use expression info.
    hits_df["protein"] = hits_df[protein_column].str.get(0)
    hits_df["hit"] = 1
    hits_df["peptide_length"] = hits_df.peptide.str.len()

    # The `random` module and a random.Random instance both expose randrange.
    rng = random if seed is None else random.Random(seed)

    # List of lists. Total number of lists is decoys_per_hit (e.g. 100).
    # The i'th decoy peptide in each list is generated based on the
    # i'th hit peptide.
    decoy_peptides = [[] for _ in range(decoys_per_hit)]

    hit_iterator = tqdm.tqdm(
        hits_df[["protein", "peptide_length"]].itertuples(index=False),
        total=len(hits_df))
    for protein, peptide_length in hit_iterator:
        sequence = protein_to_sequence[protein]
        num_starts = len(sequence) - peptide_length + 1
        if num_starts < 1:
            raise ValueError(
                "Protein %s (length %d) is shorter than hit peptide length %d"
                % (protein, len(sequence), peptide_length))
        for decoy_set in decoy_peptides:
            start = rng.randrange(num_starts)
            decoy_set.append(sequence[start : start + peptide_length])

    decoy_dfs = []
    for _ in tqdm.tqdm(range(decoys_per_hit), total=decoys_per_hit):
        df = hits_df.copy()
        df["hit"] = 0
        # pop() drops our reference to each list once it has been consumed.
        df["peptide"] = decoy_peptides.pop(0)
        decoy_dfs.append(df)

    result_df = pandas.concat([hits_df] + decoy_dfs, ignore_index=True)
    return result_df


validation_with_decoys_df = add_decoys(
    validation_df,
    proteins_df.sequence.to_dict(),
    protein_column="proteins_human",
    decoys_per_hit=100)
validation_with_decoys_df





  0%|          | 0/192862 [00:00<?, ?it/s][A[A[A[A



  0%|          | 537/192862 [00:00<00:35, 5366.01it/s][A[A[A[A



  1%|          | 1057/192862 [00:00<00:36, 5314.05it/s][A[A[A[A



  1%|          | 1575/192862 [00:00<00:36, 5272.29it/s][A[A[A[A



  1%|          | 2074/192862 [00:00<00:36, 5182.41it/s][A[A[A[A



  1%|▏         | 2597/192862 [00:00<00:36, 5195.45it/s][A[A[A[A



  2%|▏         | 3087/192862 [00:00<00:37, 5101.42it/s][A[A[A[A



  2%|▏         | 3636/192862 [00:00<00:36, 5210.28it/s][A[A[A[A



  2%|▏         | 4185/192862 [00:00<00:35, 5288.55it/s][A[A[A[A



  2%|▏         | 4724/192862 [00:00<00:35, 5317.77it/s][A[A[A[A



  3%|▎         | 5235/192862 [00:01<00:36, 5131.84it/s][A[A[A[A



  3%|▎         | 5735/192862 [00:01<00:37, 4990.52it/s][A[A[A[A



  3%|▎         | 6243/192862 [00:01<00:37, 4957.91it/s][A[A[A[A



  4%|▎         | 6777/192862 [00:01<00:36, 5064.01it/s][A[A[A[A



  4%|▍         |

 31%|███▏      | 60440/192862 [00:11<00:26, 5087.95it/s][A[A[A[A



 32%|███▏      | 60978/192862 [00:11<00:25, 5171.95it/s][A[A[A[A



 32%|███▏      | 61497/192862 [00:11<00:26, 5001.79it/s][A[A[A[A



 32%|███▏      | 62045/192862 [00:12<00:25, 5134.71it/s][A[A[A[A



 32%|███▏      | 62562/192862 [00:12<00:25, 5034.30it/s][A[A[A[A



 33%|███▎      | 63103/192862 [00:12<00:25, 5141.26it/s][A[A[A[A



 33%|███▎      | 63620/192862 [00:12<00:25, 5029.11it/s][A[A[A[A



 33%|███▎      | 64155/192862 [00:12<00:25, 5119.65it/s][A[A[A[A



 34%|███▎      | 64675/192862 [00:12<00:24, 5142.14it/s][A[A[A[A



 34%|███▍      | 65191/192862 [00:12<00:25, 5008.55it/s][A[A[A[A



 34%|███▍      | 65733/192862 [00:12<00:24, 5125.22it/s][A[A[A[A



 34%|███▍      | 66285/192862 [00:12<00:24, 5237.30it/s][A[A[A[A



 35%|███▍      | 66820/192862 [00:12<00:23, 5268.48it/s][A[A[A[A



 35%|███▍      | 67349/192862 [00:13<00:24, 5105.98it/s][A[A[

 62%|██████▏   | 120501/192862 [00:23<00:13, 5417.67it/s][A[A[A[A



 63%|██████▎   | 121052/192862 [00:23<00:13, 5443.64it/s][A[A[A[A



 63%|██████▎   | 121607/192862 [00:23<00:13, 5474.90it/s][A[A[A[A



 63%|██████▎   | 122157/192862 [00:23<00:12, 5481.26it/s][A[A[A[A



 64%|██████▎   | 122706/192862 [00:23<00:12, 5470.31it/s][A[A[A[A



 64%|██████▍   | 123254/192862 [00:23<00:12, 5448.01it/s][A[A[A[A



 64%|██████▍   | 123799/192862 [00:23<00:12, 5434.70it/s][A[A[A[A



 64%|██████▍   | 124343/192862 [00:24<00:12, 5418.29it/s][A[A[A[A



 65%|██████▍   | 124885/192862 [00:24<00:12, 5389.12it/s][A[A[A[A



 65%|██████▌   | 125424/192862 [00:24<00:12, 5369.51it/s][A[A[A[A



 65%|██████▌   | 125964/192862 [00:24<00:12, 5375.89it/s][A[A[A[A



 66%|██████▌   | 126502/192862 [00:24<00:12, 5187.83it/s][A[A[A[A



 66%|██████▌   | 127049/192862 [00:24<00:12, 5266.63it/s][A[A[A[A



 66%|██████▌   | 127577/192862 [00:24<00:12, 5253.0

 93%|█████████▎| 179744/192862 [00:34<00:02, 4922.92it/s][A[A[A[A



 93%|█████████▎| 180238/192862 [00:35<00:02, 4861.20it/s][A[A[A[A



 94%|█████████▎| 180726/192862 [00:35<00:02, 4866.22it/s][A[A[A[A



 94%|█████████▍| 181251/192862 [00:35<00:02, 4974.42it/s][A[A[A[A



 94%|█████████▍| 181750/192862 [00:35<00:02, 4820.44it/s][A[A[A[A



 94%|█████████▍| 182234/192862 [00:35<00:02, 4723.30it/s][A[A[A[A



 95%|█████████▍| 182709/192862 [00:35<00:02, 4162.14it/s][A[A[A[A



 95%|█████████▍| 183157/192862 [00:35<00:02, 4251.10it/s][A[A[A[A



 95%|█████████▌| 183620/192862 [00:35<00:02, 4357.43it/s][A[A[A[A



 95%|█████████▌| 184069/192862 [00:35<00:02, 4394.65it/s][A[A[A[A



 96%|█████████▌| 184532/192862 [00:36<00:01, 4460.88it/s][A[A[A[A



 96%|█████████▌| 184994/192862 [00:36<00:01, 4505.72it/s][A[A[A[A



 96%|█████████▌| 185453/192862 [00:36<00:01, 4529.36it/s][A[A[A[A



 96%|█████████▋| 185908/192862 [00:36<00:01, 4126.1

Unnamed: 0,pmid,sample_id,peptide,format,mhc_class,hla,expression_dataset,cell_line,original_pmid,pulldown_antibody,sample_type,proteins_human,proteins_mouse,proteins_viral,protein,hit,peptide_length
0,29314611,MAVER-1_DR,KPISKAVIVLNEGIKVQTKE,DR-SPECIFIC,II,HLA-DRB1*01:01 HLA-DRB1*13:01 HLA-DRB3*02:02 H...,sample_type:B-CELL,MAVER-1,29314611,L243 (HLA-DR),B-CELL,[sp|O75976-2|CBPD-2_HUMAN],[],[],sp|O75976-2|CBPD-2_HUMAN,1,20
1,29314611,MAVER-1_DR,DPSAVAKHFVALSTNTTKVKE,DR-SPECIFIC,II,HLA-DRB1*01:01 HLA-DRB1*13:01 HLA-DRB3*02:02 H...,sample_type:B-CELL,MAVER-1,29314611,L243 (HLA-DR),B-CELL,"[tr|A0A2R8Y6C7|A0A2R8Y6C7_HUMAN, sp|P06744-2|G...",[],[],tr|A0A2R8Y6C7|A0A2R8Y6C7_HUMAN,1,21
2,29314611,MAVER-1_DR,TPAKPSSPPPEFSFNTPGKNVN,DR-SPECIFIC,II,HLA-DRB1*01:01 HLA-DRB1*13:01 HLA-DRB3*02:02 H...,sample_type:B-CELL,MAVER-1,29314611,L243 (HLA-DR),B-CELL,"[sp|Q6NXT4-2|ZNT6-2_HUMAN, sp|Q6NXT4-3|ZNT6-3_...",[],[],sp|Q6NXT4-2|ZNT6-2_HUMAN,1,22
3,29314611,MAVER-1_DR,REIDDHDAVLRFNGAPTANFQQDVGTK,DR-SPECIFIC,II,HLA-DRB1*01:01 HLA-DRB1*13:01 HLA-DRB3*02:02 H...,sample_type:B-CELL,MAVER-1,29314611,L243 (HLA-DR),B-CELL,[tr|H7C472|H7C472_HUMAN],[],[],tr|H7C472|H7C472_HUMAN,1,27
4,29314611,MAVER-1_DR,RPGGVVHSFSHNVGPGDK,DR-SPECIFIC,II,HLA-DRB1*01:01 HLA-DRB1*13:01 HLA-DRB3*02:02 H...,sample_type:B-CELL,MAVER-1,29314611,L243 (HLA-DR),B-CELL,[tr|M0QYN0|M0QYN0_HUMAN],[],[],tr|M0QYN0|M0QYN0_HUMAN,1,18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19479057,31611696,3912_BAM-HLA-DR-DEPLETED,YGSVKAYTNFDAERD,DR-DEPLETED,II,HLA-DRB1*03:01 HLA-DRB1*04:01 HLA-DRB3*01:01 H...,sample_type:MENINGIOMA,,31611696,,MENINGIOMA,[sp|P07355-2|ANXA2-2_HUMAN],[tr|B0V2N5|B0V2N5_MOUSE],[],sp|P07355-2|ANXA2-2_HUMAN,0,15
19479058,31611696,4052_BA-HLA-DR,WISIMTERSVPHLQK,DR-SPECIFIC,II,HLA-DRB1*03:01 HLA-DRB1*11:04 HLA-DRB3*01:01 H...,sample_type:MENINGIOMA,,31611696,HB298,MENINGIOMA,[sp|P07355-2|ANXA2-2_HUMAN],[tr|B0V2N5|B0V2N5_MOUSE],[],sp|P07355-2|ANXA2-2_HUMAN,0,15
19479059,31611696,4052_BA-HLA-DR-DEPLETED,DFRKLMVALAKGRRA,DR-DEPLETED,II,HLA-DRB1*03:01 HLA-DRB1*11:04 HLA-DRB3*01:01 H...,sample_type:MENINGIOMA,,31611696,,MENINGIOMA,[sp|P07355-2|ANXA2-2_HUMAN],[tr|B0V2N5|B0V2N5_MOUSE],[],sp|P07355-2|ANXA2-2_HUMAN,0,15
19479060,31611696,3912_BAM-HLA-DR,STPPSAYGSVKAYTNF,DR-SPECIFIC,II,HLA-DRB1*03:01 HLA-DRB1*04:01 HLA-DRB3*01:01 H...,sample_type:MENINGIOMA,,31611696,HB298,MENINGIOMA,[sp|P07355-2|ANXA2-2_HUMAN],[tr|B0V2N5|B0V2N5_MOUSE],[],sp|P07355-2|ANXA2-2_HUMAN,0,16


In [62]:
train_with_decoys_df = add_decoys(
    train_df,
    proteins_df.sequence.to_dict(),
    protein_column="proteins_human",
    decoys_per_hit=10)
train_with_decoys_df





  0%|          | 0/33806 [00:00<?, ?it/s][A[A[A[A



  7%|▋         | 2366/33806 [00:00<00:01, 23654.27it/s][A[A[A[A



 17%|█▋        | 5637/33806 [00:00<00:01, 25796.78it/s][A[A[A[A



 30%|██▉       | 10011/33806 [00:00<00:00, 29416.54it/s][A[A[A[A



 43%|████▎     | 14578/33806 [00:00<00:00, 32931.76it/s][A[A[A[A



 57%|█████▋    | 19152/33806 [00:00<00:00, 35951.35it/s][A[A[A[A



 70%|███████   | 23774/33806 [00:00<00:00, 38518.06it/s][A[A[A[A



 84%|████████▍ | 28470/33806 [00:00<00:00, 40712.70it/s][A[A[A[A



 98%|█████████▊| 33061/33806 [00:00<00:00, 42143.28it/s][A[A[A[A



100%|██████████| 33806/33806 [00:00<00:00, 41288.09it/s][A[A[A[A



  0%|          | 0/10 [00:00<?, ?it/s][A[A[A[A



100%|██████████| 10/10 [00:00<00:00, 121.50it/s][A[A[A[A

Unnamed: 0,pmid,sample_id,peptide,format,mhc_class,hla,expression_dataset,cell_line,original_pmid,pulldown_antibody,sample_type,proteins_human,proteins_mouse,proteins_viral,protein,hit,peptide_length
0,31495665,MAPTAC_DRB1*12:01_DM-,VPGPGPAPMPSDFQVLRAKY,MONOALLELIC,II,HLA-DRA*01:01-DRB1*12:01,cell_line:EXPI293,EXPI293,31495665,MAPTAC,EXPI293,[sp|O95479-2|G6PE-2_HUMAN],[],[],sp|O95479-2|G6PE-2_HUMAN,1,20
1,31495665,MAPTAC_DRB1*12:01_DM-,ALMGYATHKYLDSEEDEE,MONOALLELIC,II,HLA-DRA*01:01-DRB1*12:01,cell_line:EXPI293,EXPI293,31495665,MAPTAC,EXPI293,"[sp|P09912-2|IFI6-2_HUMAN, sp|P09912-3|IFI6-3_...",[],[],sp|P09912-2|IFI6-2_HUMAN,1,18
2,31495665,MAPTAC_DRB1*12:01_DM-,GSDQSENVDRGAGSIREA,MONOALLELIC,II,HLA-DRA*01:01-DRB1*12:01,cell_line:EXPI293,EXPI293,31495665,MAPTAC,EXPI293,"[sp|Q9UII2-2|ATIF1-2_HUMAN, sp|Q9UII2-3|ATIF1-...",[],[],sp|Q9UII2-2|ATIF1-2_HUMAN,1,18
3,31495665,MAPTAC_DRB1*12:01_DM-,FNYRRRRPENPKPQDGKETKAAD,MONOALLELIC,II,HLA-DRA*01:01-DRB1*12:01,cell_line:EXPI293,EXPI293,31495665,MAPTAC,EXPI293,[tr|H0Y449|H0Y449_HUMAN],[tr|A2BGG7|A2BGG7_MOUSE],[],tr|H0Y449|H0Y449_HUMAN,1,23
4,31495665,MAPTAC_DRB1*12:01_DM-,FNYRRRRPENPKPQDGKETKAADPPAE,MONOALLELIC,II,HLA-DRA*01:01-DRB1*12:01,cell_line:EXPI293,EXPI293,31495665,MAPTAC,EXPI293,[tr|H0Y449|H0Y449_HUMAN],[tr|A2BGG7|A2BGG7_MOUSE],[],tr|H0Y449|H0Y449_HUMAN,1,27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
371861,31495665,MAPTAC_DRB1*15:01,VGTTHDLLDI,MONOALLELIC,II,HLA-DRA*01:01-DRB1*15:01,cell_line:EXPI293,EXPI293,31495665,MAPTAC,EXPI293,"[tr|J3KS79|J3KS79_HUMAN, tr|K7ENH5|K7ENH5_HUMA...","[tr|E9PZH4|E9PZH4_MOUSE, tr|E9Q6V3|E9Q6V3_MOUSE]",[],tr|J3KS79|J3KS79_HUMAN,0,10
371862,31495665,MAPTAC_DRB1*15:01,HDNQHWQTAPFWTLGPFCA,MONOALLELIC,II,HLA-DRA*01:01-DRB1*15:01,cell_line:EXPI293,EXPI293,31495665,MAPTAC,EXPI293,[sp|Q8IWU5-2|SULF2-2_HUMAN],[],[],sp|Q8IWU5-2|SULF2-2_HUMAN,0,19
371863,31495665,MAPTAC_DRB1*15:01,LPTTTQRVGVPTAVQNL,MONOALLELIC,II,HLA-DRA*01:01-DRB1*15:01,cell_line:EXPI293,EXPI293,31495665,MAPTAC,EXPI293,"[tr|H0YEU6|H0YEU6_HUMAN, tr|H0Y5F5|H0Y5F5_HUMA...","[tr|A3KFU5|A3KFU5_MOUSE, tr|A3KFU8|A3KFU8_MOUS...",[],tr|H0YEU6|H0YEU6_HUMAN,0,17
371864,31495665,MAPTAC_DRB1*15:01,QLSLTEEDDSGINDED,MONOALLELIC,II,HLA-DRA*01:01-DRB1*15:01,cell_line:EXPI293,EXPI293,31495665,MAPTAC,EXPI293,"[sp|Q8IY82-2|DRC7-2_HUMAN, tr|H3BRR8|H3BRR8_HU...",[],[],sp|Q8IY82-2|DRC7-2_HUMAN,0,16


In [65]:
allele_sequences_df = pandas.read_csv(get_path("allele_sequences", "allele_sequences.csv"), index_col=0)
allele_sequences_variant = allele_sequences_df.columns[0]
print("using variant", allele_sequences_variant)
allele_sequences_alpha = allele_sequences_df.loc[allele_sequences_df.kind == "alpha", allele_sequences_variant]
allele_sequences_beta = allele_sequences_df.loc[allele_sequences_df.kind == "beta", allele_sequences_variant]
allele_sequences_alpha, allele_sequences_beta

using variant maf_0.2_and_0.3_within_6.0_angstrom


(sequence_id
 HLA-DPA1*02:38Q    SYMFFQRAFSEGGAIILNTL
 HLA-DPA1*01:03     SYAFFMQAFSEGGAIILNTL
 HLA-DPA1*01:03Q    SYAFFMQAFSEGGAIILNTL
 HLA-DPA1*01:14     SYAFFMQAFSEGGAIILNTL
 HLA-DPA1*01:18     SYAFFMQAFSEGGAIILNTL
                            ...         
 HLA-DPA1*01:46     SYAFFMQAFSEGGAIILNTL
 HLA-DPA1*01:05     SYAFFMQAFSEGGAIILNTL
 HLA-DPA1*01:41     SYAFFMQAFSEGGAIILNTL
 HLA-DPA1*02:03     SYAFFMRAFSEGGAIILNTL
 HLA-DPA1*02:16     SYAFFQRAFSEGGAIILNTL
 Name: maf_0.2_and_0.3_within_6.0_angstrom, Length: 174, dtype: object,
 sequence_id
 HLA-DQB1*05:03      YFGGTHYVYDVGASRVEVAYGI
 HLA-DQB1*05:08      YFGGTHYVYDVGASRVEVAYGI
 HLA-DQB1*05:10      YFGGTHYVYDVGASRVEVAYGI
 HLA-DQB1*05:108     YFGGTHYVYDVGASRVEVAYGI
 HLA-DQB1*05:134     YFGGTHYVYDVGASRVEVAYGI
                              ...          
 HLA-DQB1*03:422N    YFGLTYYAYAVRTETVQLELTT
 HLA-DRB1*12:06      ESGLEHLLFVIDRATYGAVEFT
 HLA-DRB4*01:124     EACNIYYAYDLRRETYGVVEFT
 HLA-DRB5*01:08N     QDYFHGNVYDFDRATYGVGEFT
 HLA-DQB1*0

In [68]:
train_with_decoys_df["parsed_allele"] = train_with_decoys_df.hla.map(lambda s: mhcgnomes.parse(s, infer_class2_pairing=True))
train_with_decoys_df

Unnamed: 0,pmid,sample_id,peptide,format,mhc_class,hla,expression_dataset,cell_line,original_pmid,pulldown_antibody,sample_type,proteins_human,proteins_mouse,proteins_viral,protein,hit,peptide_length,parsed_allele
0,31495665,MAPTAC_DRB1*12:01_DM-,VPGPGPAPMPSDFQVLRAKY,MONOALLELIC,II,HLA-DRA*01:01-DRB1*12:01,cell_line:EXPI293,EXPI293,31495665,MAPTAC,EXPI293,[sp|O95479-2|G6PE-2_HUMAN],[],[],sp|O95479-2|G6PE-2_HUMAN,1,20,Class2Pair(alpha=Allele(gene=Gene(species=Spec...
1,31495665,MAPTAC_DRB1*12:01_DM-,ALMGYATHKYLDSEEDEE,MONOALLELIC,II,HLA-DRA*01:01-DRB1*12:01,cell_line:EXPI293,EXPI293,31495665,MAPTAC,EXPI293,"[sp|P09912-2|IFI6-2_HUMAN, sp|P09912-3|IFI6-3_...",[],[],sp|P09912-2|IFI6-2_HUMAN,1,18,Class2Pair(alpha=Allele(gene=Gene(species=Spec...
2,31495665,MAPTAC_DRB1*12:01_DM-,GSDQSENVDRGAGSIREA,MONOALLELIC,II,HLA-DRA*01:01-DRB1*12:01,cell_line:EXPI293,EXPI293,31495665,MAPTAC,EXPI293,"[sp|Q9UII2-2|ATIF1-2_HUMAN, sp|Q9UII2-3|ATIF1-...",[],[],sp|Q9UII2-2|ATIF1-2_HUMAN,1,18,Class2Pair(alpha=Allele(gene=Gene(species=Spec...
3,31495665,MAPTAC_DRB1*12:01_DM-,FNYRRRRPENPKPQDGKETKAAD,MONOALLELIC,II,HLA-DRA*01:01-DRB1*12:01,cell_line:EXPI293,EXPI293,31495665,MAPTAC,EXPI293,[tr|H0Y449|H0Y449_HUMAN],[tr|A2BGG7|A2BGG7_MOUSE],[],tr|H0Y449|H0Y449_HUMAN,1,23,Class2Pair(alpha=Allele(gene=Gene(species=Spec...
4,31495665,MAPTAC_DRB1*12:01_DM-,FNYRRRRPENPKPQDGKETKAADPPAE,MONOALLELIC,II,HLA-DRA*01:01-DRB1*12:01,cell_line:EXPI293,EXPI293,31495665,MAPTAC,EXPI293,[tr|H0Y449|H0Y449_HUMAN],[tr|A2BGG7|A2BGG7_MOUSE],[],tr|H0Y449|H0Y449_HUMAN,1,27,Class2Pair(alpha=Allele(gene=Gene(species=Spec...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
371861,31495665,MAPTAC_DRB1*15:01,VGTTHDLLDI,MONOALLELIC,II,HLA-DRA*01:01-DRB1*15:01,cell_line:EXPI293,EXPI293,31495665,MAPTAC,EXPI293,"[tr|J3KS79|J3KS79_HUMAN, tr|K7ENH5|K7ENH5_HUMA...","[tr|E9PZH4|E9PZH4_MOUSE, tr|E9Q6V3|E9Q6V3_MOUSE]",[],tr|J3KS79|J3KS79_HUMAN,0,10,Class2Pair(alpha=Allele(gene=Gene(species=Spec...
371862,31495665,MAPTAC_DRB1*15:01,HDNQHWQTAPFWTLGPFCA,MONOALLELIC,II,HLA-DRA*01:01-DRB1*15:01,cell_line:EXPI293,EXPI293,31495665,MAPTAC,EXPI293,[sp|Q8IWU5-2|SULF2-2_HUMAN],[],[],sp|Q8IWU5-2|SULF2-2_HUMAN,0,19,Class2Pair(alpha=Allele(gene=Gene(species=Spec...
371863,31495665,MAPTAC_DRB1*15:01,LPTTTQRVGVPTAVQNL,MONOALLELIC,II,HLA-DRA*01:01-DRB1*15:01,cell_line:EXPI293,EXPI293,31495665,MAPTAC,EXPI293,"[tr|H0YEU6|H0YEU6_HUMAN, tr|H0Y5F5|H0Y5F5_HUMA...","[tr|A3KFU5|A3KFU5_MOUSE, tr|A3KFU8|A3KFU8_MOUS...",[],tr|H0YEU6|H0YEU6_HUMAN,0,17,Class2Pair(alpha=Allele(gene=Gene(species=Spec...
371864,31495665,MAPTAC_DRB1*15:01,QLSLTEEDDSGINDED,MONOALLELIC,II,HLA-DRA*01:01-DRB1*15:01,cell_line:EXPI293,EXPI293,31495665,MAPTAC,EXPI293,"[sp|Q8IY82-2|DRC7-2_HUMAN, tr|H3BRR8|H3BRR8_HU...",[],[],sp|Q8IY82-2|DRC7-2_HUMAN,0,16,Class2Pair(alpha=Allele(gene=Gene(species=Spec...


In [78]:
from mhc2flurry.amino_acid import COMMON_AMINO_ACIDS
COMMON_AMINO_ACIDS

aa_regex = "^[%s]+$" % "".join(sorted(COMMON_AMINO_ACIDS))
aa_regex
train_with_decoys_df.peptide.str.match(aa_regex).mean()

0.999416456465501

In [79]:
# Keep only rows whose allele string parsed to an alpha/beta class II pair.
use_train_df = train_with_decoys_df.loc[
    train_with_decoys_df.parsed_allele.map(
        lambda p: isinstance(p, mhcgnomes.Class2Pair))
].copy()

# Keep only peptides made up entirely of the common amino acids. The .copy()
# makes this an independent frame, so the column assignments below do not act
# on a slice of the previous one.
use_train_df = use_train_df.loc[use_train_df.peptide.str.match(aa_regex)].copy()

# Derive the chain names from the *filtered* rows. Mapping over the unfiltered
# train_with_decoys_df would raise AttributeError on any entry that is not a
# Class2Pair, even though that row has already been dropped above.
use_train_df["alpha_allele"] = use_train_df.parsed_allele.map(lambda p: p.alpha.to_string())
use_train_df["beta_allele"] = use_train_df.parsed_allele.map(lambda p: p.beta.to_string())

# Drop rows for which either chain has no entry in the allele sequence tables.
use_train_df = use_train_df.loc[
    (use_train_df.alpha_allele.isin(allele_sequences_alpha.index)) &
    (use_train_df.beta_allele.isin(allele_sequences_beta.index))
].copy()

# Normalized allele name; reuses the already-parsed object instead of parsing
# the hla string a second time with the same arguments.
use_train_df["allele"] = use_train_df.parsed_allele.map(lambda p: p.to_string())
use_train_df

Unnamed: 0,pmid,sample_id,peptide,format,mhc_class,hla,expression_dataset,cell_line,original_pmid,pulldown_antibody,sample_type,proteins_human,proteins_mouse,proteins_viral,protein,hit,peptide_length,parsed_allele,alpha_allele,beta_allele,allele
0,31495665,MAPTAC_DRB1*12:01_DM-,VPGPGPAPMPSDFQVLRAKY,MONOALLELIC,II,HLA-DRA*01:01-DRB1*12:01,cell_line:EXPI293,EXPI293,31495665,MAPTAC,EXPI293,[sp|O95479-2|G6PE-2_HUMAN],[],[],sp|O95479-2|G6PE-2_HUMAN,1,20,Class2Pair(alpha=Allele(gene=Gene(species=Spec...,HLA-DRA*01:01,HLA-DRB1*12:01,HLA-DRA*01:01-DRB1*12:01
1,31495665,MAPTAC_DRB1*12:01_DM-,ALMGYATHKYLDSEEDEE,MONOALLELIC,II,HLA-DRA*01:01-DRB1*12:01,cell_line:EXPI293,EXPI293,31495665,MAPTAC,EXPI293,"[sp|P09912-2|IFI6-2_HUMAN, sp|P09912-3|IFI6-3_...",[],[],sp|P09912-2|IFI6-2_HUMAN,1,18,Class2Pair(alpha=Allele(gene=Gene(species=Spec...,HLA-DRA*01:01,HLA-DRB1*12:01,HLA-DRA*01:01-DRB1*12:01
2,31495665,MAPTAC_DRB1*12:01_DM-,GSDQSENVDRGAGSIREA,MONOALLELIC,II,HLA-DRA*01:01-DRB1*12:01,cell_line:EXPI293,EXPI293,31495665,MAPTAC,EXPI293,"[sp|Q9UII2-2|ATIF1-2_HUMAN, sp|Q9UII2-3|ATIF1-...",[],[],sp|Q9UII2-2|ATIF1-2_HUMAN,1,18,Class2Pair(alpha=Allele(gene=Gene(species=Spec...,HLA-DRA*01:01,HLA-DRB1*12:01,HLA-DRA*01:01-DRB1*12:01
3,31495665,MAPTAC_DRB1*12:01_DM-,FNYRRRRPENPKPQDGKETKAAD,MONOALLELIC,II,HLA-DRA*01:01-DRB1*12:01,cell_line:EXPI293,EXPI293,31495665,MAPTAC,EXPI293,[tr|H0Y449|H0Y449_HUMAN],[tr|A2BGG7|A2BGG7_MOUSE],[],tr|H0Y449|H0Y449_HUMAN,1,23,Class2Pair(alpha=Allele(gene=Gene(species=Spec...,HLA-DRA*01:01,HLA-DRB1*12:01,HLA-DRA*01:01-DRB1*12:01
4,31495665,MAPTAC_DRB1*12:01_DM-,FNYRRRRPENPKPQDGKETKAADPPAE,MONOALLELIC,II,HLA-DRA*01:01-DRB1*12:01,cell_line:EXPI293,EXPI293,31495665,MAPTAC,EXPI293,[tr|H0Y449|H0Y449_HUMAN],[tr|A2BGG7|A2BGG7_MOUSE],[],tr|H0Y449|H0Y449_HUMAN,1,27,Class2Pair(alpha=Allele(gene=Gene(species=Spec...,HLA-DRA*01:01,HLA-DRB1*12:01,HLA-DRA*01:01-DRB1*12:01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
371861,31495665,MAPTAC_DRB1*15:01,VGTTHDLLDI,MONOALLELIC,II,HLA-DRA*01:01-DRB1*15:01,cell_line:EXPI293,EXPI293,31495665,MAPTAC,EXPI293,"[tr|J3KS79|J3KS79_HUMAN, tr|K7ENH5|K7ENH5_HUMA...","[tr|E9PZH4|E9PZH4_MOUSE, tr|E9Q6V3|E9Q6V3_MOUSE]",[],tr|J3KS79|J3KS79_HUMAN,0,10,Class2Pair(alpha=Allele(gene=Gene(species=Spec...,HLA-DRA*01:01,HLA-DRB1*15:01,HLA-DRA*01:01-DRB1*15:01
371862,31495665,MAPTAC_DRB1*15:01,HDNQHWQTAPFWTLGPFCA,MONOALLELIC,II,HLA-DRA*01:01-DRB1*15:01,cell_line:EXPI293,EXPI293,31495665,MAPTAC,EXPI293,[sp|Q8IWU5-2|SULF2-2_HUMAN],[],[],sp|Q8IWU5-2|SULF2-2_HUMAN,0,19,Class2Pair(alpha=Allele(gene=Gene(species=Spec...,HLA-DRA*01:01,HLA-DRB1*15:01,HLA-DRA*01:01-DRB1*15:01
371863,31495665,MAPTAC_DRB1*15:01,LPTTTQRVGVPTAVQNL,MONOALLELIC,II,HLA-DRA*01:01-DRB1*15:01,cell_line:EXPI293,EXPI293,31495665,MAPTAC,EXPI293,"[tr|H0YEU6|H0YEU6_HUMAN, tr|H0Y5F5|H0Y5F5_HUMA...","[tr|A3KFU5|A3KFU5_MOUSE, tr|A3KFU8|A3KFU8_MOUS...",[],tr|H0YEU6|H0YEU6_HUMAN,0,17,Class2Pair(alpha=Allele(gene=Gene(species=Spec...,HLA-DRA*01:01,HLA-DRB1*15:01,HLA-DRA*01:01-DRB1*15:01
371864,31495665,MAPTAC_DRB1*15:01,QLSLTEEDDSGINDED,MONOALLELIC,II,HLA-DRA*01:01-DRB1*15:01,cell_line:EXPI293,EXPI293,31495665,MAPTAC,EXPI293,"[sp|Q8IY82-2|DRC7-2_HUMAN, tr|H3BRR8|H3BRR8_HU...",[],[],sp|Q8IY82-2|DRC7-2_HUMAN,0,16,Class2Pair(alpha=Allele(gene=Gene(species=Spec...,HLA-DRA*01:01,HLA-DRB1*15:01,HLA-DRA*01:01-DRB1*15:01


In [80]:
#use_train_df.allele.value_counts()

In [81]:
import mhc2flurry.allele_encoding
import mhc2flurry.allele_encoding_pair

# Build one AlleleEncoding per chain from the training rows, each looking up
# sequences in its own allele -> sequence dictionary, then bundle the two.
train_alpha_encoding = mhc2flurry.allele_encoding.AlleleEncoding(
    use_train_df.alpha_allele.values,
    allele_to_sequence=allele_sequences_alpha.to_dict(),
)
train_beta_encoding = mhc2flurry.allele_encoding.AlleleEncoding(
    use_train_df.beta_allele.values,
    allele_to_sequence=allele_sequences_beta.to_dict(),
)

allele_encoding_pair = mhc2flurry.allele_encoding_pair.AlleleEncodingPair(
    train_alpha_encoding,
    train_beta_encoding,
)
allele_encoding_pair

<mhc2flurry.allele_encoding_pair.AlleleEncodingPair at 0x7f69357bb5c0>

In [82]:
# `imp` is deprecated (and removed in Python 3.12); importlib.reload is the
# supported replacement.
import importlib

import mhc2flurry.condconv
import mhc2flurry.class2_neural_network

# Reload the library modules so the freshly reloaded code is what gets used
# below. condconv is reloaded before class2_neural_network, matching the
# original order.
importlib.reload(mhc2flurry.condconv)
importlib.reload(mhc2flurry.class2_neural_network)


# Only the hyperparameters listed here are overridden; the full resolved set
# is printed right after construction.
model = mhc2flurry.class2_neural_network.Class2NeuralNetwork(
    random_negative_rate=1.0,
    random_negative_binder_threshold=2000,
    layer_sizes=[8],
    patience=5,
    peptide_convolutions=[
        {'kernel_size': 9, 'filters': 64, 'activation': "relu"},
        {'kernel_size': 1, 'filters': 16, 'activation': "relu"},
        {'kernel_size': 16, 'filters': 16, 'activation': "relu"},
    ],
)
print(model.hyperparameters)

# NOTE(review): the `hit` column is passed through the `affinities` argument;
# confirm this is the intended target encoding for Class2NeuralNetwork.fit.
model.fit(
    use_train_df.peptide.values,
    affinities=use_train_df.hit.values,
    allele_encoding_pair=allele_encoding_pair
)

{'random_negative_rate': 1.0, 'random_negative_binder_threshold': 2000, 'layer_sizes': [8], 'patience': 5, 'peptide_convolutions': [{'kernel_size': 9, 'filters': 64, 'activation': 'relu'}, {'kernel_size': 1, 'filters': 16, 'activation': 'relu'}, {'kernel_size': 16, 'filters': 16, 'activation': 'relu'}], 'allele_amino_acid_encoding': 'BLOSUM62', 'allele_dense_layer_sizes': [], 'allele_positionwise_embedding_size': 32, 'peptide_encoding': {'vector_encoding_name': 'BLOSUM62', 'alignment_method': 'right_pad', 'max_length': 50}, 'dense_layer_l1_regularization': 0.001, 'dense_layer_l2_regularization': 0.0, 'activation': 'tanh', 'init': 'glorot_uniform', 'output_activation': 'sigmoid', 'dropout_probability': 0.0, 'batch_normalization': False, 'topology': 'feedforward', 'num_outputs': 1, 'loss': 'custom:mse_with_inequalities', 'optimizer': 'rmsprop', 'learning_rate': None, 'max_epochs': 500, 'validation_split': 0.1, 'early_stopping': True, 'minibatch_size': 128, 'data_dependent_initialization_

INFO:root:Using amino acid distribution for random negative:
{'V': 0.06674147900182593, 'P': 0.061115308758415296, 'G': 0.07213969860569708, 'A': 0.07448236329930819, 'M': 0.018172457290471258, 'S': 0.07049569536269996, 'D': 0.05402081697509211, 'F': 0.03660132275255189, 'Q': 0.04577655378125101, 'L': 0.09009553019286076, 'R': 0.05805031640129534, 'K': 0.05973221462322228, 'Y': 0.029358445266292808, 'T': 0.053627348036778745, 'H': 0.025355661074277645, 'E': 0.07255480107604317, 'N': 0.039106747106623985, 'I': 0.043839408919635926, 'W': 0.011527187977680002, 'C': 0.017206643497976612}
INFO:root:Random negative plan [by_allele]:
                                    8     9     10    11    12    13    14  \
(HLA-DPA1*01:03, HLA-DPB1*06:01)  5937  5937  5937  5937  5937  5937  5937   
(HLA-DQA1*01:02, HLA-DQB1*06:04)  2418  2418  2418  2418  2418  2418  2418   
(HLA-DRA*01:01, HLA-DRB1*01:01)   8295  8295  8295  8295  8295  8295  8295   
(HLA-DRA*01:01, HLA-DRB1*03:01)   2677  2677  2677  2

Model: "predictor"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
alpha_allele (InputLayer)       [(None, 1)]          0                                            
__________________________________________________________________________________________________
beta_allele (InputLayer)        [(None, 1)]          0                                            
__________________________________________________________________________________________________
alpha_allele_representation (Em (None, 1, 420)       73500       alpha_allele[0][0]               
__________________________________________________________________________________________________
beta_allele_representation (Emb (None, 1, 462)       571494      beta_allele[0][0]                
__________________________________________________________________________________________

Epoch  12 / 500 [59.22 sec]: loss=0.0401855. Min val loss (0.07364790886640549) at epoch 9
Epoch 14/14
Epoch  13 / 500 [59.55 sec]: loss=0.0400862. Min val loss (0.07364790886640549) at epoch 9
Epoch 15/15
Epoch  14 / 500 [59.65 sec]: loss=0.03996. Min val loss (0.07364790886640549) at epoch 9
Epoch 16/16
Epoch  15 / 500 [59.76 sec]: loss=0.0398947. Min val loss (0.07364790886640549) at epoch 9
Epoch 17/17
Epoch  16 / 500 [59.71 sec]: loss=0.0398352. Min val loss (0.07331274449825287) at epoch 15
Epoch 18/18
Epoch  17 / 500 [59.63 sec]: loss=0.0397941. Min val loss (0.07331274449825287) at epoch 15
Epoch 19/19
Epoch  18 / 500 [59.23 sec]: loss=0.0397145. Min val loss (0.07331274449825287) at epoch 15
Epoch 20/20
Epoch  19 / 500 [59.64 sec]: loss=0.0396353. Min val loss (0.07310142368078232) at epoch 18
Epoch 21/21
Epoch  20 / 500 [59.55 sec]: loss=0.0395715. Min val loss (0.07310142368078232) at epoch 18
Epoch 22/22
Epoch  21 / 500 [59.64 sec]: loss=0.0395267. Min val loss (0.072228550

In [None]:
# Set up the validation set: MONOALLELIC rows of train_ms_df whose peptide
# does not appear in use_train_df.

is_monoallelic = train_ms_df.format == "MONOALLELIC"
peptide_in_training = train_ms_df.peptide.isin(use_train_df.peptide)
validation_df = train_ms_df.loc[is_monoallelic & ~peptide_in_training].copy()

validation_df["parsed_allele"] = validation_df.hla.map(
    lambda s: mhcgnomes.parse(s, infer_class2_pairing=True)
)

# Keep only rows whose parsed result is a Class2Pair, then split each pair
# into its alpha and beta allele names.
parsed_as_pair = validation_df.parsed_allele.map(
    lambda p: isinstance(p, mhcgnomes.Class2Pair)
)
validation_df = validation_df.loc[parsed_as_pair].copy()
validation_df["alpha_allele"] = validation_df.parsed_allele.map(
    lambda p: p.alpha.to_string()
)
validation_df["beta_allele"] = validation_df.parsed_allele.map(
    lambda p: p.beta.to_string()
)

validation_df

In [None]:
# Number of validation rows per pmid value.
validation_df["pmid"].value_counts()

In [None]:
# Number of validation rows per beta allele.
validation_df["beta_allele"].value_counts()

In [None]:
# Encode the validation alleles with the same allele -> sequence series used
# for the training encoding.
validation_alpha_encoding = mhc2flurry.allele_encoding.AlleleEncoding(
    validation_df.alpha_allele.values,
    allele_to_sequence=allele_sequences_alpha.to_dict(),
)
validation_beta_encoding = mhc2flurry.allele_encoding.AlleleEncoding(
    validation_df.beta_allele.values,
    allele_to_sequence=allele_sequences_beta.to_dict(),
)
validation_allele_encoding_pair = mhc2flurry.allele_encoding_pair.AlleleEncodingPair(
    validation_alpha_encoding,
    validation_beta_encoding,
)

raw_validation_predictions = model.predict(
    validation_df.peptide.values,
    allele_encoding_pair=validation_allele_encoding_pair,
)
# NOTE(review): to_ic50 is applied to the raw model output; confirm that
# model.predict returns values on the scale to_ic50 expects.
validation_df["prediction"] = to_ic50(raw_validation_predictions)
validation_df

In [None]:
# Import the submodule explicitly: `import sklearn` by itself does not load
# sklearn.metrics, so the attribute access below would otherwise depend on
# another import having pulled it in as a side effect.
import sklearn.metrics

# For each `hla` group, label that group's rows as hits (1) and every other
# validation row as 0, then compute the ROC AUC of the predictions.
auc_rows = []
to_score = validation_df.copy()
for allele, sub_validation_df in validation_df.groupby("hla"):
    # Reset labels each iteration; to_score keeps the labels of the last
    # group once the loop finishes.
    to_score["hit"] = 0
    to_score.loc[sub_validation_df.index, "hit"] = 1
    auc_rows.append((
        allele,
        # Predictions are negated so that lower predicted values rank higher.
        sklearn.metrics.roc_auc_score(to_score.hit, -1 * to_score.prediction),
    ))

scores_df = pandas.DataFrame(auc_rows, columns=["allele", "auc"])
scores_df = scores_df.sort_values("auc")

seaborn.barplot(data=scores_df, y="allele", x="auc", color='black')
pyplot.ylabel("Allele")
seaborn.despine()
scores_df

In [None]:
# Display the scoring frame. Its "hit" column was reset on every iteration of
# the scoring loop, so it holds the labels for the last `hla` group only.
to_score

In [None]:
# Number of validation rows per original_pmid value.
validation_df["original_pmid"].value_counts()