In [1]:
import pandas as pd
import numpy as np
import os
import operator
import random

login = os.getlogin()
data_dir = f'/home/{login}/Git/tc-hard/tc-hard-data/'

In [2]:
def set_random_seed(random_seed):
    random.seed(random_seed)
    np.random.seed(random_seed)
set_random_seed(42)

In [3]:
def print_stats(df, mhc="allele"):

    if mhc == "allele":
        mhc_col = "mhc.a"
    elif mhc == "seq":
        mhc_col = "mhc.seq"
    else:
        raise NotImplementedError

    print('Total samples:', len(df))

    temp = df.copy()
    temp = temp[["cdr3.beta", "antigen.epitope", 'label']].dropna().drop_duplicates()
    pos = temp[temp['label']==1]
    neg = temp[temp['label']==0]
    print('\n With CDR3b + pep: ',len(temp))
    print("Non-binding samples: ", len(neg))
    print("Binding samples: ", len(pos))

    temp = df.copy()
    temp = temp[["cdr3.beta", "antigen.epitope", 'label', mhc_col]].dropna().drop_duplicates()
    pos = temp[temp['label']==1]
    neg = temp[temp['label']==0]
    print(f'\n With CDR3b + pep + MHC {mhc}: ',len(temp))
    print("Non-binding samples: ", len(neg))
    print("Binding samples: ", len(pos))

    temp = df.copy()
    temp = temp[["cdr3.beta", "antigen.epitope", 'label', "cdr3.alpha"]].dropna().drop_duplicates()
    pos = temp[temp['label']==1]
    neg = temp[temp['label']==0]
    print(f'\n With CDR3b + pep + CDR3a {mhc}: ',len(temp))
    print("Non-binding samples: ", len(neg))
    print("Binding samples: ", len(pos))

    temp = df.copy()
    temp = temp[["cdr3.alpha", "cdr3.beta", "antigen.epitope", 'label', mhc_col]].dropna().drop_duplicates()
    pos = temp[temp['label']==1]
    neg = temp[temp['label']==0]
    print(f'\n With CDR3b + pep + CDR3a + MHC {mhc}: ',len(temp))
    print("Non-binding samples: ", len(neg))
    print("Binding samples: ", len(pos))

# VDJdb

In [4]:
vdjdb_df = pd.read_csv(data_dir+'vdjdb-2021-09-05/vdjdb_full.txt', sep="\t")
vdjdb_df = vdjdb_df[vdjdb_df['species'] == 'HomoSapiens']

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
vdjdb_df = vdjdb_df[["cdr3.alpha", "cdr3.beta", "mhc.a", "antigen.epitope", "v.alpha", "j.alpha", "v.beta", "d.beta", "j.beta"]]

# remove rows with NaN CDR3 beta and peptides (for CDR3 alpha and MHC we accept NaN)
vdjdb_df.dropna(subset=["cdr3.beta", "antigen.epitope"], inplace=True)

In [6]:
# filter for non-aa characters in CDR3 beta sequences;
# do the same in CDR3 alpha if they're not NeN (keep the NaN alpha)
alpha_nan_or_is_amino = operator.or_(vdjdb_df['cdr3.alpha'].str.match('^[A-Z]+$') == True, vdjdb_df['cdr3.alpha'].isna())
beta_is_amino = vdjdb_df['cdr3.beta'].str.match('^[A-Z]+$') == True
vdjdb_df = vdjdb_df.loc[alpha_nan_or_is_amino & beta_is_amino]

In [7]:
# if CDR3 alpha and CDR3 beta do not start with `C`, add it
vdjdb_df['cdr3.beta'] = vdjdb_df['cdr3.beta'].apply(lambda x: 'C' + x if not pd.isnull(x) and x[0] != 'C' else x)
vdjdb_df['cdr3.alpha'] = vdjdb_df['cdr3.alpha'].apply(lambda x: 'C' + x if not pd.isnull(x) and x[0] != 'C' else x)

# filter for unclear/PTM epitopes
vdjdb_df = vdjdb_df.loc[(vdjdb_df['antigen.epitope'].str.match('^[A-Z]+$') == True)].reset_index(drop=True)

In [8]:
# filter for duplicates
vdjdb_df = vdjdb_df.drop_duplicates(subset=['cdr3.alpha', 'cdr3.beta', 'antigen.epitope', 'mhc.a'], keep='first')

In [9]:
# add label - all samples are positive in this dataset
vdjdb_df['label'] = 1

# mark negative source
vdjdb_df["negative.source"] = np.nan

In [10]:
print_stats(vdjdb_df)

Total samples: 39809

 With CDR3b + pep:  35936
Non-binding samples:  0
Binding samples:  35936

 With CDR3b + pep + MHC allele:  36296
Non-binding samples:  0
Binding samples:  36296

 With CDR3b + pep + CDR3a allele:  23402
Non-binding samples:  0
Binding samples:  23402

 With CDR3b + pep + CDR3a + MHC allele:  23446
Non-binding samples:  0
Binding samples:  23446


# McPAS-TCR

In [11]:
mcpas_df = pd.read_csv(data_dir+'mcpas-tcr-2022-02-22/McPAS-TCR.csv', engine='python')
mcpas_df = mcpas_df[mcpas_df['Species'] == 'Human']

In [12]:
mcpas_df = mcpas_df[[
    'CDR3.alpha.aa', 'CDR3.beta.aa', 'Epitope.peptide', 
    'MHC', "TRAV", "TRAJ", "TRBV", "TRBD", "TRBJ"
]]

In [13]:
mcpas_df = mcpas_df.rename(columns={
    'CDR3.alpha.aa': 'cdr3.alpha', 
    'CDR3.beta.aa': 'cdr3.beta', 
    'Epitope.peptide': 'antigen.epitope', 
    'MHC': 'mhc.a',
    "TRAV": "v.alpha", 
    "TRAJ": "j.alpha", 
    "TRBV": "v.beta", 
    "TRBD": "d.beta", 
    "TRBJ": "j.beta"
})

In [14]:
# remove rows with NaN CDR3 beta and peptides (for CDR3 alpha and MHC we accept NaN)
mcpas_df.dropna(subset=["cdr3.beta", "antigen.epitope"], inplace=True)

In [15]:
# filter for non-aa characters in CDR3 beta sequences;
# do the same in CDR3 alpha if they're not NeN (keep the NaN alpha)
alpha_nan_or_is_amino = operator.or_(mcpas_df['cdr3.alpha'].str.match('^[A-Z]+$') == True, mcpas_df['cdr3.alpha'].isna())
beta_is_amino = mcpas_df['cdr3.beta'].str.match('^[A-Z]+$') == True
mcpas_df = mcpas_df.loc[alpha_nan_or_is_amino & beta_is_amino]

In [16]:
# if CDR3 alpha and CDR3 beta do not start with `C`, add it
mcpas_df['cdr3.beta'] = mcpas_df['cdr3.beta'].apply(lambda x: 'C' + x if not pd.isnull(x) and x[0] != 'C' else x)
mcpas_df['cdr3.alpha'] = mcpas_df['cdr3.alpha'].apply(lambda x: 'C' + x if not pd.isnull(x) and x[0] != 'C' else x)

# filter for unclear/PTM epitopes
mcpas_df = mcpas_df.loc[(mcpas_df['antigen.epitope'].str.match('^[A-Z]+$') == True)].reset_index(drop=True)

In [17]:
# filter for duplicates
mcpas_df = mcpas_df.drop_duplicates(subset=['cdr3.alpha', 'cdr3.beta', 'antigen.epitope', 'mhc.a'], keep='first')

In [18]:
# add label - all samples are positive in this dataset
mcpas_df['label'] = 1

# mark negative source
mcpas_df["negative.source"] = np.nan

In [19]:
print_stats(mcpas_df)

Total samples: 10719

 With CDR3b + pep:  10232
Non-binding samples:  0
Binding samples:  10232

 With CDR3b + pep + MHC allele:  10332
Non-binding samples:  0
Binding samples:  10332

 With CDR3b + pep + CDR3a allele:  2305
Non-binding samples:  0
Binding samples:  2305

 With CDR3b + pep + CDR3a + MHC allele:  2270
Non-binding samples:  0
Binding samples:  2270


# IEDB
already pre-processed with `TCRpair/data_prep_iedb.py`

In [20]:
iedb_pos_df = pd.read_csv(data_dir+'iedb-2022-02-22/pos_iedb.csv')
iedb_neg_df = pd.read_csv(data_dir+'iedb-2022-02-22/neg_iedb.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [21]:
iedb_pos_df = iedb_pos_df[["cdr3.alpha", "cdr3.beta", "mhc.a", "antigen.epitope", "v.alpha", "v.beta", "j.alpha", "j.beta", "d.beta"]]
iedb_neg_df = iedb_neg_df[["cdr3.alpha", "cdr3.beta", "mhc.a", "antigen.epitope", "v.alpha", "v.beta", "j.alpha", "j.beta", "d.beta"]]

In [22]:
iedb_pos_df['label'] = 1
iedb_neg_df['label'] = 0

In [23]:
# mark negative source
iedb_pos_df["negative.source"] = np.nan
iedb_neg_df["negative.source"] = "iedb"

In [24]:
print_stats(pd.concat([iedb_pos_df, iedb_neg_df]))

Total samples: 150771

 With CDR3b + pep:  146979
Non-binding samples:  947
Binding samples:  146032

 With CDR3b + pep + MHC allele:  146189
Non-binding samples:  171
Binding samples:  146018

 With CDR3b + pep + CDR3a allele:  20739
Non-binding samples:  100
Binding samples:  20639

 With CDR3b + pep + CDR3a + MHC allele:  20727
Non-binding samples:  81
Binding samples:  20646


#  MIRA set

In [25]:
mira_df = pd.read_csv(data_dir+'mira/mira_eval_threshold100.csv', sep=';')

In [26]:
mira_df = mira_df.rename(columns={"CDR3": 'cdr3.beta', 'peptide': 'antigen.epitope', 'binder': 'label'})
mira_df = mira_df.drop_duplicates()
mira_df = mira_df.dropna()

In [27]:
# no gene information in the MIRA set
mira_df["v.alpha"] = np.nan
mira_df["v.beta"] = np.nan
mira_df["j.alpha"] = np.nan
mira_df["j.beta"] = np.nan
mira_df["d.beta"] = np.nan

In [28]:
# if CDR3 alpha and CDR3 beta do not start with `C`, add it
mcpas_df['cdr3.beta'] = mcpas_df['cdr3.beta'].apply(lambda x: 'C' + x if not pd.isnull(x) and x[0] != 'C' else x)

# filter for unclear/PTM epitopes
mcpas_df = mcpas_df.loc[(mcpas_df['antigen.epitope'].str.match('^[A-Z]+$') == True)].reset_index(drop=True)

In [29]:
mira_df["cdr3.alpha"] = np.nan
mira_df["mhc.a"] = np.nan

In [30]:
# mark negative source
mira_df["negative.source"] = np.nan
mira_df.loc[mira_df["label"] == 0, "negative.source"] = "mira"

In [31]:
print_stats(mira_df)

Total samples: 1152

 With CDR3b + pep:  1152
Non-binding samples:  960
Binding samples:  192

 With CDR3b + pep + MHC allele:  0
Non-binding samples:  0
Binding samples:  0

 With CDR3b + pep + CDR3a allele:  0
Non-binding samples:  0
Binding samples:  0

 With CDR3b + pep + CDR3a + MHC allele:  0
Non-binding samples:  0
Binding samples:  0


# Combine the datasets

In [32]:
df = pd.concat([vdjdb_df, mcpas_df, iedb_pos_df, iedb_neg_df, mira_df]).reset_index(drop=True)
len(df)

202451

In [33]:
df = df.drop_duplicates(subset=["cdr3.alpha", "cdr3.beta", "mhc.a", "antigen.epitope", "label"], keep="first").reset_index(drop=True)
len(df)

182350

In [34]:
# Remark: we observe that certain samples are labelled in an inconsistent way
# This is an argument for not considering negative samples from IEDB. but only randomized negative samples
q = df[df.duplicated(subset=["cdr3.alpha", "cdr3.beta", "mhc.a", "antigen.epitope"], keep=False)]
print(len(q))
# Example:
q[q["antigen.epitope"] == "EVLPFFLFF"]

504


Unnamed: 0,cdr3.alpha,cdr3.beta,mhc.a,antigen.epitope,v.alpha,j.alpha,v.beta,d.beta,j.beta,label,negative.source
179630,,CASSHQAGALYNEQFF,HLA-A*29:02,EVLPFFLFF,,,,,,1,
181193,,CASSHQAGALYNEQFF,HLA-A*29:02,EVLPFFLFF,,,,,,0,iedb


# Fix the HLAs

In [35]:
# remove the "mutant"
mutant_dict = {}
for mhc in df['mhc.a'].unique():
    if type(mhc) == str and "mutant" in mhc:
        mutant_dict[mhc] = mhc.split(" ")[0].replace(",", "")
df['mhc.a'] = df['mhc.a'].apply(lambda x: mutant_dict[x] if x in mutant_dict.keys() else x)

In [36]:
# trim HLA resolution when > 4
import re

match = "\d\d:\d\d:\d\d"
higher_resolution_dict = {}
for mhc in df['mhc.a'].unique():
    if type(mhc) == str:
        r = re.search(match, mhc)
        if r:
            s = mhc.split(":")
            higher_resolution_dict[mhc] = s[0]+":"+s[1]

df['mhc.a'] = df['mhc.a'].apply(lambda x: higher_resolution_dict[x] if x in higher_resolution_dict.keys() else x)

In [37]:
# duplicate rows when ther are multiple HLAs
multiple_hlas_dict = {}
for mhc in df['mhc.a'].unique():
    if type(mhc) == str:
        if "," in mhc:
            multiple_hlas_dict[mhc] = mhc.replace(" ", "").split(",")

rows_to_drop = []
peps = []
cdr3bs = []
cdr3as = []
mhcs = []
labels = []
neg_source = []
v_a =[]
v_b =[]
j_a =[]
j_b =[]
d_b =[]

for index, row in df.iterrows():
    if row['mhc.a'] in multiple_hlas_dict.keys():
        rows_to_drop.append(index)
        for mhc in multiple_hlas_dict[row['mhc.a']]:
            mhcs.append(mhc)
            peps.append(row["antigen.epitope"])
            cdr3bs.append(row["cdr3.beta"])
            cdr3as.append(row["cdr3.alpha"])
            labels.append(row["label"])
            neg_source.append(row["negative.source"])
            v_a.append(row["v.alpha"])
            v_b.append(row["v.beta"])
            j_a.append(row["j.alpha"])
            j_b.append(row["j.beta"])
            d_b.append(row["d.beta"])

df = df.drop(rows_to_drop)

df_to_add = pd.DataFrame({
    "antigen.epitope": peps,
    "mhc.a": mhcs,
    "cdr3.beta": cdr3bs,
    "cdr3.alpha": cdr3as,
    "label": labels,
    "negative.source": neg_source,
    "v.alpha": v_a,
    "v.beta": v_b,
    "j.alpha": j_a,
    "j.beta": j_b,
    "d.beta": d_b
})

df = pd.concat([df, df_to_add]).reset_index(drop=True)

In [38]:
# at this point, try to normalize the MHC allele
from mhcnames.normalization import normalize_allele_name

def normalize(x):
    try:
        return normalize_allele_name(x)
    except:
        return np.nan

df["mhc.a"] = df["mhc.a"].apply(lambda x: normalize(x))

# Add the reference ("pseudo") sequences to the dataframe

In [39]:
class1_df = pd.read_csv(data_dir+'mhc-sequences/MHC_pseudo.dat', sep="\s+|\t+|\s+\t+|\t+\s+")
class1_df["mhc"] = class1_df["mhc"].apply(lambda x: normalize(x))
class1_df = class1_df.dropna()
class1_dict = dict(zip(class1_df.mhc, class1_df.sequence))

class2_df = pd.read_csv(data_dir+'mhc-sequences/pseudosequence.2016.all.X.dat', sep="\t")
class2_df["mhc"] = class2_df["mhc"].apply(lambda x: normalize(x))
class2_df = class2_df.dropna()
class2_dict = dict(zip(class2_df.mhc, class2_df.sequence))

mhc_2_seq = {**class1_dict, **class2_dict}
df["mhc.seq"] = df["mhc.a"].apply(lambda x: mhc_2_seq[x] if x in mhc_2_seq.keys() else np.nan)

df = df.reset_index(drop=True).drop_duplicates(subset=["cdr3.alpha", "cdr3.beta", "mhc.seq", "mhc.a", "antigen.epitope", "label"])

  """Entry point for launching an IPython kernel.


In [40]:
print_stats(df, mhc="seq")

print("\n NaN MHC allele info: ", sum(df["mhc.a"].isna()))
print("Good MHC allele info: ", sum(~df["mhc.a"].isna()))

print("\n NaN MHC sequences: ", sum(df["mhc.seq"].isna()))
print("Good MHC sequences: ", sum(~df["mhc.seq"].isna()))

print("\n NaN CDR3 alpha sequences: ", sum(df["cdr3.alpha"].isna()))
print("Good CDR3 alpha sequences: ", sum(~df["cdr3.alpha"].isna()))

Total samples: 178541

 With CDR3b + pep:  168635
Non-binding samples:  1907
Binding samples:  166728

 With CDR3b + pep + MHC seq:  77879
Non-binding samples:  178
Binding samples:  77701

 With CDR3b + pep + CDR3a seq:  28794
Non-binding samples:  100
Binding samples:  28694

 With CDR3b + pep + CDR3a + MHC seq:  28450
Non-binding samples:  92
Binding samples:  28358

 NaN MHC allele info:  89948
Good MHC allele info:  88593

 NaN MHC sequences:  94002
Good MHC sequences:  84539

 NaN CDR3 alpha sequences:  149544
Good CDR3 alpha sequences:  28997


# Add negative samples from NetTCR-2.0 paper

In [41]:
neg_ab_df = pd.concat([
    pd.read_csv(data_dir+"nettcr2-paper/train_ab_90_alpha.csv"),
    pd.read_csv(data_dir+"nettcr2-paper/train_ab_90_alphabeta.csv"),
    pd.read_csv(data_dir+"nettcr2-paper/train_ab_90_beta.csv"),
    pd.read_csv(data_dir+"nettcr2-paper/train_ab_95_alpha.csv"),
    pd.read_csv(data_dir+"nettcr2-paper/train_ab_95_alphabeta.csv"),
    pd.read_csv(data_dir+"nettcr2-paper/train_ab_95_beta.csv"),
])

neg_ab_df = neg_ab_df[["CDR3a", "CDR3b", "peptide", "binder"]]

neg_ab_df = neg_ab_df.rename(columns={
    "CDR3a":"cdr3.alpha", "CDR3b":"cdr3.beta", "peptide":"antigen.epitope", "binder":"label"
})

neg_ab_df = neg_ab_df[neg_ab_df["label"] == 0].drop_duplicates().reset_index(drop=True)

# no gene information in the NetTCR-2.0 neg samples
neg_ab_df["v.alpha"] = np.nan
neg_ab_df["v.beta"] = np.nan
neg_ab_df["j.alpha"] = np.nan
neg_ab_df["j.beta"] = np.nan
neg_ab_df["d.beta"] = np.nan

In [42]:
# if CDR3 alpha and CDR3 beta do not start with `C`, add it
neg_ab_df['cdr3.alpha'] = neg_ab_df['cdr3.alpha'].apply(lambda x: 'C' + x if not pd.isnull(x) and x[0] != 'C' else x)
neg_ab_df['cdr3.beta'] = neg_ab_df['cdr3.beta'].apply(lambda x: 'C' + x if not pd.isnull(x) and x[0] != 'C' else x)

# filter for unclear/PTM epitopes
neg_ab_df = neg_ab_df.loc[(neg_ab_df['antigen.epitope'].str.match('^[A-Z]+$') == True)].reset_index(drop=True)

In [43]:
neg_b_df = pd.concat([
    pd.read_csv(data_dir+"nettcr2-paper/train_beta_90.csv", sep=";"),
    pd.read_csv(data_dir+"nettcr2-paper/train_beta_92.csv", sep=";"),
    pd.read_csv(data_dir+"nettcr2-paper/train_beta_94.csv", sep=";"),
    pd.read_csv(data_dir+"nettcr2-paper/train_beta_99.csv", sep=";"),
])

neg_b_df["cdr3.alpha"] = np.nan

neg_b_df = neg_b_df[["CDR3", "peptide", "binder"]]

neg_b_df = neg_b_df.rename(columns={
    "CDR3":"cdr3.beta", "peptide":"antigen.epitope", "binder":"label"
})

neg_b_df = neg_b_df[neg_b_df["label"] == 0].drop_duplicates().reset_index(drop=True)

In [44]:
# if CDR3 beta do not start with `C`, add it
neg_b_df['cdr3.beta'] = neg_b_df['cdr3.beta'].apply(lambda x: 'C' + x if not pd.isnull(x) and x[0] != 'C' else x)

# filter for unclear/PTM epitopes
neg_b_df = neg_b_df.loc[(neg_b_df['antigen.epitope'].str.match('^[A-Z]+$') == True)].reset_index(drop=True)

In [45]:
neg_df = pd.concat([neg_ab_df, neg_b_df])
neg_df = neg_df.drop_duplicates().reset_index(drop=True)

In [46]:
# in the NetTCR-2.0 paper, authors state peptides are all specific for HLA-A*02:01
neg_df["mhc.a"] = "HLA-A*02:01"
neg_df["mhc.seq"] = neg_df["mhc.a"].apply(lambda x: mhc_2_seq[x])

In [47]:
# mark negative samples
neg_df["negative.source"] = "nettcr-2.0"

In [48]:
print_stats(neg_df)

Total samples: 146880

 With CDR3b + pep:  124814
Non-binding samples:  124814
Binding samples:  0

 With CDR3b + pep + MHC allele:  124814
Non-binding samples:  124814
Binding samples:  0

 With CDR3b + pep + CDR3a allele:  45470
Non-binding samples:  45470
Binding samples:  0

 With CDR3b + pep + CDR3a + MHC allele:  45470
Non-binding samples:  45470
Binding samples:  0


# Length filtrations

In [49]:
def length_filtrations(df):
    # only consider peptides whith length <= 16
    df['epitope.len'] = df['antigen.epitope'].apply(lambda x: len(x))
    df = df[df['epitope.len'] <= 16].drop(columns=['epitope.len'])

    # only consider CDR3 beta with 9 <= length <= 23
    df['cdr3b.len'] = df['cdr3.beta'].apply(lambda x: len(x))
    df = df[df['cdr3b.len'] >= 9]
    df = df[df['cdr3b.len'] <= 23].drop(columns=['cdr3b.len'])

    # only consider CDR3 alpha with with 7 <= length <= 21
    df['cdr3a.len'] = df['cdr3.alpha'].apply(lambda x: len(x) if type(x) is str else x)
    lower_mask = operator.or_(df['cdr3a.len'] >= 7, df['cdr3.alpha'].isna())
    df = df[lower_mask]
    upper_mask = operator.or_(df['cdr3a.len'] <= 20, df['cdr3.alpha'].isna())
    df = df[upper_mask]
    df = df.drop(columns=['cdr3a.len'])
    return df

In [50]:
df = length_filtrations(df)
neg_df = length_filtrations(neg_df)

# Add MHC (when NaN) using the IEDB ligand file

In [51]:
ligand = pd.read_csv(data_dir+'mhc-sequences/mhc_ligand_full.csv.zip')

# in the ligand file, mapping peptide <-> mhc might not be 1-to-1, we keep the first
ligand = ligand.drop_duplicates(subset=["Epitope.2"], keep="first")

ligand["MHC"] = ligand["MHC"].apply(lambda x: normalize(x))

ligand = ligand.dropna(subset=["Epitope.2", "MHC"])
pep_2_mhc_allele = dict(zip(ligand["Epitope.2"], ligand["MHC"]))

# add a column which specifies the origin of the MHC sequence
df["mhc.source"] = np.nan
df.loc[~df["mhc.a"].isna(), "mhc.source"] = "assay"

def infer_mhc_allele(x, pep_2_mhc_map):
    if type(x["mhc.a"]) != str:
        if x["antigen.epitope"] in pep_2_mhc_map.keys():
            return pep_2_mhc_map[x["antigen.epitope"]]
        else:
            return np.nan
    return x["mhc.a"]

def mhc_source(x, pep_2_mhc_map):
    if type(x["mhc.a"]) != str:
        if x["antigen.epitope"] in pep_2_mhc_map.keys():
            return "iedb-ligand-file"
        else:
            return np.nan
    return x["mhc.source"]

df["mhc.source"] = df.apply(mhc_source, pep_2_mhc_map=pep_2_mhc_allele, axis=1)
df["mhc.a"] = df.apply(infer_mhc_allele, pep_2_mhc_map=pep_2_mhc_allele, axis=1)
df["mhc.seq"] = df["mhc.a"].apply(lambda x: mhc_2_seq[x] if x in mhc_2_seq.keys() else np.nan)

df = df.drop_duplicates().reset_index(drop=True)

  interactivity=interactivity, compiler=compiler, result=result)


# Add negative samples via randomization

In [52]:
def sample_negatives(source_df):
    source_p_mhc = source_df[["antigen.epitope", "mhc.a", "mhc.seq"]]
    source_cdr3b = source_df["cdr3.beta"].dropna()
    source_cdr3a = source_df["cdr3.alpha"].dropna()
    
    beta_gene_df = source_df[["cdr3.beta", "j.beta", "v.beta", "d.beta"]].drop_duplicates()
    cdr3b_2_jb = dict(zip(beta_gene_df["cdr3.beta"], beta_gene_df["j.beta"]))
    cdr3b_2_vb = dict(zip(beta_gene_df["cdr3.beta"], beta_gene_df["v.beta"]))
    cdr3b_2_db = dict(zip(beta_gene_df["cdr3.beta"], beta_gene_df["d.beta"]))

    alpha_gene_df = source_df[["cdr3.alpha", "j.alpha", "v.alpha"]].drop_duplicates()
    cdr3a_2_ja = dict(zip(source_df["cdr3.alpha"], source_df["j.alpha"]))
    cdr3a_2_va = dict(zip(source_df["cdr3.alpha"], source_df["v.alpha"]))

    # sample negative samples, so that we have 2x negatives w.r.t. positives
    N = 2
    
    temp_df = pd.concat([
        source_p_mhc.sample(n=len(source_df), replace=False)
        for i in range(N)
    ])
    
    if len(source_cdr3a) > 0:
        temp_df["cdr3.alpha"] = np.concatenate([
            np.random.choice(source_cdr3a, len(source_df), replace=False)
            for i in range(N)
        ], axis=0)
    else:
        temp_df["cdr3.alpha"] = np.nan

    temp_df["cdr3.beta"] = np.concatenate([
        np.random.choice(source_cdr3b, len(source_df), replace=False)
        for i in range(N)
    ], axis=0)

    # this step ensures that the randomization did not create random samples,
    # which are equal to the positive ones
#     len_pre = len(temp_df)
#     temp = source_df[["antigen.epitope", "mhc.a", "mhc.seq", "cdr3.alpha", "cdr3.beta"]]
#     temp = source_df[["antigen.epitope", "mhc.a", "mhc.seq", "cdr3.beta"]]
#     temp = source_df[["antigen.epitope", "cdr3.beta"]]
#     temp_df = pd.merge(temp_df, temp, indicator=True, how='outer').query('_merge=="left_only"').drop('_merge', axis=1)
#     print(f"Dropping {len_pre - len(temp_df)} samples, which have positive (pep,CDR3b) pairs")

    # assign negative label
    temp_df["label"] = 0

    # mark negative samples
    temp_df["negative.source"] = "randomized"
    
    # assign V, D, J genes
    temp_df["v.alpha"] = temp_df["cdr3.alpha"].apply(lambda x: cdr3a_2_va.get(x, np.nan))
    temp_df["j.alpha"] = temp_df["cdr3.alpha"].apply(lambda x: cdr3a_2_ja.get(x, np.nan))
    temp_df["v.beta"] = temp_df["cdr3.beta"].apply(lambda x: cdr3b_2_vb[x])
    temp_df["j.beta"] = temp_df["cdr3.beta"].apply(lambda x: cdr3b_2_jb[x])
    temp_df["d.beta"] = temp_df["cdr3.beta"].apply(lambda x: cdr3b_2_db[x])

    return temp_df

In [53]:
samp_neg_df = []

# we only sample negatives starting from positive samples
pos_df = df[df["label"] == 1]

# case 1: cdr3 alpha and mhc are available
source_df = pos_df.dropna(subset=["cdr3.alpha", "mhc.seq"])
temp_df = sample_negatives(source_df)
samp_neg_df.append(temp_df)

# case 2: cdr3 alpha available, mhc unknown
source_df = pos_df.dropna(subset=["cdr3.alpha"])
source_df = source_df[source_df["mhc.seq"].isna()]
temp_df = sample_negatives(source_df)
samp_neg_df.append(temp_df)

# case 3: cdr3 alpha unknown, mhc is available
source_df = pos_df.dropna(subset=["mhc.seq"])
source_df = source_df[source_df["cdr3.alpha"].isna()]
temp_df = sample_negatives(source_df)
samp_neg_df.append(temp_df)

# case 3: cdr3 alpha unknown, mhc unknown
source_df = pos_df[pos_df["mhc.seq"].isna()]
source_df = source_df[source_df["cdr3.alpha"].isna()]
temp_df = sample_negatives(source_df)
samp_neg_df.append(temp_df)

samp_neg_df = pd.concat(samp_neg_df)
samp_neg_df = samp_neg_df.drop_duplicates(
    subset=['cdr3.alpha', 'cdr3.beta', 'mhc.a', 'antigen.epitope', 'label',
       'negative.source', 'mhc.seq']
).reset_index(drop=True)  # we exclude genes in checking for duplicates

# this step ensures that the randomization did not create random samples,
# which are equal to the positive ones
len_pre = len(samp_neg_df)
temp = df[["antigen.epitope", "cdr3.beta"]]
samp_neg_df = pd.merge(samp_neg_df, temp, indicator=True, how='outer').query('_merge=="left_only"').drop('_merge', axis=1)
print(f"Dropping {len_pre - len(samp_neg_df)} randomized negative samples, which present binding (pep,CDR3b) pairs.\n")

print_stats(samp_neg_df, "seq")

Dropping 31714 randomized negative samples, which present binding (pep,CDR3b) pairs.

Total samples: 265432

 With CDR3b + pep:  259171
Non-binding samples:  259171
Binding samples:  0

 With CDR3b + pep + MHC seq:  174311
Non-binding samples:  174311
Binding samples:  0

 With CDR3b + pep + CDR3a seq:  37061
Non-binding samples:  37061
Binding samples:  0

 With CDR3b + pep + CDR3a + MHC seq:  36452
Non-binding samples:  36452
Binding samples:  0


In [54]:
df = pd.concat([df, neg_df, samp_neg_df]).drop_duplicates().reset_index(drop=True)

# Save

In [55]:
df.to_csv(data_dir+"tc-hard/ds-vdj-genes.csv", index=False)

# Stats - considering full dataset

In [56]:
def check_duplicates(df, mhc="a"):
    print("Samples: ", len(df))

    q = df.drop_duplicates(subset=['cdr3.alpha', 'cdr3.beta', 'antigen.epitope', f'mhc.{mhc}'])
    print("Dropping duplicated p+b+a+mhc: ", len(q))
    p = df.drop_duplicates(subset=['cdr3.alpha', 'cdr3.beta', 'antigen.epitope', f'mhc.{mhc}', 'label'])
    print("Dropping duplicated p+b+a+mhc+label: ", len(p))
    
    q = df.drop_duplicates(subset=['cdr3.beta', 'antigen.epitope', f'mhc.{mhc}'])
    print("Dropping duplicated p+b+mhc: ", len(q))
    p = df.drop_duplicates(subset=['cdr3.beta', 'antigen.epitope', f'mhc.{mhc}', 'label'])
    print("Dropping duplicated p+b+mhc+label: ", len(p))
    
    
    q = df.drop_duplicates(subset=['cdr3.alpha', 'cdr3.beta', 'antigen.epitope'])
    print("Dropping duplicated p+b+a: ", len(q))
    p = df.drop_duplicates(subset=['cdr3.alpha', 'cdr3.beta', 'antigen.epitope', 'label'])
    print("Dropping duplicated p+b+a+label: ", len(p))

    q = df.drop_duplicates(subset=['cdr3.beta', 'antigen.epitope'])
    print("Dropping duplicated p+b: ", len(q))
    p = df.drop_duplicates(subset=['cdr3.beta', 'antigen.epitope', 'label'])
    print("Dropping duplicated p+b+label: ", len(p))

In [57]:
print_stats(df, mhc="seq")

print("\n NaN MHC allele info: ", sum(df["mhc.a"].isna()))
print("Good MHC allele info: ", sum(~df["mhc.a"].isna()))

print("\n NaN MHC sequences: ", sum(df["mhc.seq"].isna()))
print("Good MHC sequences: ", sum(~df["mhc.seq"].isna()))

print("\n NaN CDR3 alpha sequences: ", sum(df["cdr3.alpha"].isna()))
print("Good CDR3 alpha sequences: ", sum(~df["cdr3.alpha"].isna()))

Total samples: 566218

 With CDR3b + pep:  528020
Non-binding samples:  385776
Binding samples:  142244

 With CDR3b + pep + MHC seq:  400397
Non-binding samples:  300168
Binding samples:  100229

 With CDR3b + pep + CDR3a seq:  111041
Non-binding samples:  82631
Binding samples:  28410

 With CDR3b + pep + CDR3a + MHC seq:  110266
Non-binding samples:  82037
Binding samples:  28229

 NaN MHC allele info:  120244
Good MHC allele info:  445974

 NaN MHC sequences:  130817
Good MHC sequences:  435401

 NaN CDR3 alpha sequences:  454984
Good CDR3 alpha sequences:  111234


In [58]:
# here we accept inconsistent labelling
# as shown above there are some assays which present the same sequences, but with different labels
check_duplicates(df, mhc="seq")

Samples:  566218
Dropping duplicated p+b+a+mhc:  565882
Dropping duplicated p+b+a+mhc+label:  566148
Dropping duplicated p+b+mhc:  530802
Dropping duplicated p+b+mhc+label:  531143
Dropping duplicated p+b+a:  562817
Dropping duplicated p+b+a+label:  563054
Dropping duplicated p+b:  527707
Dropping duplicated p+b+label:  528020


# Stats - considering negatives only from negative assays

In [59]:
df_only_neg_assays = df[df["negative.source"] != "randomized"]

print_stats(df_only_neg_assays, mhc="seq")

print("\n NaN MHC allele info: ", sum(df_only_neg_assays["mhc.a"].isna()))
print("Good MHC allele info: ", sum(~df_only_neg_assays["mhc.a"].isna()))

print("\n NaN MHC sequences: ", sum(df_only_neg_assays["mhc.seq"].isna()))
print("Good MHC sequences: ", sum(~df_only_neg_assays["mhc.seq"].isna()))

print("\n NaN CDR3 alpha sequences: ", sum(df_only_neg_assays["cdr3.alpha"].isna()))
print("Good CDR3 alpha sequences: ", sum(~df_only_neg_assays["cdr3.alpha"].isna()))

Total samples: 300786

 With CDR3b + pep:  268961
Non-binding samples:  126717
Binding samples:  142244

 With CDR3b + pep + MHC seq:  226198
Non-binding samples:  125969
Binding samples:  100229

 With CDR3b + pep + CDR3a seq:  73980
Non-binding samples:  45570
Binding samples:  28410

 With CDR3b + pep + CDR3a + MHC seq:  73814
Non-binding samples:  45585
Binding samples:  28229

 NaN MHC allele info:  42217
Good MHC allele info:  258569

 NaN MHC sequences:  45873
Good MHC sequences:  254913

 NaN CDR3 alpha sequences:  226613
Good CDR3 alpha sequences:  74173


In [60]:
check_duplicates(df_only_neg_assays, mhc="seq")

Samples:  300786
Dropping duplicated p+b+a+mhc:  300496
Dropping duplicated p+b+a+mhc+label:  300762
Dropping duplicated p+b+mhc:  271714
Dropping duplicated p+b+mhc+label:  272055
Dropping duplicated p+b+a:  297456
Dropping duplicated p+b+a+label:  297693
Dropping duplicated p+b:  268648
Dropping duplicated p+b+label:  268961


# Stats - considering negatives only from randomization

In [61]:
df = df[df["negative.source"] != "mira"]
df = df[df["negative.source"] != "iedb"]
df = df[df["negative.source"] != "nettcr-2.0"]

print_stats(df, mhc="seq")

print("\n NaN MHC allele info: ", sum(df["mhc.a"].isna()))
print("Good MHC allele info: ", sum(~df["mhc.a"].isna()))

print("\n NaN MHC sequences: ", sum(df["mhc.seq"].isna()))
print("Good MHC sequences: ", sum(~df["mhc.seq"].isna()))

print("\n NaN CDR3 alpha sequences: ", sum(df["cdr3.alpha"].isna()))
print("Good CDR3 alpha sequences: ", sum(~df["cdr3.alpha"].isna()))

Total samples: 417390

 With CDR3b + pep:  401415
Non-binding samples:  259171
Binding samples:  142244

 With CDR3b + pep + MHC seq:  274540
Non-binding samples:  174311
Binding samples:  100229

 With CDR3b + pep + CDR3a seq:  65471
Non-binding samples:  37061
Binding samples:  28410

 With CDR3b + pep + CDR3a + MHC seq:  64681
Non-binding samples:  36452
Binding samples:  28229

 NaN MHC allele info:  119468
Good MHC allele info:  297922

 NaN MHC sequences:  130041
Good MHC sequences:  287349

 NaN CDR3 alpha sequences:  351754
Good CDR3 alpha sequences:  65636


In [62]:
check_duplicates(df, mhc="seq")

Samples:  417390
Dropping duplicated p+b+a+mhc:  417371
Dropping duplicated p+b+a+mhc+label:  417371
Dropping duplicated p+b+mhc:  404510
Dropping duplicated p+b+mhc+label:  404510
Dropping duplicated p+b+a:  414306
Dropping duplicated p+b+a+label:  414306
Dropping duplicated p+b:  401415
Dropping duplicated p+b+label:  401415


In [63]:
# check no (pep, CDR3b) pairs with both neg and pos labels
q = df[df.duplicated(subset=["antigen.epitope", "cdr3.beta", ], keep=False)]
p = q[q.label==1]
n = q[q.label==0]
p_b_inconsistent = pd.merge(p, n, on=["antigen.epitope", "cdr3.beta"], how='inner')
assert len(p_b_inconsistent) == 0

In [64]:
# considering positive samples + randomized negative samples (i.e. excluding real 
# negative samples), we want check that all CDR3b sequences which appear
# in the negative samples are also appearing in the positive ones
def check_no_beta_only_in_neg(df):
    # this check only concerns the positive + randomized negative samples
    # we exclude the real negatives
    t = df[df["negative.source"] != "mira"]
    t = t[t["negative.source"] != "iedb"]
    t = t[t["negative.source"] != "nettcr-2.0"]

    b_n = set(df_pep_b[df_pep_b.label==0]["cdr3.beta"].unique())
    b_p = set(df_pep_b[df_pep_b.label==1]["cdr3.beta"].unique())
    assert len(b_n - b_p) == 0

df_pep_b = df[["antigen.epitope", "cdr3.beta", "label", "negative.source"]].copy()
check_no_beta_only_in_neg(df_pep_b)

df_pep_b_mhc = df[["antigen.epitope", "cdr3.beta", "mhc.seq", "label", "negative.source"]].dropna().copy()
check_no_beta_only_in_neg(df_pep_b_mhc)

df_pep_b_a = df[["antigen.epitope", "cdr3.beta", "cdr3.alpha", "label", "negative.source"]].dropna().copy()
check_no_beta_only_in_neg(df_pep_b_a)

df_pep_b_a_mhc = df[["antigen.epitope", "cdr3.beta", "cdr3.alpha", "mhc.seq", "label", "negative.source"]].dropna().copy()
check_no_beta_only_in_neg(df_pep_b_a_mhc)