# TCRBert: Improves Prediction of Epitope-specific TCR recognition using Pre-trained Protein Embeddings.

## Global configurations

In [53]:
import logging
import logging.config
import os
import sys
import warnings
from enum import auto
import pandas as pd

# Project layout. The root can be overridden with the TCRBERT_ROOT environment
# variable; the default is the original hard-coded checkout location.
rootdir = os.environ.get('TCRBERT_ROOT', '/home/hym/trunk/TCRBert')
workdir = '%s/notebook' % rootdir
datadir = '%s/data' % rootdir
srcdir = '%s/tcrbert' % rootdir
outdir = '%s/output' % rootdir

# Relative paths used in this notebook (e.g. '../config/logging.conf') are
# resolved against workdir.
os.chdir(workdir)

# Make the project importable. Only append entries that are missing:
# re-running this cell in a live kernel would otherwise add the same
# directories to sys.path again on every execution.
for _path in (rootdir, srcdir):
    if _path not in sys.path:
        sys.path.append(_path)

from tcrbert.dataset import CN, get_index

# Display (canonical option names instead of regex-matched 'display.max.rows')
pd.set_option('display.max_rows', 999)
pd.set_option('display.max_columns', 999)

# Logger
# NOTE(review): this silences every warning for the whole session, including
# pandas deprecation/copy warnings raised by the cells below - confirm intended.
warnings.filterwarnings('ignore')
logging.config.fileConfig('../config/logging.conf')
logger = logging.getLogger('tcrbert')

# The result of this call is discarded; only the last expression is displayed.
CN.values()
sys.path

['/home/hym/trunk/TCRBert/notebook',
 '/home/hym/trunk/TCRBert/notebook',
 '/home/hym/trunk/epidab',
 '/home/hym/trunk/epidab/epidab',
 '/home/hym/trunk/kvacc',
 '/home/hym/trunk/kvacc/kvacc',
 '/home/hym/trunk/lib',
 '/home/hym/trunk',
 '/home/hym/anaconda3/envs/py37-torch/lib/python37.zip',
 '/home/hym/anaconda3/envs/py37-torch/lib/python3.7',
 '/home/hym/anaconda3/envs/py37-torch/lib/python3.7/lib-dynload',
 '',
 '/home/hym/anaconda3/envs/py37-torch/lib/python3.7/site-packages',
 '/home/hym/anaconda3/envs/py37-torch/lib/python3.7/site-packages/IPython/extensions',
 '/home/hym/.ipython',
 '/home/hym/trunk/TCRBert',
 '/home/hym/trunk/TCRBert/tcrbert',
 '/home/hym/trunk/TCRBert',
 '/home/hym/trunk/TCRBert/tcrbert',
 '/home/hym/trunk/TCRBert',
 '/home/hym/trunk/TCRBert/tcrbert']

## Training Datasets

### Positive dataset

#### Dash data
- Contains epitope-specific paired TCRα and TCRβ chains for three epitopes from humans and seven epitopes from mice [{Dash:2017go}].

In [2]:
import pandas as pd
# `display` is part of the public IPython.display API; importing it from
# IPython.core.display is deprecated.
from IPython.display import display
from collections import OrderedDict

# Epitope gene name (as used in the Dash 'epitope' column)
#   -> (source organism, epitope peptide sequence, restricting MHC allele)
gene_info_map = OrderedDict({
    'BMLF': ('EBV', 'GLCTLVAML', 'HLA-A*02:01'),
    'pp65': ('CMV', 'NLVPMVATV', 'HLA-A*02:01'),
    'M1': ('IAV', 'GILGFVFTL', 'HLA-A*02:01'),
    'F2': ('IAV', 'LSLRNPILV', 'H2-Db'),
    'NP': ('IAV', 'ASNENMETM', 'H2-Db'),
    'PA': ('IAV', 'SSLENFRAYV', 'H2-Db'),
    'PB1': ('IAV', 'SSYRRPVGI', 'H2-Kb'),
    'm139': ('mCMV', 'TVYGFCLL', 'H2-Kb'),
    'M38': ('mCMV', 'SSPPMFRV', 'H2-Kb'),
    'M45': ('mCMV', 'HGIRNASFI', 'H2-Db'),
})

fn_dash = '%s/Dash/human_mouse_pairseqs_v1_parsed_seqs_probs_mq20_clones.tsv' % datadir
df = pd.read_table(fn_dash, sep='\t')
df = df.dropna(subset=['epitope', 'cdr3b'])

# The raw 'epitope' column holds the gene name. Copy it to the gene column
# BEFORE the epitope column is overwritten with the peptide sequence below.
df[CN.epitope_gene] = df['epitope']
df[CN.epitope_species] = df[CN.epitope_gene].map(lambda x: gene_info_map[x][0])
df[CN.epitope] = df[CN.epitope_gene].map(lambda x: gene_info_map[x][1])
df[CN.mhc] = df[CN.epitope_gene].map(lambda x: gene_info_map[x][2])
df[CN.species] = df['subject'].map(lambda x: 'human' if 'human' in x else 'mouse')
df[CN.cdr3b] = df['cdr3b'].str.strip()
df[CN.source] = 'Dash'
df[CN.label] = 1
df = df.loc[:, CN.values()]

# Index every row by the key produced by get_index (rendered as
# '<epitope>_<cdr3b>' in the output below) and keep the first row per key.
df.index = df.apply(get_index, axis=1)
df = df[~df.index.duplicated()]
display(df.head(), df.shape)

# Per-gene summary. Epitope and species are read from gene_info_map (the same
# values the columns above were derived from), so a gene with no remaining
# rows is reported with n_cdr3b 0 instead of raising IndexError on .iat[0].
for i, (gene, (species, epitope, _mhc)) in enumerate(gene_info_map.items()):
    subdf = df[df[CN.epitope_gene] == gene]
    n_cdr3b = subdf.shape[0]
    print('%s: Epitope: %s, species: %s, gene: %s, n_cdr3b: %s' % (i + 1, epitope, species, gene, n_cdr3b))

df_dash = df

Unnamed: 0,epitope,epitope_gene,epitope_species,species,cdr3b,mhc,source,label
GLCTLVAML_CASSLWTGSHEQYF,GLCTLVAML,BMLF,EBV,human,CASSLWTGSHEQYF,HLA-A*02:01,Dash,1
GLCTLVAML_CASSQSPGGEQYF,GLCTLVAML,BMLF,EBV,human,CASSQSPGGEQYF,HLA-A*02:01,Dash,1
GLCTLVAML_CASSLTTEQQFF,GLCTLVAML,BMLF,EBV,human,CASSLTTEQQFF,HLA-A*02:01,Dash,1
GLCTLVAML_CSARDATGNGYTF,GLCTLVAML,BMLF,EBV,human,CSARDATGNGYTF,HLA-A*02:01,Dash,1
GLCTLVAML_CSARDSTGNGYTF,GLCTLVAML,BMLF,EBV,human,CSARDSTGNGYTF,HLA-A*02:01,Dash,1


(1408, 8)

1: Epitope: GLCTLVAML, species: EBV, gene: BMLF, n_cdr3b: 54
2: Epitope: NLVPMVATV, species: CMV, gene: pp65, n_cdr3b: 54
3: Epitope: GILGFVFTL, species: IAV, gene: M1, n_cdr3b: 150
4: Epitope: LSLRNPILV, species: IAV, gene: F2, n_cdr3b: 102
5: Epitope: ASNENMETM, species: IAV, gene: NP, n_cdr3b: 149
6: Epitope: SSLENFRAYV, species: IAV, gene: PA, n_cdr3b: 230
7: Epitope: SSYRRPVGI, species: IAV, gene: PB1, n_cdr3b: 335
8: Epitope: TVYGFCLL, species: mCMV, gene: m139, n_cdr3b: 73
9: Epitope: SSPPMFRV, species: mCMV, gene: M38, n_cdr3b: 60
10: Epitope: HGIRNASFI, species: mCMV, gene: M45, n_cdr3b: 201


#### VDJdb

- We extend the positive dataset with a new data set from [VDJdb](https://vdjdb.cdr3.net) [{Bagaev:2019hf}] (downloaded May 2021), a manually curated database of TCR sequences with known antigen specificity. Every entry in VDJdb is assigned a confidence score between 0 and 3 (0: critical information missing, 1: medium confidence, 2: high confidence, 3: very high confidence).
- Confidence score가 최소한 1 이상인 entry를 선택하였다.
- MHC-I restricted 에피토프들만 선택하였다.

In [3]:
fn_vdjdb = '%s/VDJdb/vdjdb_20210201.txt' % datadir
################################

# Load the VDJdb export and narrow it step by step, logging the shape after
# every filter so the effect of each step is visible in the cell output.
df = pd.read_table(fn_vdjdb, sep='\t', header=0)
logger.debug('Current df.shape: %s' % str(df.shape))

# Select beta CDR3 sequence
logger.debug('Select beta CDR3 sequences and MHC-I restricted epitopes')
df = df[(df['gene'] == 'TRB') & (df['mhc.class'] == 'MHCI')]
logger.debug('Current df.shape: %s' % str(df.shape))

# Check valid CDR3 and peptide sequences
logger.debug('Select valid CDR3 and epitope sequences')
df = df.dropna(subset=['cdr3', 'antigen.epitope'])
logger.debug('Current df.shape: %s' % str(df.shape))

logger.debug('Select confidence score > 0')
# Vectorized comparison; .copy() so that the column assignments below write
# to an independent frame rather than to a slice of the previous one.
df = df[df['vdjdb.score'] > 0].copy()
logger.debug('Current df.shape: %s' % str(df.shape))

# Map the VDJdb columns onto the common column names (CN).
df[CN.epitope] = df['antigen.epitope'].str.strip()
df[CN.epitope_species] = df['antigen.species']
df[CN.epitope_gene] = df['antigen.gene']
df[CN.species] = df['species']
df[CN.cdr3b] = df['cdr3'].str.strip()
# MHC allele names are kept exactly as given in the 'mhc.a' column.
df[CN.mhc] = df['mhc.a']
df[CN.source] = 'VDJdb'
df[CN.label] = 1

# Index rows by the key from get_index and keep the first row per key.
df.index = df.apply(get_index, axis=1)
logger.debug('Drop duplicates with the same{epitope, CDR3b}')
df = df[~df.index.duplicated()]
logger.debug('Current df.shape: %s' % str(df.shape))

df = df.loc[:, CN.values()]

# No per-epitope minimum is applied here; the cutoff is applied once, after
# all positive sources have been merged.

display(df.head(), df.shape)

# Group by a scalar key, not a one-element list: with a list key, newer pandas
# versions yield 1-tuples, which would break len(epitope) and the printed text.
for i, (epitope, subdf) in enumerate(df.groupby(CN.epitope)):
    epitope_gene = subdf[CN.epitope_gene].unique()
    epitope_species = subdf[CN.epitope_species].iat[0]
    n_cdr3b = subdf.shape[0]

    print(
        '%s: Epitope: %s(%s), speices: %s, gene: %s, n_cdr3b: %s' %
        (i + 1, epitope, len(epitope), epitope_species, epitope_gene, n_cdr3b))

df_vdjdb = df

2021-05-23 00:52:08 [DEBUG]: Current df.shape: (78701, 21)
2021-05-23 00:52:08 [DEBUG]: Select beta CDR3 sequences and MHC-I restricted epitopes
2021-05-23 00:52:08 [DEBUG]: Current df.shape: (44797, 21)
2021-05-23 00:52:08 [DEBUG]: Select valid CDR3 and epitope sequences
2021-05-23 00:52:08 [DEBUG]: Current df.shape: (44797, 21)
2021-05-23 00:52:08 [DEBUG]: Select confidence score > 0
2021-05-23 00:52:08 [DEBUG]: Current df.shape: (6659, 21)
2021-05-23 00:52:08 [DEBUG]: Drop duplicates with the same{epitope, CDR3b}
2021-05-23 00:52:08 [DEBUG]: Current df.shape: (4275, 28)


Unnamed: 0,epitope,epitope_gene,epitope_species,species,cdr3b,mhc,source,label
FLKEKGGL_CASSYLPGQGDHYSNQPQHF,FLKEKGGL,Nef,HIV-1,HomoSapiens,CASSYLPGQGDHYSNQPQHF,HLA-B*08,VDJdb,1
FLKEKGGL_CASSFEAGQGFFSNQPQHF,FLKEKGGL,Nef,HIV-1,HomoSapiens,CASSFEAGQGFFSNQPQHF,HLA-B*08,VDJdb,1
FLKEKGGL_CASSFEPGQGFYSNQPQHF,FLKEKGGL,Nef,HIV-1,HomoSapiens,CASSFEPGQGFYSNQPQHF,HLA-B*08,VDJdb,1
FLKEKGGL_CASSYEPGQVSHYSNQPQHF,FLKEKGGL,Nef,HIV-1,HomoSapiens,CASSYEPGQVSHYSNQPQHF,HLA-B*08,VDJdb,1
FLKEKGGL_CASSALASLNEQFF,FLKEKGGL,Nef,HIV-1,HomoSapiens,CASSALASLNEQFF,HLA-B*08,VDJdb,1


(4275, 8)

1: Epitope: AAFKRSCLK(9), speices: MCPyV, gene: ['T-Ag'], n_cdr3b: 5
2: Epitope: AAGIGILTV(9), speices: HomoSapiens, gene: ['MLANA'], n_cdr3b: 2
3: Epitope: ALDPHSGHFV(10), speices: HomoSapiens, gene: ['CDK4'], n_cdr3b: 4
4: Epitope: ALGIGILTV(9), speices: HomoSapiens, gene: ['MLANA'], n_cdr3b: 1
5: Epitope: ALSPVIPHI(9), speices: HomoSapiens, gene: ['MLL2'], n_cdr3b: 2
6: Epitope: ALTPVVVTL(9), speices: HomoSapiens, gene: ['CDK4'], n_cdr3b: 6
7: Epitope: ALWGPDPAAA(10), speices: HomoSapiens, gene: ['INS'], n_cdr3b: 1
8: Epitope: ALYGFVPVL(9), speices: HomoSapiens, gene: ['GANAB'], n_cdr3b: 6
9: Epitope: APARLERRHSA(11), speices: HomoSapiens, gene: ['KIF16B'], n_cdr3b: 3
10: Epitope: APRGPHGGAASGL(13), speices: HomoSapiens, gene: ['NY-ESO-1' 'NY-ESO'], n_cdr3b: 5
11: Epitope: ARMILMTHF(9), speices: HCV, gene: ['NS5B'], n_cdr3b: 14
12: Epitope: ASNENMETM(9), speices: InfluenzaA, gene: ['NP'], n_cdr3b: 69
13: Epitope: ATDALMTGY(9), speices: HCV, gene: ['NS3'], n_cdr3b: 135
14: Epitope: A

154: Epitope: NMMWFQGQL(9), speices: synthetic, gene: ['synthetic'], n_cdr3b: 2
155: Epitope: NVEYYDIKL(9), speices: HomoSapiens, gene: ['KLHL7'], n_cdr3b: 1
156: Epitope: QASQEVKNW(9), speices: HIV-1, gene: ['Gag'], n_cdr3b: 8
157: Epitope: QIKVRVDMV(9), speices: CMV, gene: ['IE1'], n_cdr3b: 7
158: Epitope: QIKVRVKMV(9), speices: CMV, gene: ['IE1'], n_cdr3b: 11
159: Epitope: QLCDVMFYL(9), speices: HomoSapiens, gene: ['BRAP'], n_cdr3b: 1
160: Epitope: QVPLRPMTYK(10), speices: HIV-1, gene: ['Nef'], n_cdr3b: 33
161: Epitope: QYDPVAALF(9), speices: CMV, gene: ['pp65'], n_cdr3b: 11
162: Epitope: RAKFKQLL(8), speices: EBV, gene: ['BZLF1'], n_cdr3b: 155
163: Epitope: RALEYKNL(8), speices: MCMV, gene: ['IE3'], n_cdr3b: 23
164: Epitope: RFPLTFGWCF(10), speices: HIV-1, gene: ['Nef'], n_cdr3b: 1
165: Epitope: RGYVYDGL(8), speices: VSV, gene: ['N'], n_cdr3b: 3
166: Epitope: RGYVYEGL(8), speices: VSV, gene: ['N'], n_cdr3b: 14
167: Epitope: RGYVYKGL(8), speices: VSV, gene: ['N'], n_cdr3b: 3
168: Ep

#### McPAS-TCR

- McPAS-TCR(http://friedmanlab.weizmann.ac.il/McPAS-TCR/) is a manually curated catalog of pathology-associated TCR sequences[{Tickotsky:2017bo}].
- MHC-I restricted 에피토프들을 선택하였다.

In [5]:
# Print the first line (the CSV header) of the McPAS-TCR export to see which columns are available.
!head -n 1 ../data/McPAS/McPAS-TCR_20210521.csv

"CDR3.alpha.aa","CDR3.beta.aa","Species","Category","Pathology","Pathology.Mesh.ID","Additional.study.details","Antigen.identification.method","Single.cell","NGS","Antigen.protein","Protein.ID","Epitope.peptide","Epitope.ID","MHC","Tissue","T.Cell.Type","T.cell.characteristics","CDR3.alpha.nt","TRAV","TRAJ","TRBV","TRBD","TRBJ","Reconstructed.J.annotation","CDR3.beta.nt","Mouse.strain","PubMed.ID","Remarks"


In [6]:
import numpy as np

fn_mcpas = '%s/McPAS/McPAS-TCR_20210521.csv' % datadir
################################

# Load the McPAS-TCR export and narrow it step by step, logging the shape
# after every filter.
df = pd.read_csv(fn_mcpas)
logger.debug('Current df.shape: %s' % str(df.shape))

# Select valid beta CDR3 sequence and epitope sequence
logger.debug('Select valid beta CDR3 and epitope sequences')
# .copy() so that the column assignments below write to an independent frame.
df = df.dropna(subset=['CDR3.beta.aa', 'Epitope.peptide']).copy()
logger.debug('Current df.shape: %s' % str(df.shape))

# Map the McPAS columns onto the common column names (CN).
df[CN.epitope] = df['Epitope.peptide'].str.strip()
df[CN.epitope_gene] = None  # no source column is mapped to the gene here
df[CN.epitope_species] = df['Pathology']
df[CN.species] = df['Species']
df[CN.cdr3b] = df['CDR3.beta.aa'].str.strip()
df[CN.mhc] = df['MHC'].str.strip()
df[CN.source] = 'McPAS'
df[CN.label] = 1

df.index = df.apply(get_index, axis=1)

logger.debug('Select MHC-I restricted entries')
# Keep rows that have an MHC annotation not containing DR/DP/DQ.
# na=False makes the missing-value handling explicit; rows without an MHC
# annotation are dropped by the notnull() term, as before.
# NOTE(review): the pattern only matches DR/DP/DQ names; other class II
# notations (e.g. mouse I-A / I-E), if present in the file, would pass this
# filter - confirm against the data.
df = df[
    df[CN.mhc].notnull() &
    ~df[CN.mhc].str.contains('DR|DP|DQ', na=False)
]
logger.debug('Current df.shape: %s' % str(df.shape))

logger.debug('Drop duplicates with the same{epitope, CDR3b}')
df = df[~df.index.duplicated()]
logger.debug('Current df.shape: %s' % str(df.shape))

df = df.loc[:, CN.values()]

# No per-epitope minimum is applied here; the cutoff is applied once, after
# all positive sources have been merged.

display(df.head(), df.shape)
# Group by a scalar key, not a one-element list: with a list key, newer pandas
# versions yield 1-tuples, which would break len(epitope) and the printed text.
for i, (epitope, subdf) in enumerate(df.groupby(CN.epitope)):
    epitope_species = subdf[CN.epitope_species].iat[0]
    n_cdr3b = subdf.shape[0]

    print('%s: Epitope: %s(%s), speices: %s, n_cdr3b: %s' % (i + 1, epitope, len(epitope),
                                                             epitope_species, n_cdr3b))

df_mcpas = df

2021-05-23 00:52:48 [DEBUG]: Current df.shape: (39045, 29)
2021-05-23 00:52:48 [DEBUG]: Select valid beta CDR3 and epitope sequences
2021-05-23 00:52:48 [DEBUG]: Current df.shape: (15051, 29)
2021-05-23 00:52:48 [DEBUG]: Select MHC-I restricted entries
2021-05-23 00:52:48 [DEBUG]: Current df.shape: (13572, 37)
2021-05-23 00:52:48 [DEBUG]: Drop duplicates with the same{epitope, CDR3b}
2021-05-23 00:52:48 [DEBUG]: Current df.shape: (11101, 37)


Unnamed: 0,epitope,epitope_gene,epitope_species,species,cdr3b,mhc,source,label
IKAVYNFATCG_CASSDAGANTEVF,IKAVYNFATCG,,Lymphocytic choriomeningitis virus (LCMV),Mouse,CASSDAGANTEVF,H-2db,McPAS,1
IKAVYNFATCG_CASSDAGAYAEQF,IKAVYNFATCG,,Lymphocytic choriomeningitis virus (LCMV),Mouse,CASSDAGAYAEQF,H-2db,McPAS,1
IKAVYNFATCG_CASSDAGGAAEVF,IKAVYNFATCG,,Lymphocytic choriomeningitis virus (LCMV),Mouse,CASSDAGGAAEVF,H-2db,McPAS,1
IKAVYNFATCG_CASSDAGHSPLYF,IKAVYNFATCG,,Lymphocytic choriomeningitis virus (LCMV),Mouse,CASSDAGHSPLYF,H-2db,McPAS,1
IKAVYNFATCG_CASSDAWGGAEQYF,IKAVYNFATCG,,Lymphocytic choriomeningitis virus (LCMV),Mouse,CASSDAWGGAEQYF,H-2db,McPAS,1


(11101, 8)

1: Epitope: AARAVFLAL(9), speices: Melanoma, n_cdr3b: 2
2: Epitope: ACASQKRPSQR(11), speices: Experimental autoimmune encephalomyelitis (EAE), n_cdr3b: 21
3: Epitope: ALIHHNTHL(9), speices: Neoantigen, n_cdr3b: 4
4: Epitope: ALIHHNTYL(9), speices: Neoantigen, n_cdr3b: 2
5: Epitope: ALLETLSLLL(10), speices: Neoantigen, n_cdr3b: 1
6: Epitope: ALLQVTLLL(9), speices: Neoantigen, n_cdr3b: 1
7: Epitope: ALSPVIPHI(9), speices: Neoantigen, n_cdr3b: 16
8: Epitope: ALSPVIPLI(9), speices: Neoantigen, n_cdr3b: 8
9: Epitope: ALSYTPAEV(9), speices: Neoantigen, n_cdr3b: 3
10: Epitope: ALTPVVVTL(9), speices: Acute myeloid leukemia, n_cdr3b: 6
11: Epitope: ALVGAIPSI(9), speices: Neoantigen, n_cdr3b: 1
12: Epitope: ALWGPDPAA(9), speices: Neoantigen, n_cdr3b: 1
13: Epitope: ALYGFVPVL(9), speices: Neoantigen, n_cdr3b: 24
14: Epitope: ALYGSVPVL(9), speices: Diabetes type 2, n_cdr3b: 7
15: Epitope: AMAGSLVFL(9), speices: Neoantigen, n_cdr3b: 3
16: Epitope: AMAGSPVFL(9), speices: Neoantigen, n_cdr3b: 1
17: E

#### Merge positive datasets

- After removing duplicates with the same {epitope, CDR3b} and selecting all epitopes that have at least 30 TCRβ CDR3 sequences, the positive dataset contains 12,229 combinations of CDR3b sequences and epitopes, covering 63 unique epitopes. Table 1 summarizes the positive dataset.

In [7]:
import pandas as pd

# Minimum number of CDR3b sequences an epitope needs to stay in the dataset.
n_cdr3b_cutoff = 30
###################################################

logger.debug('Merge all datasets')
df = pd.concat([df_dash, df_vdjdb, df_mcpas])
logger.debug('Current df.shape: %s' % str(df.shape))

# The index is the per-row key built by get_index in the loading cells, so
# this removes pairs present in more than one source. The first occurrence
# wins, i.e. Dash takes precedence over VDJdb, which takes precedence over McPAS.
logger.debug('Drop duplicates')
df = df[~df.index.duplicated()]
logger.debug('Current df.shape: %s' % str(df.shape))

# The message is built from the cutoff so it cannot drift from the value used.
logger.debug('Select all epitope with at least %s TCRB sequences' % n_cdr3b_cutoff)
epitope_counts = df[CN.epitope].value_counts()
kept_epitopes = epitope_counts[epitope_counts >= n_cdr3b_cutoff].index
df = df[df[CN.epitope].isin(kept_epitopes)]
logger.debug('Final df.shape: %s' % str(df.shape))

display(df.head(), df.shape)

# Group by a scalar key, not a one-element list: with a list key, newer pandas
# versions yield 1-tuples, which would break len(epitope) and the printed text.
for i, (epitope, subdf) in enumerate(df.groupby(CN.epitope)):
    epitope_gene = subdf[CN.epitope_gene].unique()
    source = subdf[CN.source].unique()
    epitope_species = subdf[CN.epitope_species].iat[0]
    n_cdr3b = subdf.shape[0]
    print('%s: Epitope: %s(%s), speices: %s, gene: %s, n_cdr3b: %s, source: %s' % (i + 1, epitope, len(epitope),
                                                                       epitope_species, epitope_gene, n_cdr3b,
                                                                       source))

df_train_pos = df

2021-05-23 00:54:25 [DEBUG]: Merge all datasets
2021-05-23 00:54:25 [DEBUG]: Current df.shape: (16784, 8)
2021-05-23 00:54:25 [DEBUG]: Drop duplicates
2021-05-23 00:54:25 [DEBUG]: Current df.shape: (14031, 8)
2021-05-23 00:54:25 [DEBUG]: Select all epitope with at least 30 TCRB sequences
2021-05-23 00:54:25 [DEBUG]: Final df.shape: (12229, 8)


Unnamed: 0,epitope,epitope_gene,epitope_species,species,cdr3b,mhc,source,label
GLCTLVAML_CASSLWTGSHEQYF,GLCTLVAML,BMLF,EBV,human,CASSLWTGSHEQYF,HLA-A*02:01,Dash,1
GLCTLVAML_CASSQSPGGEQYF,GLCTLVAML,BMLF,EBV,human,CASSQSPGGEQYF,HLA-A*02:01,Dash,1
GLCTLVAML_CASSLTTEQQFF,GLCTLVAML,BMLF,EBV,human,CASSLTTEQQFF,HLA-A*02:01,Dash,1
GLCTLVAML_CSARDATGNGYTF,GLCTLVAML,BMLF,EBV,human,CSARDATGNGYTF,HLA-A*02:01,Dash,1
GLCTLVAML_CSARDSTGNGYTF,GLCTLVAML,BMLF,EBV,human,CSARDSTGNGYTF,HLA-A*02:01,Dash,1


(12229, 8)

1: Epitope: ALYGFVPVL(9), speices: HomoSapiens, gene: ['GANAB' None], n_cdr3b: 30, source: ['VDJdb' 'McPAS']
2: Epitope: ASNENMETM(9), speices: IAV, gene: ['NP' None], n_cdr3b: 285, source: ['Dash' 'VDJdb' 'McPAS']
3: Epitope: ATDALMTGY(9), speices: HCV, gene: ['NS3' None], n_cdr3b: 153, source: ['VDJdb' 'McPAS']
4: Epitope: CINGVCWTV(9), speices: HCV, gene: ['NS3' None], n_cdr3b: 43, source: ['VDJdb' 'McPAS']
5: Epitope: CRVLCCYVL(9), speices: CMV, gene: ['IE-1' None], n_cdr3b: 435, source: ['VDJdb' 'McPAS']
6: Epitope: CTPYDINQM(9), speices: SIV, gene: ['Gag'], n_cdr3b: 152, source: ['VDJdb']
7: Epitope: EAAGIGILTV(10), speices: HomoSapiens, gene: ['MLANA' None], n_cdr3b: 283, source: ['VDJdb' 'McPAS']
8: Epitope: EIYKRWII(8), speices: HIV-1, gene: ['Gag' None], n_cdr3b: 94, source: ['VDJdb' 'McPAS']
9: Epitope: ELAGIGILTV(10), speices: HomoSapiens, gene: ['MLANA' None], n_cdr3b: 288, source: ['VDJdb' 'McPAS']
10: Epitope: EPLPQGQLTAY(11), speices: EBV, gene: ['BZLF1' None], n_cdr3b:

In [8]:
# Expected size of the final training set: the negative set is built with one
# sampled negative per positive pair, so it is twice the positive count.
2 * df_train_pos.shape[0]

24458

### Negative dataset

- To increase the specificity of the predictive model, negative examples, in which the TCR and the epitope are not expected to interact, were added.
- The negative examples were made by combining the peptides from the positive dataset with randomly selected background TCR CDR3 beta sequences constructed by Dash et al. [{Dash:2017go}], which were collected from two healthy donors [{Howie:2015dc}].
- 예측 모델의 학습에서의 Overfitting을 피하기 위해, training 데이터셋은 epitope 별로 같은 수의 positive/negative data point를 포함하도록 하였다. 최종 Training 데이터셋은 24,458개의 TCR CDR3b-peptide combinations를 포함하고 있다(Table S1).

In [54]:
fn_cntr = '%s/TCRGP/human_tcr_control.csv' % datadir
# Fixed seed so the sampled negatives (and therefore train.csv) are
# reproducible across kernel restarts.
neg_sampling_seed = 42
################################
df_cntr = pd.read_csv(fn_cntr)
display(df_cntr.head())

# Background CDR3b sequences that never occur in the positive set.
# Membership is tested against a set rather than the ndarray, so each lookup
# is O(1) instead of a full array scan.
pos_cdr3b = df_train_pos[CN.cdr3b].unique()
pos_cdr3b_set = set(pos_cdr3b)
neg_cdr3b = [seq for seq in df_cntr['cdr3b'].unique() if seq not in pos_cdr3b_set]

print('len(pos_cdr3b): %s, len(neg_cdr3b): %s' % (len(pos_cdr3b), len(neg_cdr3b)))

rng = np.random.RandomState(neg_sampling_seed)

# For every epitope, clone its positive rows and swap in background CDR3b
# sequences drawn without replacement, giving one negative per positive.
# Frames are collected in a list and concatenated once: DataFrame.append is
# removed in pandas 2.0 and copies the whole frame on every call.
neg_frames = []
for epitope, subdf in df_train_pos.groupby(CN.epitope):
    subdf_neg = subdf.copy()
    subdf_neg[CN.source] = 'Control'
    subdf_neg[CN.label] = 0
    # Raises ValueError if an epitope has more positives than there are
    # background sequences (sampling is without replacement).
    subdf_neg[CN.cdr3b] = rng.choice(neg_cdr3b, subdf.shape[0], replace=False)
    # Rebuild the index because the CDR3b part of the key has changed.
    subdf_neg.index = subdf_neg.apply(get_index, axis=1)
    neg_frames.append(subdf_neg)

# pd.concat rejects an empty list, so fall back to an empty frame with the
# expected columns when there are no positive rows.
df = pd.concat(neg_frames) if neg_frames else pd.DataFrame(columns=CN.values())

display(df.head(), df.shape)
df_train_neg = df

Unnamed: 0,va_reps,vb_reps,cdr3a,cdr3b
0,TRAV9-2*01,TRBV7-9*01,CALSPGGTSYGKLTF,CASSPATGGLVDTQYF
1,TRAV9-2*01,TRBV20/OR9-2*01;TRBV20-1*01,CALKGYTGRRALTF,CSAREGLAGDYEQYF
2,TRAV9-2*01,TRBV5-1*01,CALSTGANSKLTF,CASRRALAGADEQYF
3,TRAV6*01,TRBV27*01,CALDMRSGAGSYQLTF,CASRPFQVSTDTQYF
4,TRAV29/DV5*01,TRBV7-6*01,CAAPFKGGSEKLVF,CASSYSPHNSPLHF


len(pos_cdr3b): 11730, len(neg_cdr3b): 8045


Unnamed: 0,epitope,epitope_gene,epitope_species,species,cdr3b,mhc,source,label
ALYGFVPVL_CASSPRGLAVYNEQFF,ALYGFVPVL,GANAB,HomoSapiens,HomoSapiens,CASSPRGLAVYNEQFF,HLA-A*02:01,Control,0
ALYGFVPVL_CASSLETPGQGLVAYEQYF,ALYGFVPVL,GANAB,HomoSapiens,HomoSapiens,CASSLETPGQGLVAYEQYF,HLA-A*02:01,Control,0
ALYGFVPVL_CASSLSGRANEQFF,ALYGFVPVL,GANAB,HomoSapiens,HomoSapiens,CASSLSGRANEQFF,HLA-A*02:01,Control,0
ALYGFVPVL_CASSLGTEGYTF,ALYGFVPVL,GANAB,HomoSapiens,HomoSapiens,CASSLGTEGYTF,HLA-A*02:01,Control,0
ALYGFVPVL_CASSYAGQPSSGNTIYF,ALYGFVPVL,GANAB,HomoSapiens,HomoSapiens,CASSYAGQPSSGNTIYF,HLA-A*02:01,Control,0


(12229, 8)

In [58]:
# Final training set: positives followed by the sampled negatives.
df_train = pd.concat([df_train_pos, df_train_neg])

display(df_train.head(), df_train.shape)
# Sanity check: counts repeated index keys across the combined frame.
print('Duplicated count: %s' % np.count_nonzero(df_train.index.duplicated()))

# Create the output directory if needed; to_csv does not create missing
# parent directories.
os.makedirs(outdir, exist_ok=True)
df_train.to_csv('%s/train.csv' % outdir)


Unnamed: 0,epitope,epitope_gene,epitope_species,species,cdr3b,mhc,source,label
GLCTLVAML_CASSLWTGSHEQYF,GLCTLVAML,BMLF,EBV,human,CASSLWTGSHEQYF,HLA-A*02:01,Dash,1
GLCTLVAML_CASSQSPGGEQYF,GLCTLVAML,BMLF,EBV,human,CASSQSPGGEQYF,HLA-A*02:01,Dash,1
GLCTLVAML_CASSLTTEQQFF,GLCTLVAML,BMLF,EBV,human,CASSLTTEQQFF,HLA-A*02:01,Dash,1
GLCTLVAML_CSARDATGNGYTF,GLCTLVAML,BMLF,EBV,human,CSARDATGNGYTF,HLA-A*02:01,Dash,1
GLCTLVAML_CSARDSTGNGYTF,GLCTLVAML,BMLF,EBV,human,CSARDSTGNGYTF,HLA-A*02:01,Dash,1


(24458, 8)

Duplicated count: 0
