# TCRBert: Improves Prediction of Epitope-specific TCR recognition using Pre-trained Protein Embeddings.

## Global configurations

In [46]:
import logging
import logging.config
import os
import warnings
from enum import auto
from tcrbert.commons import StrEnum
from tcrbert.dataset import CN, get_index

# Project root. Defaults to the original checkout location; set the
# TCRBERT_WORKDIR environment variable to run the notebook from another path
# without editing this cell.
workdir = os.environ.get('TCRBERT_WORKDIR', '/home/hym/trunk/TCRBert')
datadir = '%s/data' % workdir
outdir = '%s/output' % workdir

# All relative paths used in this notebook ('config/...', 'data/...') are
# resolved against workdir, so switch to it before touching any file.
os.chdir(workdir)

# Logger
# NOTE(review): this suppresses every warning for the whole kernel session,
# including pandas deprecation warnings -- consider narrowing the filter.
warnings.filterwarnings('ignore')
logging.config.fileConfig('config/logging.conf')
logger = logging.getLogger('tcrbert')

# Display the canonical column names used by every dataset built below.
CN.values()

['epitope',
 'epitope_gene',
 'epitope_species',
 'species',
 'cdr3b',
 'mhc',
 'source',
 'label']

## Training Datasets

### Positive dataset

#### Dash data
- Contains epitope-specific paired TCRα and TCRβ chains for three epitopes from humans and seven epitopes from mice [{Dash:2017go}]

In [43]:
import pandas as pd
from IPython.core.display import display
from collections import OrderedDict

# Epitope gene name (the value found in the Dash 'epitope' column)
#   -> (epitope species, epitope peptide sequence, MHC allele)
gene_info_map = OrderedDict({
    'BMLF': ('EBV', 'GLCTLVAML', 'HLA-A*02:01'),
    'pp65': ('CMV', 'NLVPMVATV', 'HLA-A*02:01'),
    'M1': ('IAV', 'GILGFVFTL', 'HLA-A*02:01'),
    'F2': ('IAV', 'LSLRNPILV', 'H2-Db'),
    'NP': ('IAV', 'ASNENMETM', 'H2-Db'),
    'PA': ('IAV', 'SSLENFRAYV', 'H2-Db'),
    'PB1': ('IAV', 'SSYRRPVGI', 'H2-Kb'),
    'm139': ('mCMV', 'TVYGFCLL', 'H2-Kb'),
    'M38': ('mCMV', 'SSPPMFRV', 'H2-Kb'),
    'M45': ('mCMV', 'HGIRNASFI', 'H2-Db'),
})

fn_dash = 'data/Dash/human_mouse_pairseqs_v1_parsed_seqs_probs_mq20_clones.tsv'
df = pd.read_table(fn_dash, sep='\t')
# .copy() so the column assignments below write to an independent frame
# rather than to a possible view of the raw table.
df = df.dropna(subset=['epitope', 'cdr3b']).copy()

# Fail early with a readable message instead of a bare KeyError raised from
# inside a lambda when the file contains a gene that is not in gene_info_map.
unknown_genes = sorted(set(df['epitope']) - set(gene_info_map))
if unknown_genes:
    raise KeyError('Epitope genes missing from gene_info_map: %s' % unknown_genes)

# The raw 'epitope' column holds the gene name; keep it as epitope_gene before
# the epitope column is overwritten with the peptide sequence.
df[CN.epitope_gene] = df['epitope']
for column, position in ((CN.epitope_species, 0), (CN.epitope, 1), (CN.mhc, 2)):
    df[column] = df[CN.epitope_gene].map(lambda gene: gene_info_map[gene][position])
df[CN.species] = df['subject'].map(lambda x: 'human' if 'human' in x else 'mouse')
df[CN.cdr3b] = df['cdr3b'].str.strip()
df[CN.source] = 'Dash'
df[CN.label] = 1
df = df.loc[:, CN.values()]
# get_index builds the row key; the displayed index looks like
# '<epitope>_<cdr3b>', so duplicated keys are repeated epitope/CDR3b pairs.
df.index = df.apply(get_index, axis=1)

# Keep the first occurrence of every key.
df = df[~df.index.duplicated()]
display(df.head())

# Per-gene summary. The epitope is read from gene_info_map rather than from
# the first row of the sub-frame, so a gene with no remaining rows is reported
# with n_cdr3b = 0 instead of raising IndexError.
for gene, gene_info in gene_info_map.items():
    subdf = df[df[CN.epitope_gene] == gene]
    epitope = gene_info[1]
    n_cdr3b = subdf.shape[0]
    print('epitope gene: %s, epitope: %s, n_cdr3b: %s' % (gene, epitope, n_cdr3b))

df_dash = df

Unnamed: 0,epitope,epitope_gene,epitope_species,species,cdr3b,mhc,source,label
GLCTLVAML_CASSLWTGSHEQYF,GLCTLVAML,BMLF,EBV,human,CASSLWTGSHEQYF,HLA-A*02:01,Dash,1
GLCTLVAML_CASSQSPGGEQYF,GLCTLVAML,BMLF,EBV,human,CASSQSPGGEQYF,HLA-A*02:01,Dash,1
GLCTLVAML_CASSLTTEQQFF,GLCTLVAML,BMLF,EBV,human,CASSLTTEQQFF,HLA-A*02:01,Dash,1
GLCTLVAML_CSARDATGNGYTF,GLCTLVAML,BMLF,EBV,human,CSARDATGNGYTF,HLA-A*02:01,Dash,1
GLCTLVAML_CSARDSTGNGYTF,GLCTLVAML,BMLF,EBV,human,CSARDSTGNGYTF,HLA-A*02:01,Dash,1


epitope gene: BMLF, epitope: GLCTLVAML, n_cdr3b: 54
epitope gene: pp65, epitope: NLVPMVATV, n_cdr3b: 54
epitope gene: M1, epitope: GILGFVFTL, n_cdr3b: 150
epitope gene: F2, epitope: LSLRNPILV, n_cdr3b: 102
epitope gene: NP, epitope: ASNENMETM, n_cdr3b: 149
epitope gene: PA, epitope: SSLENFRAYV, n_cdr3b: 230
epitope gene: PB1, epitope: SSYRRPVGI, n_cdr3b: 335
epitope gene: m139, epitope: TVYGFCLL, n_cdr3b: 73
epitope gene: M38, epitope: SSPPMFRV, n_cdr3b: 60
epitope gene: M45, epitope: HGIRNASFI, n_cdr3b: 201


#### VDJdb

- We extend the positive dataset with a new data set from [VDJdb](https://vdjdb.cdr3.net) [{Bagaev:2019hf}] (downloaded May 2021), which is a manually curated database that contains TCR sequences with known antigen specificity. Every entry in VDJdb has been given a confidence score between 0 and 3 (0: critical information missing, 1: medium confidence, 2: high confidence, 3: very high confidence). We selected all epitopes that have at least 30 TCRβ sequences with a confidence score of at least 1 and found 38 unique epitopes.

In [63]:
fn_vdjdb = 'data/VDJdb/VDJ.tsv'
# Minimum number of distinct CDR3b sequences an epitope needs to be kept.
n_cdr3b_cutoff = 30
################################

df = pd.read_table(fn_vdjdb, sep='\t', header=0)
logger.debug('Current df.shape: %s', df.shape)

# Select beta CDR3 sequence
logger.debug('Select beta CDR3 sequence')
df = df[df['Gene'] == 'TRB']
logger.debug('Current df.shape: %s', df.shape)

# Check valid CDR3 and peptide sequences
logger.debug('Select valid CDR3 and epitope sequences')
df = df.dropna(subset=['CDR3', 'Epitope'])
logger.debug('Current df.shape: %s', df.shape)

logger.debug('Select confidence score > 0')
# .copy() so the column assignments below write to an independent frame
# rather than to a filtered view of the raw table.
df = df[df['Score'] > 0].copy()
logger.debug('Current df.shape: %s', df.shape)

# Map the VDJdb columns onto the canonical CN column names.
df[CN.epitope] = df['Epitope'].str.strip()
df[CN.epitope_species] = df['Epitope species']
df[CN.epitope_gene] = df['Epitope gene']
df[CN.species] = df['Species']
df[CN.cdr3b] = df['CDR3'].str.strip()
df[CN.mhc] = df['MHC A']
df[CN.source] = 'VDJdb'
df[CN.label] = 1

# get_index builds the row key; the displayed index looks like
# '<epitope>_<cdr3b>'. Only the first row of each key is kept.
df.index = df.apply(get_index, axis=1)
logger.debug('Drop duplicates with the same{epitope, CDR3b}')
df = df[~df.index.duplicated()]
logger.debug('Current df.shape: %s', df.shape)

df = df.loc[:, CN.values()]

# The message is formatted from the cutoff so it cannot go stale if the
# threshold above is changed.
logger.debug('Select all epitope with at least %s TCRB sequences', n_cdr3b_cutoff)
epitope_counts = df[CN.epitope].value_counts()
frequent_epitopes = epitope_counts[epitope_counts >= n_cdr3b_cutoff].index
df = df[df[CN.epitope].isin(frequent_epitopes)]
logger.debug('Final df.shape: %s', df.shape)

display(df.head())

# Group by a scalar key (not a one-element list): with a list key, pandas >= 2
# yields 1-tuples such as ('ASNENMETM',) and the summary would print those.
for i, (epitope, subdf) in enumerate(df.groupby(CN.epitope)):
    # dropna() keeps ','.join from failing on missing (NaN) gene names.
    epitope_gene = ','.join(subdf[CN.epitope_gene].dropna().unique())
    epitope_species = subdf[CN.epitope_species].iat[0]
    n_cdr3b = subdf.shape[0]

    print('%s. Epitope: %s, speices: %s, gene: %s, n_cdr3b: %s' % (i + 1, epitope, epitope_species, epitope_gene, n_cdr3b))

df_vdjdb = df

2021-05-21 05:22:18 [DEBUG]: Current df.shape: (68039, 17)
2021-05-21 05:22:18 [DEBUG]: Select beta CDR3 sequence
2021-05-21 05:22:18 [DEBUG]: Current df.shape: (43745, 17)
2021-05-21 05:22:18 [DEBUG]: Select valid CDR3 and epitope sequences
2021-05-21 05:22:18 [DEBUG]: Current df.shape: (43745, 17)
2021-05-21 05:22:18 [DEBUG]: Select confidence score > 0
2021-05-21 05:22:18 [DEBUG]: Current df.shape: (5964, 17)
2021-05-21 05:22:18 [DEBUG]: Drop duplicates with the same{epitope, CDR3b}
2021-05-21 05:22:18 [DEBUG]: Current df.shape: (3881, 25)
2021-05-21 05:22:18 [DEBUG]: Select all epitope with at least 30 TCRB sequences
2021-05-21 05:22:18 [DEBUG]: Final df.shape: (2991, 8)


Unnamed: 0,epitope,epitope_gene,epitope_species,species,cdr3b,mhc,source,label
GLCTLVAML_CASSSGQLTNTEAFF,GLCTLVAML,BMLF1,EBV,HomoSapiens,CASSSGQLTNTEAFF,HLA-A*02:01,VDJdb,1
GLCTLVAML_CSARDRTGNGYTF,GLCTLVAML,BMLF1,EBV,HomoSapiens,CSARDRTGNGYTF,HLA-A*02:01,VDJdb,1
GLCTLVAML_CSARGDGQGDLLQETQYF,GLCTLVAML,BMLF1,EBV,HomoSapiens,CSARGDGQGDLLQETQYF,HLA-A*02:01,VDJdb,1
GLCTLVAML_CSVGTGGTNEKLFF,GLCTLVAML,BMLF1,EBV,HomoSapiens,CSVGTGGTNEKLFF,HLA-A*02:01,VDJdb,1
GLCTLVAML_CSVGSGGTNEKLFF,GLCTLVAML,BMLF1,EBV,HomoSapiens,CSVGSGGTNEKLFF,HLA-A*02:01,VDJdb,1


1. Epitope: ASNENMETM, speices: InfluenzaA, gene: NP, n_cdr3b: 57
2. Epitope: ATDALMTGY, speices: HCV, gene: NS3, n_cdr3b: 135
3. Epitope: CINGVCWTV, speices: HCV, gene: NS3, n_cdr3b: 39
4. Epitope: CTPYDINQM, speices: SIV, gene: Gag, n_cdr3b: 148
5. Epitope: EIYKRWII, speices: HIV-1, gene: Gag, n_cdr3b: 60
6. Epitope: FLKEKGGL, speices: HIV-1, gene: Nef, n_cdr3b: 78
7. Epitope: FPRPWLHGL, speices: HIV-1, gene: Vpr, n_cdr3b: 30
8. Epitope: FRDYVDRFYKTLRAEQASQE, speices: HIV-1, gene: Gag, n_cdr3b: 95
9. Epitope: GILGFVFTL, speices: InfluenzaA, gene: M, n_cdr3b: 146
10. Epitope: GLCTLVAML, speices: EBV, gene: BMLF1, n_cdr3b: 158
11. Epitope: GPGHKARVL, speices: HIV-1, gene: Gag, n_cdr3b: 53
12. Epitope: GTSGSPIINR, speices: DENV3/4, gene: NS3, n_cdr3b: 46
13. Epitope: GTSGSPIVNR, speices: DENV1, gene: NS3, n_cdr3b: 59
14. Epitope: HGIRNASFI, speices: MCMV, gene: M45, n_cdr3b: 71
15. Epitope: HSKKKCDEL, speices: HCV, gene: NS3, n_cdr3b: 30
16. Epitope: IPSINVHHY, speices: CMV, gene: pp65,

#### McPAS-TCR

- [McPAS-TCR](http://friedmanlab.weizmann.ac.il/McPAS-TCR/) is a manually curated catalog of pathology-associated TCR sequences [{Tickotsky:2017bo}].

In [68]:
fn_mcpas = 'data/McPAS/McPAS-TCR.csv'
# Minimum number of distinct CDR3b sequences an epitope needs to be kept.
n_cdr3b_cutoff = 30
################################
df = pd.read_csv(fn_mcpas)
logger.debug('Current df.shape: %s', df.shape)

# Select valid beta CDR3 sequence and epitope sequence
logger.debug('Select valid beta CDR3 and epitope sequences')
# .copy() so the column assignments below write to an independent frame.
df = df.dropna(subset=['CDR3.beta.aa', 'Epitope.peptide']).copy()
logger.debug('Current df.shape: %s', df.shape)

# Map the McPAS columns onto the canonical CN column names. Sequences are
# stripped (as in the VDJdb cell) so whitespace-padded entries do not count as
# distinct epitopes/CDR3s.
# NOTE(review): stripping can merge rows that were previously distinct --
# re-check the printed counts after re-running.
df[CN.epitope] = df['Epitope.peptide'].str.strip()
# No epitope gene is taken from this file; the column is filled with None.
df[CN.epitope_gene] = None
df[CN.epitope_species] = df['Pathology']
df[CN.species] = df['Species']
df[CN.cdr3b] = df['CDR3.beta.aa'].str.strip()
df[CN.mhc] = df['MHC']
df[CN.source] = 'McPAS'
df[CN.label] = 1

# get_index builds the row key; the displayed index looks like
# '<epitope>_<cdr3b>'. Only the first row of each key is kept.
df.index = df.apply(get_index, axis=1)

logger.debug('Drop duplicates with the same{epitope, CDR3b}')
df = df[~df.index.duplicated()]
logger.debug('Current df.shape: %s', df.shape)

df = df.loc[:, CN.values()]

# The message is formatted from the cutoff so it cannot go stale if the
# threshold above is changed.
logger.debug('Select all epitope with at least %s TCRB sequences', n_cdr3b_cutoff)
epitope_counts = df[CN.epitope].value_counts()
frequent_epitopes = epitope_counts[epitope_counts >= n_cdr3b_cutoff].index
df = df[df[CN.epitope].isin(frequent_epitopes)]
logger.debug('Final df.shape: %s', df.shape)

display(df.head())
# Group by a scalar key (not a one-element list): with a list key, pandas >= 2
# yields 1-tuples such as ('ASNENMETM',) and the summary would print those.
for i, (epitope, subdf) in enumerate(df.groupby(CN.epitope)):
    epitope_species = subdf[CN.epitope_species].iat[0]
    n_cdr3b = subdf.shape[0]

    print('%s. Epitope: %s, speices: %s, n_cdr3b: %s' % (i + 1, epitope, epitope_species, n_cdr3b))

df_mcpas = df

2021-05-21 17:55:12 [DEBUG]: Current df.shape: (21689, 29)
2021-05-21 17:55:12 [DEBUG]: Select valid beta CDR3 and epitope sequences
2021-05-21 17:55:12 [DEBUG]: Current df.shape: (14583, 29)
2021-05-21 17:55:12 [DEBUG]: Drop duplicates with the same{epitope, CDR3b}
2021-05-21 17:55:12 [DEBUG]: Current df.shape: (12087, 37)
2021-05-21 17:55:12 [DEBUG]: Select all epitope with at least 30 TCRB sequences
2021-05-21 17:55:12 [DEBUG]: Final df.shape: (10555, 8)


Unnamed: 0,epitope,epitope_gene,epitope_species,species,cdr3b,mhc,source,label
EAAGIGILTV_CASSLGNEQF,EAAGIGILTV,,Melanoma,Human,CASSLGNEQF,HLA-A*02,McPAS,1
EAAGIGILTV_CASSLGVATGELF,EAAGIGILTV,,Melanoma,Human,CASSLGVATGELF,HLA-A*02,McPAS,1
EAAGIGILTV_CASSQEEGGGSWGNTIYF,EAAGIGILTV,,Melanoma,Human,CASSQEEGGGSWGNTIYF,HLA-A*02,McPAS,1
EAAGIGILTV_CASSQEGLAGASQYF,EAAGIGILTV,,Melanoma,Human,CASSQEGLAGASQYF,HLA-A*02,McPAS,1
EAAGIGILTV_CASSQETDIVFNOPQHF,EAAGIGILTV,,Melanoma,Human,CASSQETDIVFNOPQHF,HLA-A*02,McPAS,1


1. Epitope: ASNENMETM, speices: Influenza, n_cdr3b: 265
2. Epitope: ATDALMTGY, speices: Hepatitis C virus, n_cdr3b: 52
3. Epitope: CRVLCCYVL, speices: Cytomegalovirus (CMV), n_cdr3b: 435
4. Epitope: EAAGIGILTV, speices: Melanoma, n_cdr3b: 273
5. Epitope: EIYKRWII, speices: Human immunodeficiency virus (HIV), n_cdr3b: 36
6. Epitope: ELAGIGILTV, speices: Melanoma, n_cdr3b: 169
7. Epitope: FPRPWLHGL, speices: Human immunodeficiency virus (HIV), n_cdr3b: 88
8. Epitope: FRCPRRFCF, speices: Cytomegalovirus (CMV), n_cdr3b: 266
9. Epitope: GILGFVFTL, speices: Influenza, n_cdr3b: 1148
10. Epitope: GLCTLVAML, speices: Epstein Barr virus (EBV), n_cdr3b: 828
11. Epitope: HGIRNASFI, speices: mCMV, n_cdr3b: 195
12. Epitope: HPKVSSEVHI, speices: Human immunodeficiency virus (HIV), n_cdr3b: 54
13. Epitope: IIKDYGKQM, speices: Human immunodeficiency virus (HIV), n_cdr3b: 36
14. Epitope: KAFSPEVIPMF, speices: Human immunodeficiency virus (HIV), n_cdr3b: 123
15. Epitope: KMVAVFYTT, speices: Neoantigen, n