# TCRBert: Improving Prediction of Epitope-specific TCR Recognition Using Pre-trained Protein Embeddings

## Global configurations

In [31]:
import logging
import logging.config
import os
import warnings
from enum import auto
from tcrbert.commons import StrEnum

# Project root. Defaults to the original checkout location; set the
# TCRBERT_WORKDIR environment variable to run the notebook from another
# machine or directory without editing this cell.
workdir = os.environ.get('TCRBERT_WORKDIR', '/home/hym/trunk/TCRBert')
datadir = '%s/data' % workdir
outdir = '%s/output' % workdir

# Relative paths used in this notebook (e.g. 'config/logging.conf',
# 'data/Dash/...') are resolved against workdir from here on.
os.chdir(workdir)

# Logger
# NOTE: this silences every warning category for the rest of the kernel session.
warnings.filterwarnings('ignore')
logging.config.fileConfig('config/logging.conf')
logger = logging.getLogger('tcrbert')

class ColumnName(StrEnum):


## Training Datasets

### Positive dataset

#### Dash data
- Contains epitope-specific paired TCRα and TCRβ chains for three epitopes from humans and seven epitopes from mice [{Dash:2017go}]

In [40]:
import pandas as pd
from IPython.display import display
from collections import OrderedDict

# Epitope gene label (as it appears in the Dash table's 'epitope' column) ->
# (epitope source species, epitope peptide sequence, MHC allele).
gene_info_map = OrderedDict({
    'BMLF': ('EBV', 'GLCTLVAML', 'HLA-A*02:01'),
    'pp65': ('CMV', 'NLVPMVATV', 'HLA-A*02:01'),
    'M1': ('IAV', 'GILGFVFTL', 'HLA-A*02:01'),
    'F2': ('IAV', 'LSLRNPILV', 'H2-Db'),
    'NP': ('IAV', 'ASNENMETM', 'H2-Db'),
    'PA': ('IAV', 'SSLENFRAYV', 'H2-Db'),
    'PB1': ('IAV', 'SSYRRPVGI', 'H2-Kb'),
    'm139': ('mCMV', 'TVYGFCLL', 'H2-Kb'),
    'M38': ('mCMV', 'SSPPMFRV', 'H2-Kb'),
    'M45': ('mCMV', 'HGIRNASFI', 'H2-Db'),
})

df = pd.read_table('data/Dash/human_mouse_pairseqs_v1_parsed_seqs_probs_mq20_clones.tsv', sep='\t')
# Rows without an epitope label or a CDR3beta sequence cannot be used.
df = df.dropna(subset=['epitope', 'cdr3b'])

# Fail early and name the offending labels, instead of surfacing a bare
# KeyError from inside one of the .map() calls below.
unknown_genes = sorted(set(df['epitope']) - set(gene_info_map), key=str)
if unknown_genes:
    raise KeyError('Epitope genes missing from gene_info_map: %s' % unknown_genes)

# Order matters: the raw 'epitope' column holds the gene label and must be
# copied before the epitope column is filled with the peptide sequence
# (NOTE(review): CN.epitope appears to name that same 'epitope' column,
# judging by the displayed output -- confirm against ColumnName).
df[CN.epitope_gene] = df['epitope']
df[CN.epitope_species] = df[CN.epitope_gene].map(lambda x: gene_info_map[x][0])
df[CN.epitope] = df[CN.epitope_gene].map(lambda x: gene_info_map[x][1])
df[CN.mhc] = df[CN.epitope_gene].map(lambda x: gene_info_map[x][2])
df[CN.species] = df['subject'].map(lambda x: 'human' if 'human' in x else 'mouse')
df[CN.cdr3b] = df['cdr3b'].str.strip()
df[CN.source] = 'Dash'
df[CN.label] = 1
# Keep only the columns listed by CN.values().
df = df.loc[:, CN.values()]
df.index = df.apply(get_index, axis=1)

# Keep the first row for each index value; later duplicates are dropped.
df = df[~df.index.duplicated()]
display(df.head())

# Per-gene summary. The epitope is read from gene_info_map (the same value the
# epitope column was filled from) so a gene with no remaining rows reports
# n_cdr3b: 0 instead of raising IndexError on an empty subset.
for gene in gene_info_map.keys():
    subtab = df[df[CN.epitope_gene] == gene]
    epitope = gene_info_map[gene][1]
    n_cdr3b = subtab.shape[0]
    print('epitope gene: %s, epitope: %s, n_cdr3b: %s' % (gene, epitope, n_cdr3b))

df_dash = df

Unnamed: 0,epitope,epitope_gene,epitope_species,species,cdr3b,mhc,source,label
GLCTLVAML_CASSLWTGSHEQYF,GLCTLVAML,BMLF,EBV,human,CASSLWTGSHEQYF,HLA-A*02:01,Dash,1
GLCTLVAML_CASSQSPGGEQYF,GLCTLVAML,BMLF,EBV,human,CASSQSPGGEQYF,HLA-A*02:01,Dash,1
GLCTLVAML_CASSLTTEQQFF,GLCTLVAML,BMLF,EBV,human,CASSLTTEQQFF,HLA-A*02:01,Dash,1
GLCTLVAML_CSARDATGNGYTF,GLCTLVAML,BMLF,EBV,human,CSARDATGNGYTF,HLA-A*02:01,Dash,1
GLCTLVAML_CSARDSTGNGYTF,GLCTLVAML,BMLF,EBV,human,CSARDSTGNGYTF,HLA-A*02:01,Dash,1


epitope gene: BMLF, epitope: GLCTLVAML, n_cdr3b: 54
epitope gene: pp65, epitope: NLVPMVATV, n_cdr3b: 54
epitope gene: M1, epitope: GILGFVFTL, n_cdr3b: 150
epitope gene: F2, epitope: LSLRNPILV, n_cdr3b: 102
epitope gene: NP, epitope: ASNENMETM, n_cdr3b: 149
epitope gene: PA, epitope: SSLENFRAYV, n_cdr3b: 230
epitope gene: PB1, epitope: SSYRRPVGI, n_cdr3b: 335
epitope gene: m139, epitope: TVYGFCLL, n_cdr3b: 73
epitope gene: M38, epitope: SSPPMFRV, n_cdr3b: 60
epitope gene: M45, epitope: HGIRNASFI, n_cdr3b: 201


#### VDJdb

- We extend the positive dataset with a new data set from [VDJdb](https://vdjdb.cdr3.net) [{Bagaev:2019hf}] (downloaded May 2021), which is a manually curated database that contains TCR sequences with known antigen specificity. Every entry in VDJdb has been given a confidence score between 0 and 3 (0: critical information missing, 1: medium confidence, 2: high confidence, 3: very high confidence). We constructed our data set by selecting all epitopes that have at least 30 TCRβ sequences with a confidence score of at least 1, and found 22 such epitopes.