In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
import sys  
sys.path.insert(0, '../../scripts')

from D_plot_specificity_matrix_utils import calc_binding_concordance

In [3]:
def notnan(x):
    """Return the result of comparing ``x`` with itself.

    NaN is the one float value that is not equal to itself, so for a scalar
    the result is False for NaN and True otherwise.
    """
    is_self_equal = x == x
    return is_self_equal

In [4]:
def get_multiplets(df):
    """Flag rows whose (ct, peptide_HLA) group holds more than one gem.

    Parameters
    ----------
    df : DataFrame with the columns ``ct``, ``peptide_HLA`` and ``gem``.

    Returns
    -------
    An Index (the result of ``Index.map``) with one entry per row of ``df``,
    in row order: True where the row's (ct, peptide_HLA) group has more than
    one non-null ``gem``. Keys missing from the grouping become False.
    """
    keys = ['ct', 'peptide_HLA']
    # count() only counts non-null gem values within each group.
    has_many = df.groupby(keys)['gem'].count() > 1
    row_keys = df.set_index(keys).index
    flags = row_keys.map(has_many)
    return flags.fillna(False)

In [5]:
# Compiled once at definition time instead of on every call.
_AA_SEQUENCE = re.compile('[ARNDCEQGHILKMFPSTWYV]+')


def check_aa(seq):
    """Return True if ``seq`` is a non-empty string of the 20 standard amino-acid letters.

    Parameters
    ----------
    seq : the value to test. Anything that is not a ``str`` (for example a
        float NaN coming from a missing DataFrame cell) yields False instead
        of raising.

    Returns
    -------
    bool
    """
    if not isinstance(seq, str):
        return False
    # fullmatch rejects a trailing newline, which '^...$' with search() accepts.
    return _AA_SEQUENCE.fullmatch(seq) is not None

# Input

In [15]:
# Reference CSV files; they are read without a header row and concatenated
# into `tenx` in the Load section.
EXP10x = "../tmp_files/model_data/exp10_x.csv"
EXP131 = "../tmp_files/model_data/exp13_1.csv"
EXP132 = "../tmp_files/model_data/exp13_2.csv"
EXP091 = "../tmp_files/model_data/exp09.csv"

In [77]:
# Database table that the filters below are applied to.
DATA = "../tmp_files/formatted/vdjdb.csv"

# Output

In [78]:
# Destination of the filtered test set (written without header or index).
OUT = "../tmp_files/model_data/vdj_test.csv"

# Load

In [79]:
# The reference files carry no header row, so column names are supplied here.
tenx_columns = ['peptide','mhc','cdr3_a','cdr3_b','label']
dfs = [
    pd.read_csv(path, header=None, names=tenx_columns)
    for path in [EXP10x, EXP131, EXP132, EXP091]
]

# Stack all experiments and keep only the first occurrence of identical rows.
tenx = pd.concat(dfs, ignore_index=True).drop_duplicates()

In [80]:
#tenx = pd.read_csv(EXP10x, header=None, names=['peptide','mhc','cdr3_a','cdr3_b','label'])

In [81]:
# Lookup peptide -> mhc taken from the reference data; when a peptide occurs
# more than once, the last row's mhc is the one kept in the dict.
pep2mhc = tenx.set_index('peptide')['mhc'].to_dict()

In [82]:
# Load the database table; low_memory=False makes pandas read the file in one
# pass instead of in chunks, which avoids mixed-dtype columns.
df = pd.read_csv(DATA, low_memory=False)

# Filters

In [83]:
# Keep only peptides that also occur in the reference data.
idx1 = df['peptide'].isin(tenx['peptide'])

In [84]:
# Drop rows whose assay mentions 'single-cell'; rows with a missing assay are kept.
# na=False makes str.contains return a plain boolean mask, so the `~` below
# does not depend on fillna() downcasting an object-dtype result to bool.
idx2 = ~df.assay.str.contains('single-cell', na=False) # VDJdb specific

In [85]:
# Sequences must consist solely of standard amino-acid letters.
# The CDR3 values are passed through str() first, so a missing value becomes
# the text 'nan' and is rejected by check_aa.
idx3 = df['peptide'].apply(check_aa)
idx4 = df['cdr3_a'].apply(lambda seq: check_aa(str(seq)))
idx5 = df['cdr3_b'].apply(lambda seq: check_aa(str(seq)))

In [86]:
# CDR3 length must lie in [8, 19] (both ends inclusive); missing values fail.
idx6 = df['cdr3_a'].str.len().between(8, 19)
idx7 = df['cdr3_b'].str.len().between(8, 19)

In [88]:
# Species filter. The spelling depends on the database:
#   IEDB  -> 'Homo sapiens'
#   VDJdb -> 'HomoSapiens' (used here)
idx8 = df['cell_species'].eq('HomoSapiens') # VDJdb

In [89]:
# Reject a row only when BOTH of its chains appear in the reference data
# (each chain is looked up independently, not as a pair).
idx9 = ~df['cdr3_a'].isin(tenx['cdr3_a']) | ~df['cdr3_b'].isin(tenx['cdr3_b'])

In [90]:
# Same exclusion as idx9, but compared against the reference sequences with
# their first and last residue stripped.
tenx_core_a = tenx['cdr3_a'].str[1:-1]
tenx_core_b = tenx['cdr3_b'].str[1:-1]
idx10 = ~df['cdr3_a'].isin(tenx_core_a) | ~df['cdr3_b'].isin(tenx_core_b)

# Main

In [91]:
# Combine all row filters. For IEDB input, leave idx2 out of the conjunction
# (it is VDJdb specific).
keep = idx1 & idx2 & idx3 & idx4 & idx5 & idx6 & idx7 & idx8 & idx9 & idx10
sub_df = df.loc[keep].copy() # VDJdb

In [92]:
def _add_missing_anchors(seq):
    """Wrap ``seq`` as ``C{seq}F`` when it lacks both terminal anchor residues.

    A sequence is left untouched if it already starts with 'C' or already
    ends with 'F' or 'W'. The previous condition
    ``(x[-1] != 'F') | (x[-1] != 'W')`` was always true, so any sequence not
    starting with 'C' received an extra trailing 'F' even when it already
    ended in F/W.
    """
    if seq[0] != 'C' and seq[-1] not in ('F', 'W'):
        return f'C{seq}F'
    return seq


sub_df.cdr3_a = sub_df.cdr3_a.apply(_add_missing_anchors)
sub_df.cdr3_b = sub_df.cdr3_b.apply(_add_missing_anchors)

In [93]:
# Remove rows lacking either chain, then keep one row per
# (peptide, cdr3_a, cdr3_b) combination (the first occurrence).
sub_df = (
    sub_df
    .dropna(subset=['cdr3_a','cdr3_b'])
    .drop_duplicates(subset=['peptide','cdr3_a','cdr3_b'])
)

In [94]:
# Show the filtered, de-duplicated frame for inspection.
sub_df

Unnamed: 0,v_a,j_a,cdr1_a,cdr2_a,cdr3_a,v_b,d_b,j_b,cdr1_b,cdr2_b,...,antigen_source,peptide_expression,mhc,mhc_expression,ref_id,pdb_id,assay,database,db_id_tcr,db_id_pep
667,TRAV38-1*01,TRAJ48*01,,,CAYTVLGNEKLTF,TRBV28*01,,TRBJ2-1*01,,,...,HomoSapiens,,HLA-A*02,,PMID:12555663,,"cultured-T-cells,beads,tetramer-sort",VDJdb,,
668,TRAV12-2*01,TRAJ42*01,,,CAVAGYGGSQGNLIF,TRBV28*01,,TRBJ1-1*01,,,...,HomoSapiens,,HLA-A*02,,PMID:12555663,,"cultured-T-cells,beads,tetramer-sort",VDJdb,,
669,TRAV12-2*01,TRAJ48*01,,,CAVSFGNEKLTF,TRBV28*01,,TRBJ1-5*01,,,...,HomoSapiens,,HLA-A*02,,PMID:12555663,,"cultured-T-cells,beads,tetramer-sort",VDJdb,,
670,TRAV12-2*01,TRAJ42*01,,,CAVTHYGGSQGNLIF,TRBV28*01,,TRBJ2-3*01,,,...,HomoSapiens,,HLA-A*02,,PMID:12555663,,"cultured-T-cells,beads,tetramer-sort",VDJdb,,
671,TRAV12-2*01,TRAJ45*01,,,CAGGGGGADGLTF,TRBV28*01,,TRBJ1-5*01,,,...,HomoSapiens,,HLA-A*02,,PMID:12555663,,"cultured-T-cells,beads,tetramer-sort",VDJdb,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51098,TRAV12-3*01,TRAJ27*01,,,CAMTGTTNAGKSTF,TRBV12-3*01,,TRBJ1-2*01,,,...,CMV,,HLA-A*02:01,,https://github.com/antigenomics/vdjdb-db/issue...,,pelimer-sort,VDJdb,,
51099,TRAV3*01,TRAJ31*01,,,CAVRDTNARLMF,TRBV12-3*01,,TRBJ1-1*01,,,...,CMV,,HLA-A*02:01,,https://github.com/antigenomics/vdjdb-db/issue...,,pelimer-sort,VDJdb,,
51101,TRAV26-2*01,TRAJ43*01,,,CIRDNNNDMRF,TRBV7-6*01,TRBD1,TRBJ1-4*01,,,...,CMV,,HLA-A*02:01,,https://github.com/antigenomics/vdjdb-db/issue...,,pelimer-sort,VDJdb,,
51102,TRAV26-2*01,TRAJ43*01,,,CIIDNNNDMRF,TRBV7-6*01,TRBD1,TRBJ1-4*01,,,...,CMV,,HLA-A*02:01,,https://github.com/antigenomics/vdjdb-db/issue...,,pelimer-sort,VDJdb,,


In [95]:
# Overwrite the database's mhc value with the one the reference data uses for
# the same peptide.
sub_df['mhc'] = sub_df['peptide'].map(pep2mhc)

In [96]:
# Every retained row gets label 1.
sub_df['label'] = 1

In [97]:
# Number of retained rows per peptide.
sub_df.peptide.value_counts()

GILGFVFTL     160
ELAGIGILTV     70
NLVPMVATV      57
GLCTLVAML      41
YVLDHLIVV      11
AVFDRKSDAK      7
IVTDFSVIK       7
TPRVTGGGAM      6
SLLMWITQV       5
LLFGYPVYV       3
FLRGRAYGL       3
IPSINVHHY       3
RMFPNAPYL       2
IMDQVPFSV       1
RPPIFIRRL       1
Name: peptide, dtype: int64

In [40]:
# NOTE(review): this cell has execution count 40 while its neighbours are in
# the 90s, and it repeats the value_counts call of the preceding cell. Its
# output looks like a leftover from an earlier run — confirm and re-run or remove.
sub_df.peptide.value_counts()

GILGFVFTL     481
NLVPMVATV     234
GLCTLVAML     195
YVLDHLIVV     126
ELAGIGILTV     82
RPPIFIRRL      25
FLRGRAYGL      21
LLFGYPVYV      16
IPSINVHHY      12
IMDQVPFSV       9
IVTDFSVIK       8
TPRVTGGGAM      6
QYDPVAALF       4
CLGGLLTMV       3
RMFPNAPYL       2
ELRRKMMYM       2
AVFDRKSDAK      2
SLLMWITQV       1
RAKFKQLL        1
VTEHDTLLY       1
Name: peptide, dtype: int64

In [98]:
# Preview the five columns that make up the exported table.
sub_df[['peptide','mhc','cdr3_a','cdr3_b', 'label']]

Unnamed: 0,peptide,mhc,cdr3_a,cdr3_b,label
667,ELAGIGILTV,A0201,CAYTVLGNEKLTF,CASSFTPYNEQFF,1
668,ELAGIGILTV,A0201,CAVAGYGGSQGNLIF,CASSPQGLGTEAFF,1
669,ELAGIGILTV,A0201,CAVSFGNEKLTF,CAEGQGFVGQPQHF,1
670,ELAGIGILTV,A0201,CAVTHYGGSQGNLIF,CASLRSAVWADTQYF,1
671,ELAGIGILTV,A0201,CAGGGGGADGLTF,CASTLTGLGQPQHF,1
...,...,...,...,...,...
51098,NLVPMVATV,A0201,CAMTGTTNAGKSTF,CASTYGSYGYTF,1
51099,NLVPMVATV,A0201,CAVRDTNARLMF,CASSVVTEAFF,1
51101,NLVPMVATV,A0201,CIRDNNNDMRF,CASSLAPGTTNEKLFF,1
51102,NLVPMVATV,A0201,CIIDNNNDMRF,CASSLAPGATNEKLFF,1


In [99]:
# Write the selected columns to OUT as a bare CSV (no header row, no index).
out_columns = ['peptide','mhc','cdr3_a','cdr3_b','label']
sub_df.to_csv(OUT, columns=out_columns, index=False, header=False)