In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from ast import literal_eval
from matplotlib_venn import venn3
from matplotlib_venn import venn2
import itertools

# Args

In [2]:
def cdr3_lst_converter(x):
    #define format of datetime
    return x.replace("[","").replace("]","").replace("'","").split(" ")

def peptide_hla_converter(x):
    return re.findall("\w+\s{1}\w{1}\d+", x.replace("[","").replace("]","").replace("'",""))

def literal_converter(val):
    try:
        return literal_eval(val)
    except SyntaxError:
        return np.nan
    except ValueError:
        return np.nan

converters={'peptide_HLA_lst': peptide_hla_converter, 'cdr3_lst_TRA': cdr3_lst_converter, 'cdr3_lst_TRB': cdr3_lst_converter, 'umi_count_lst_mhc': literal_eval, 'umi_count_lst_TRA': literal_converter, 'umi_count_lst_TRB': literal_converter} #

In [3]:
def calc_binding_concordance(df, clonotype_fmt):
    #assert df.size > 0, "df empty"
    gems_per_specificity = df.groupby([clonotype_fmt,'peptide']).gem.count().to_dict()
    df['gems_per_specificity'] = df.set_index([clonotype_fmt,'peptide']).index.map(gems_per_specificity)

    gems_per_clonotype = df.groupby([clonotype_fmt]).gem.count().to_dict()
    df['gems_per_clonotype'] = df[clonotype_fmt].map(gems_per_clonotype)
    
    df['binding_concordance'] = df.gems_per_specificity / df.gems_per_clonotype

In [4]:
def annotate_lst(df, var): # template_id, epitope, peptide, peptide_HLA
    dct = df.groupby('gem')[var].unique().to_dict()
    return df.gem.map(dct)

In [5]:
def annotate_delta_umi(df):
    def calc_delta(x):
        if len(x) == 1:
            return 100
        else:
            return int((x[-1]-x[-2])/float(x[-1])*100)
    return df.umi_count_lst_TRA.apply(calc_delta), df.umi_count_lst_TRB.apply(calc_delta)

def compute_stats(variable):
    #assert df[~df.imputed_TRA.isna()].index.equals(df[~df.imputed_TRB.isna()].index)
    
    imputations = df[~df[variable].isna()]
    return len(imputations.gem.unique()), len(imputations.ct.unique()), len(imputations.cdr3_TRA.unique()), len(imputations.cdr3_TRB.unique()), len(imputations.peptide.unique())

In [6]:
def compute_stats(variable):
    return sum(query_filter), len(hit), "%.2f" % round(len(hit)/sum(query_filter)*100, 2), sum(df.loc[hit.index, variable] != hit.apply(pd.Series)[0])

In [7]:
def annotate_cdr3_TRA(row, threshold):
    query = reference[(reference.peptide_HLA == row.peptide_HLA) & (reference['cdr3_TRB'] == row['cdr3_TRB'])].copy()
    # Find out if any of the annotated CDR3 seqs are in the reference
    
    #query['hit'] = query['cdr3_TRA'].isin(row['cdr3_lst_TRA'])
    
    if row.delta_umi_TRA < threshold:
        query['hit'] = query['cdr3_TRA'].isin(row['cdr3_lst_TRA'])
    else:
        return None
    
    hit = query[query.hit].copy()

    if len(hit) == 1:
        element = hit['cdr3_TRA'].values[0]
        index = row['cdr3_lst_TRA'].index(element)
        return element, row['umi_count_lst_TRA'][index]
    elif len(hit) > 1:
        return row['cdr3_TRA'], row['umi_count_TRA']
    #else:
    #    # Investigate most similar CDR3
    #    sim_query = sim_tra.loc[query['cdr3_TRA'], row['cdr3_lst_TRA']]
    #    
    #    cdr3 = sim_query.max().idxmax()
    #    value = sim_query.max().max()
    #    
    #    if value >= threshold:
    #        return cdr3
    #    else:
    #        return None
            
    return None

In [8]:
def annotate_cdr3_TRB(row, threshold):
    query = reference[(reference.peptide_HLA == row.peptide_HLA) & (reference['cdr3_TRA'] == row['cdr3_TRA'])].copy()
    
    #query['hit'] = query['cdr3_TRB'].isin(row['cdr3_lst_TRB'])
    
    if row.delta_umi_TRB < threshold:
        query['hit'] = query['cdr3_TRB'].isin(row['cdr3_lst_TRB'])
    else:
        return None
    
    hit = query[query.hit].copy()

    if len(hit) == 1:
        element = hit['cdr3_TRB'].values[0]
        index = row['cdr3_lst_TRB'].index(element)
        return element, row['umi_count_lst_TRB'][index]
    elif len(hit) > 1:
        return row['cdr3_TRB'], row['umi_count_TRB']
    #else:
    #    # Investigate most similar CDR3
    #    sim_query = sim_trb.loc[query['cdr3_TRB'], row['cdr3_lst_TRB']]
    #    
    #    cdr3 = sim_query.max().idxmax()
    #    value = sim_query.max().max()
    #    
    #    if value >= threshold:
    #        return cdr3
    #    else:
    #        return None
        
    # Or since I only want perfect matches look in VDJdb?
    
    return None

In [9]:
def annotate_peptide(row, threshold):
    query = reference[(reference.cdr3_TRA == row.cdr3_TRA) & (reference.cdr3_TRB == row.cdr3_TRB)].copy()
    
    if row.delta_umi_mhc < threshold:
        query['hit'] = query['peptide_HLA'].isin(row['peptide_HLA_lst'])
    else:
        return None
    
    hit = query[query.hit].copy()

    if len(hit) == 1:
        element = hit['peptide_HLA'].values[0]
        index = row['peptide_HLA_lst'].index(element)
        return element, row['umi_count_lst_mhc'][index]
    elif len(hit) > 1:
        #print(row['peptide_HLA'])
        #print(hit['peptide_HLA'].values)
        return row['peptide_HLA'], row['umi_count_mhc']
    else:
        return None
    
    # Or since I only want perfect matches look in VDJdb?

In [10]:
def annotate_TRA_PEP(row, threshold):
    query = reference[(reference['cdr3_TRB'] == row['cdr3_TRB'])].copy()
    # Find out if any of the annotated CDR3 seqs are in the reference
    
    if row.delta_umi_TRA < threshold:
        query['hit_TRA'] = query['cdr3_TRA'].isin(row['cdr3_lst_TRA'])
    else:
        query['hit_TRA'] = query['cdr3_TRA'].isin([row['cdr3_TRA']])
    if row.delta_umi_mhc < threshold:
        query['hit_PEP'] = query['peptide_HLA'].isin(row['peptide_HLA_lst'])
    else:
        query['hit_PEP'] = query['peptide_HLA'].isin([row['peptide_HLA']])
    
    hit = query[query.hit_TRA & query.hit_PEP].copy()

    if len(hit) == 1:
        cdr3, pep = hit[['cdr3_TRA', 'peptide_HLA']].values[0]
        ic, ip = row['cdr3_lst_TRA'].index(cdr3), row['peptide_HLA_lst'].index(pep)
        return cdr3, row['umi_count_lst_TRA'][ic], pep, row['umi_count_lst_mhc'][ip]
    elif len(hit) > 1:
        return row[['cdr3_TRA', 'umi_count_TRA', 'peptide_HLA', 'umi_count_mhc']]
    else:
        return None

In [11]:
def annotate_TRB_PEP(row, threshold):
    query = reference[(reference['cdr3_TRA'] == row['cdr3_TRA'])].copy()
    # Find out if any of the annotated CDR3 seqs are in the reference
    
    if row.delta_umi_TRB < threshold:
        query['hit_TRB'] = query['cdr3_TRB'].isin(row['cdr3_lst_TRB'])
    else:
        query['hit_TRB'] = query['cdr3_TRB'].isin([row['cdr3_TRB']])
    if row.delta_umi_mhc < threshold:
        query['hit_PEP'] = query['peptide_HLA'].isin(row['peptide_HLA_lst'])
    else:
        query['hit_PEP'] = query['peptide_HLA'].isin([row['peptide_HLA']])
    
    hit = query[query.hit_TRB & query.hit_PEP].copy()

    if len(hit) == 1:
        cdr3, pep = hit[['cdr3_TRB', 'peptide_HLA']].values[0]
        ic, ip = row['cdr3_lst_TRB'].index(cdr3), row['peptide_HLA_lst'].index(pep)
        return cdr3, row['umi_count_lst_TRB'][ic], pep, row['umi_count_lst_mhc'][ip]
    elif len(hit) > 1:
        return row[['cdr3_TRB', 'umi_count_TRB', 'peptide_HLA', 'umi_count_mhc']]
    else:
        return None
        # Look into VDJdb
        #hit = set(itertools.product(row['cdr3_lst_TRB'],row['peptide_lst'])).intersection(vdjdb)
        #if len(hit) == 1:
        #    peptide = list(hit)[0][1]
        #    for peptide_HLA in row['peptide_HLA_lst']:
        #        if peptide in peptide_HLA:
        #            break
        #    return list(hit)[0][0], peptide_HLA
        #elif len(hit) > 1:
        #    print(hit)
        #else:
        #    return None

In [12]:
def annotate_TRA_TRB(row, threshold):
    # Too uncertain imputations?!
    query = reference[(reference['peptide_HLA'] == row['peptide_HLA'])].copy()
    # Find out if any of the annotated CDR3 seqs are in the reference
    
    if row.delta_umi_TRA < threshold:
        query['hit_TRA'] = query['cdr3_TRA'].isin(row['cdr3_lst_TRA'])
    else:
        query['hit_TRA'] = query['cdr3_TRA'].isin([row['cdr3_TRA']])
    if row.delta_umi_TRB < threshold:
        query['hit_TRB'] = query['cdr3_TRB'].isin(row['cdr3_lst_TRB'])
    else:
        query['hit_TRB'] = query['cdr3_TRB'].isin([row['cdr3_TRB']])
    
    hit = query[query.hit_TRA & query.hit_TRB].copy()

    if len(hit) == 1:
        tra, trb = hit[['cdr3_TRA', 'cdr3_TRB']].values[0]
        ia, ib = row['cdr3_lst_TRA'].index(tra), row['cdr3_lst_TRB'].index(trb)
        return tra, row['umi_count_lst_TRA'][ia], trb, row['umi_count_lst_TRB'][ib]
    elif len(hit) > 1:
        return row[['cdr3_TRA', 'umi_count_TRA', 'cdr3_TRB', 'umi_count_TRB']]
    else:
        return None

In [13]:
def annotate_cdr3_TRA_NA(row, threshold):
    assert row.cdr3_TRA != np.nan
    
    hit = reference[(reference.peptide_HLA == row.peptide_HLA) & (reference['cdr3_TRB'] == row['cdr3_TRB'])].copy()

    if len(hit) == 1:
        return hit[['cdr3_TRA', 'umi_count_TRA']].values[0]
    else:
        return None

In [14]:
def annotate_cdr3_TRB_NA(row, threshold):
    assert row.cdr3_TRB != np.nan
    
    hit = reference[(reference.peptide_HLA == row.peptide_HLA) & (reference['cdr3_TRA'] == row['cdr3_TRA'])].copy()

    if len(hit) == 1:
        return hit[['cdr3_TRB', 'umi_count_TRB']].values[0]
    else:
        return None

In [15]:
def assign_clonotype(tcr_df):
    clonotype_variables = ['cdr3_TRA','cdr3_TRB']
    tcr_df.loc[:, clonotype_variables] = tcr_df.loc[:, clonotype_variables].fillna('unknown')
    new_clonotype = tcr_df.groupby(clonotype_variables).gem.unique().to_frame()
    new_clonotype['n_gems'] = new_clonotype.gem.apply(len)
    new_clonotype.sort_values(by='n_gems', ascending=False, inplace=True)
    dct = new_clonotype.to_dict()['gem']
    for i, k in enumerate(dct.keys(), start=1): 
        dct[k] = i
    return tcr_df.set_index(clonotype_variables).index.map(dct)

In [55]:
EXP = "exp3"

In [56]:
concordance_threshold = 0.5
threshold_delta_umi = 100

# Input

OBS! version 2.2

In [57]:
library = "/Volumes/tuba/herpov/tcr-pmhc-sc-project/data/exp3_TCR/library/CDR3_beta1_29_20.xlsx"

In [58]:
TCR_BARCODE = "/Volumes/tuba/herpov/tcr-pmhc-sc-project/data/" + EXP + "_CAT_IONTORRENT_KMA_AKB/tables/tcr_barcode.cleaned.csv"
ORIGINAL    = "/Volumes/tuba/herpov/tcr-pmhc-sc-project/data/" + EXP + "_TCR/processed/cellranger_out/TCR_VDJ/outs/all_contig_annotations.csv"
SIM_TRA     = "/Volumes/tuba/herpov/tcr-pmhc-sc-project/kernel_similarity_tra.{}.tab".format(EXP)
SIM_TRB     = "/Volumes/tuba/herpov/tcr-pmhc-sc-project/kernel_similarity_trb.{}.tab".format(EXP)

# Load

In [59]:
lib = pd.read_excel(library, names=['cdr3_TRB', 'HLA', 'peptide', 'cdr3_len'])
lib = lib[lib.peptide.isin(lib.peptide.value_counts()[lib.peptide.value_counts() > 100].index)]

In [60]:
#unique_gems = np.loadtxt(GEMS, dtype=str)

In [61]:
df = pd.read_csv(TCR_BARCODE, converters=converters)#, converters=converters
#df = df[df.gem.isin(unique_gems)]
calc_binding_concordance(df, 'ct')

In [62]:
df['chain_count_TRA'] = df.apply(lambda x: len(x.cdr3_lst_TRA) if x.cdr3_lst_TRA[0] != '' else 0, axis=1)
df['chain_count_TRB'] = df.apply(lambda x: len(x.cdr3_lst_TRB) if x.cdr3_lst_TRB[0] != '' else 0, axis=1)

In [63]:
sim_tra = pd.read_csv(SIM_TRA, index_col=0)
sim_trb = pd.read_csv(SIM_TRB, index_col=0)

In [64]:
tcr_df = pd.read_csv(ORIGINAL)

# Main Verify

Verify after imputation? Thus, we get another measure for how likely the imputations are? Or compare before and after? How many of the reference sequences are in the VDJdb?

In [65]:
vdjdb = set(list(zip(lib.cdr3_TRB, lib.peptide)))

In [66]:
df['VDJdb_verified'] = df.apply(lambda x: (x.cdr3_TRB, x.peptide) in vdjdb, axis=1)

# Main Impute

In [67]:
colT = ['gem', 'num_clonotype', 'ct',
        'cdr3_TRA', 'umi_count_TRA','cdr3_lst_TRA', 'umi_count_lst_TRA', 'cdr3_TRB', 'cdr3_lst_TRB', 'umi_count_lst_TRB',
        'binding_concordance', 'peptide_HLA', 'peptide_HLA_lst', 'multiplets_mhc']

colR = ['gem', 'ct', 'cdr3_TRA','umi_count_TRA', 'cdr3_TRB','umi_count_TRB', 'peptide_HLA', 'umi_count_mhc', 'peptide']

colB = ['gem', 'ct', 'cdr3_TRA', 'cdr3_TRB', 'peptide_HLA']

## Annotate unambiguous specificities (reference specificities)

In [68]:
query_filter = ((df.multiplets_mhc == 1) &
                (df.chain_count_TRA == 1) &
                (df.chain_count_TRB == 1) & (df.binding_concordance >= concordance_threshold)) #(df.binding_concordance == 1)

### Reference

In [69]:
reference = df[query_filter].drop_duplicates(subset=['cdr3_TRA', 'cdr3_TRB', 'peptide_HLA']).loc[:, colR]
reference.sort_values(by='peptide_HLA')#.head(50)

Unnamed: 0,gem,ct,cdr3_TRA,umi_count_TRA,cdr3_TRB,umi_count_TRB,peptide_HLA,umi_count_mhc,peptide
2366,TCTATTGGTAGCGTAG-1,2318.0,CAVDAGKLIF,6.0,CASSQDYNEQFF,18.0,AGYLMELCC A0201,1.0,AGYLMELCC
1881,GGGAGATAGGCAGGTT-1,2077.0,CVVLGQNFVF,1.0,CASSPTYGGGPNSPLHF,6.0,AKYLMELTM A0201,1.0,AKYLMELTM
1029,CGAACATAGACTACAA-1,1505.0,CAYRSFSYNDMRF,4.0,CASSPILREGPGGELFF,4.0,AMLGTHTMEV A0201,1.0,AMLGTHTMEV
907,CCACTACCATGGAATA-1,263.0,CAVLMDSNYQLIW,6.0,CASSADGMNTEAFF,8.0,CLGGLLTMV A0201,3.0,CLGGLLTMV
1622,GCAAACTTCTTGTACT-1,3918.0,CAVSGAGSYQLTF,1.0,CASSLEGQASSYEQYF,24.0,CLGGLLTMV A0201,1.0,CLGGLLTMV
...,...,...,...,...,...,...,...,...,...
1549,GAGCAGATCGGTGTCG-1,1162.0,CAMREGGNFNKFYF,2.0,CSVGQALYNEQFF,6.0,YVLDHLIVV A0201,3.0,YVLDHLIVV
240,ACGCCGATCAGGCCCA-1,2185.0,CAVKDTDKLIF,1.0,CASSQAGVYYGYTF,7.0,YVLDHLIVV A0201,2.0,YVLDHLIVV
803,CAGGTGCTCCAGTATG-1,890.0,CALRYNTDKLIF,1.0,CASSWTGSYEQYF,10.0,YVLDHLIVV A0201,1.0,YVLDHLIVV
524,ATCATCTAGAGTGAGA-1,1580.0,CAYRRGGATNKLIF,7.0,CSATTWTSGGLTDTQYF,13.0,YVLDHLIVV A0201,1.0,YVLDHLIVV


### uniquely imputed TRA and TRB

In [70]:
df.loc[query_filter, 'imputed_TRA'] = df[query_filter].cdr3_TRA
df.loc[query_filter, 'imputed_TRB'] = df[query_filter].cdr3_TRB
df.loc[query_filter, 'imputed_PEP'] = df[query_filter].peptide_HLA
df.loc[query_filter, 'imputed_umi_TRA'] = df[query_filter].umi_count_TRA
df.loc[query_filter, 'imputed_umi_TRB'] = df[query_filter].umi_count_TRB
df.loc[query_filter, 'imputed_umi_PEP'] = df[query_filter].umi_count_mhc
df.loc[query_filter, 'imputation_category'] = 'unique'

In [71]:
assert len(df.imputation_category.dropna()) == sum(query_filter)

In [72]:
df.loc[query_filter,colR + ['imputed_TRA', 'imputed_TRB', 'imputed_PEP', 'imputation_category']].dropna(subset=colR)

Unnamed: 0,gem,ct,cdr3_TRA,umi_count_TRA,cdr3_TRB,umi_count_TRB,peptide_HLA,umi_count_mhc,peptide,imputed_TRA,imputed_TRB,imputed_PEP,imputation_category
5,AAACGGGAGTTCCACA-1,567.0,CAMREGETSYDKVIF,17.0,CASRYGLLGGATDTQYF,73.0,VTEHDTLLY A0101,1.0,VTEHDTLLY,CAMREGETSYDKVIF,CASRYGLLGGATDTQYF,VTEHDTLLY A0101,unique
8,AAACGGGTCATCATTC-1,2885.0,CAVIQGDSWGKLQF,5.0,CASSLGFSAYAGELFF,11.0,ITDQVPFSV A0201,1.0,ITDQVPFSV,CAVIQGDSWGKLQF,CASSLGFSAYAGELFF,ITDQVPFSV A0201,unique
12,AAAGATGCAATAGAGT-1,2596.0,CATEENTGFQKLVF,38.0,CASTLRSYNEQFF,64.0,SLAAYIPRL A0201,1.0,SLAAYIPRL,CATEENTGFQKLVF,CASTLRSYNEQFF,SLAAYIPRL A0201,unique
15,AAAGATGGTCGGCTCA-1,3103.0,CAASGGGSTLGRLYF,1.0,CASRMTALTEAFF,11.0,YSEHPTFTSQY A0101,1.0,YSEHPTFTSQY,CAASGGGSTLGRLYF,CASRMTALTEAFF,YSEHPTFTSQY A0101,unique
23,AAAGCAAGTGACTCAT-1,3375.0,CAGASGGGSQGNLIF,37.0,CASSRRSSYEQYF,36.0,GILGFVFTL A0201,1.0,GILGFVFTL,CAGASGGGSQGNLIF,CASSRRSSYEQYF,GILGFVFTL A0201,unique
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2685,TTGCGTCGTTCAACCA-1,35.0,CAARGGAQKLVF,2.0,CASSLEFDRRPYEQYF,4.0,VTEHDTLLY A0101,1.0,VTEHDTLLY,CAARGGAQKLVF,CASSLEFDRRPYEQYF,VTEHDTLLY A0101,unique
2696,TTGTAGGGTTGCTCCT-1,2402.0,CAARLIQGAQKLVF,18.0,CASSFRGSGEKLFF,46.0,ELAGIGILTV A0201,2.0,ELAGIGILTV,CAARLIQGAQKLVF,CASSFRGSGEKLFF,ELAGIGILTV A0201,unique
2714,TTTCCTCTCGAATGGG-1,3505.0,CGTETNTGNQFYF,1.0,CASIRGSGANVLTF,1.0,QIDVSQFGSY A0101,4.0,QIDVSQFGSY,CGTETNTGNQFYF,CASIRGSGANVLTF,QIDVSQFGSY A0101,unique
2720,TTTGGTTAGACACTAA-1,351.0,CAVCSDYKLSF,19.0,CASGDGGNEQFF,55.0,KTWGQYWQV A0201,3.0,KTWGQYWQV,CAVCSDYKLSF,CASGDGGNEQFF,KTWGQYWQV A0201,unique


In [73]:
hit = df[query_filter].peptide_HLA

In [74]:
stats = pd.DataFrame(index=['potential', 'imputations', 'percent', 'corrected'])

In [75]:
stats['ref'] = compute_stats('peptide_HLA')
stats

Unnamed: 0,ref
potential,551.0
imputations,551.0
percent,100.0
corrected,0.0


## Annotate multiplets in one category

In [76]:
query_filter = ((df.multiplets_mhc == 1) & (df.chain_count_TRA > 1) & (df.chain_count_TRB == 1 ))

hit = df[query_filter].apply(lambda row: annotate_cdr3_TRA(row, threshold_delta_umi), axis=1).dropna()

df[['imputed_TRA', 'imputed_umi_TRA']] = hit.apply(pd.Series)

df.loc[hit.index, 'imputation_category'] = 'TRA'

stats['TRA'] = compute_stats('cdr3_TRA')

In [77]:
df.loc[(df.umi_count_mhc > 1) & (df.imputation_category == 'TRA') & (df.cdr3_TRA!=df.imputed_TRA),
       colB + ['imputed_TRA']].to_csv("stats/{}/{}/{}".format(EXP, 'imputed_new-clonotype', 'imputed_TRA.csv'), index=False)

In [78]:
query_filter = ((df.multiplets_mhc == 1) & (df.chain_count_TRA == 1) & (df.chain_count_TRB > 1 ))

hit = df[query_filter].apply(lambda row: annotate_cdr3_TRB(row, threshold_delta_umi), axis=1).dropna()

df[['imputed_TRB', 'imputed_umi_TRB']] = hit.apply(pd.Series)

df.loc[hit.index, 'imputation_category'] = 'TRB'

stats['TRB'] = compute_stats('cdr3_TRB')

In [79]:
df.loc[(df.umi_count_mhc > 1) & (df.imputation_category == 'TRB') & (df.cdr3_TRB!=df.imputed_TRB),
       colB + ['imputed_TRB']].to_csv("stats/{}/{}/{}".format(EXP, 'imputed_new-clonotype', 'imputed_TRB.csv'), index=False)

In [80]:
query_filter = ((df.multiplets_mhc > 1) & (df.chain_count_TRA == 1) & (df.chain_count_TRB == 1 ))

hit = df[query_filter].apply(lambda row: annotate_peptide(row, threshold_delta_umi), axis=1).dropna()

df[['imputed_PEP', 'imputed_umi_PEP']] = hit.apply(pd.Series)

df.loc[hit.index, 'imputation_category'] = 'PEP'

stats['PEP'] = compute_stats('peptide_HLA')

In [81]:
df.loc[(df.imputed_umi_PEP > 1) & (df.imputation_category == 'PEP') & (df.peptide_HLA!=df.imputed_PEP),
       colB + ['imputed_PEP','imputed_umi_PEP']].to_csv("stats/{}/{}/{}".format(EXP, 'imputed_new-clonotype', 'imputed_PEP.csv'), index=False)

In [82]:
stats

Unnamed: 0,ref,TRA,TRB,PEP
potential,551.0,129.0,116.0,717.0
imputations,551.0,11.0,12.0,219.0
percent,100.0,8.53,10.34,30.54
corrected,0.0,7.0,1.0,21.0


In [83]:
len(df.imputation_category.dropna())

793

## Annotate NAs in TCRs

In [84]:
query_filter = ((df.multiplets_mhc == 1) & (df.chain_count_TRA == 0) & (df.chain_count_TRB == 1 ))

hit = df[query_filter].apply(lambda row: annotate_cdr3_TRA_NA(row, threshold_delta_umi), axis=1).dropna()

df[['imputed_TRA', 'imputed_umi_TRA']] = hit.apply(pd.Series)

df.loc[hit.index, 'imputation_category'] = 'TRA_NA'

stats['TRA'] = compute_stats('cdr3_TRA')

In [85]:
kept_idx = df.loc[hit.index, 'umi_count_mhc'].apply(lambda x: True if x > 1 else False)
df.loc[kept_idx[kept_idx].index, 'gem'].to_list()

['ACCTTTAGTCATGCCG-1',
 'ACGATGTCACCGCTAG-1',
 'AGTGAGGGTTCTCATT-1',
 'ATAACGCGTGATAAAC-1',
 'ATAGACCAGATGTAAC-1',
 'CACCAGGCAGGGATTG-1',
 'CAGCATATCACATGCA-1',
 'CAGCTAAGTCGCTTTC-1',
 'CATATTCTCCGCGGTA-1',
 'CATGGCGAGAGATGAG-1',
 'CATGGCGGTAGCTAAA-1',
 'CCACGGAGTTCATGGT-1',
 'CCTTTCTAGTCAAGGC-1',
 'CTACATTGTCTAAAGA-1',
 'CTACCCACATTAACCG-1',
 'CTTAGGACATGCCTAA-1',
 'GAAATGATCATTGCCC-1',
 'GAATAAGTCTCGAGTA-1',
 'GATGAAATCAGATAAG-1',
 'GCACTCTCAAGACGTG-1',
 'GGACAGAAGCATGGCA-1',
 'GGGCATCAGTCAATAG-1',
 'TACTCATTCCAGGGCT-1',
 'TCAGGTACAAGGCTCC-1',
 'TGAAAGATCATCTGTT-1']

In [86]:
query_filter = ((df.multiplets_mhc == 1) & (df.chain_count_TRA == 1) & (df.chain_count_TRB == 0 ))

hit = df[query_filter].apply(lambda row: annotate_cdr3_TRB_NA(row, threshold_delta_umi), axis=1).dropna()

df[['imputed_TRB', 'imputed_umi_TRB']] = hit.apply(pd.Series)

df.loc[hit.index, 'imputation_category'] = 'TRB_NA'

stats['TRB'] = compute_stats('cdr3_TRB')

In [87]:
stats

Unnamed: 0,ref,TRA,TRB,PEP
potential,551.0,573.0,145.0,717.0
imputations,551.0,61.0,11.0,219.0
percent,100.0,10.65,7.59,30.54
corrected,0.0,61.0,11.0,21.0


In [88]:
len(df.imputation_category.dropna())

865

# Annotate multiplets in 2 categories

Dont run

In [42]:
filter_A = ((df.multiplets_mhc > 1) & (df.chain_count_TRA > 1) & (df.chain_count_TRB == 1 ))
filter_B = ((df.multiplets_mhc > 1) & (df.chain_count_TRA == 1) & (df.chain_count_TRB > 1 ))
filter_P = ((df.multiplets_mhc == 1) & (df.chain_count_TRA > 1) & (df.chain_count_TRB > 1 ))

hit = df[filter_A].apply(lambda row: annotate_TRA_PEP(row, threshold_delta_umi), axis=1).dropna()
df[['imputed_TRA', 'imputed_umi_TRA', 'imputed_PEP', 'imputed_umi_PEP']] = hit.apply(pd.Series)
df[['imputed_TRB', 'imputed_umi_TRB']] = df.loc[hit.index, ['cdr3_TRB', 'umi_count_TRB']]
df.loc[hit.index, 'imputation_category'] = 'TRA_PEP'

hit = df[filter_B].apply(lambda row: annotate_TRB_PEP(row, threshold_delta_umi), axis=1).dropna()
df[['imputed_TRB', 'imputed_umi_TRB', 'imputed_PEP', 'imputed_umi_PEP']] = hit.apply(pd.Series)
df[['imputed_TRA', 'imputed_umi_TRA']] = df.loc[hit.index, ['cdr3_TRA', 'umi_count_TRA']]
df.loc[hit.index, 'imputation_category'] = 'TRB_PEP'

hit = df[filter_P].apply(lambda row: annotate_TRA_TRB(row, threshold_delta_umi), axis=1).dropna()
df[['imputed_TRA', 'imputed_umi_TRA', 'imputed_TRB', 'imputed_umi_TRB']] = hit.apply(pd.Series)
df[['imputed_PEP', 'imputed_umi_PEP']] = df.loc[hit.index, ['peptide_HLA', 'umi_count_mhc']]
df.loc[hit.index, 'imputation_category'] = 'TRA_TRB'

#df.loc[:, colR + ['umi_count_lst_TRA', 'umi_count_lst_mhc','imputed_TRA', 'imputed_TRB']].dropna()

In [71]:
len(df.imputation_category.dropna())

683

# Write outout

df.to_csv('tcr_barcode.cleaned.imputed.csv', index=False)

In [89]:
df_modified = df[['gem','peptide','epitope']].copy()

imputed_indexes = df.imputation_category.dropna().index

In [90]:
df_modified['cdr3_TRA'] = np.where(df.imputed_TRA.isna(), df.cdr3_TRA, df.imputed_TRA)
df_modified['cdr3_TRB'] = np.where(df.imputed_TRB.isna(), df.cdr3_TRB, df.imputed_TRB)
df_modified['peptide_HLA'] = np.where(df.imputed_PEP.isna(), df.peptide_HLA, df.imputed_PEP)
#
df_modified['umi_count_TRA'] = np.where(df.imputed_umi_TRA.isna(), df.umi_count_TRA, df.imputed_umi_TRA)
df_modified['umi_count_TRB'] = np.where(df.imputed_umi_TRB.isna(), df.umi_count_TRB, df.imputed_umi_TRB)
df_modified['umi_count_mhc'] = np.where(df.imputed_umi_PEP.isna(), df.umi_count_mhc, df.imputed_umi_PEP)

In [91]:
df_modified['ct'] = assign_clonotype(df_modified)

In [92]:
cols = ['gem','cdr3_TRA','umi_count_TRA','cdr3_TRB','umi_count_TRB','peptide','umi_count_mhc','peptide_HLA','ct','epitope']

In [93]:
#df_modified.loc[imputed_indexes, cols].to_csv('tcr_barcode.cleaned.imputed.nonimputed_discarded.csv', index=False)
df_modified.loc[:, cols].to_csv('tcr_barcode.cleaned.imputed.{}.csv'.format(EXP), index=False)

# Testing

### Annotate multiplets in one category - Investigate effect of lower similarity threshold

In [41]:
filter_A = ((df.multiplets_mhc == 1) & (df.chain_count_TRA > 1) & (df.chain_count_TRB == 1 ))
filter_B = ((df.multiplets_mhc == 1) & (df.chain_count_TRA == 1) & (df.chain_count_TRB > 1 ))
filter_P = ((df.multiplets_mhc > 1) & (df.chain_count_TRA == 1) & (df.chain_count_TRB == 1 ))

for threshold in [1.0, 0.99, 0.9]:
    print(threshold)
    for chain, query_filter in [('TRA', filter_A), ('TRB', filter_B), ('PEP', filter_P)]:
        
        if chain == 'TRA':
            hit = df[query_filter].apply(lambda row: annotate_cdr3_TRA(row, threshold), axis=1).dropna()

            #df.loc[hit.index, 'imputed_TRA'] = hit
            #df.loc[hit.index, 'imputed_TRB'] = df.loc[hit.index].cdr3_TRB

            s['TRA'][threshold] = compute_stats('cdr3_TRA')
            
        elif chain == 'TRB':
            hit = df[query_filter].apply(lambda row: annotate_cdr3_TRB(row, threshold), axis=1).dropna()

            #df.loc[hit.index, 'imputed_TRA'] = df.loc[hit.index].cdr3_TRA
            #df.loc[hit.index, 'imputed_TRB'] = hit
            
            s['TRB'][threshold] = compute_stats('cdr3_TRB')
            
        else:
            hit = df[query_filter].apply(lambda row: annotate_peptide(row, threshold), axis=1).dropna()
            
            s['PEP'][threshold] = compute_stats('peptide_HLA')

1.0
0.99
0.9


In [49]:
s['TRA']

Unnamed: 0,ref,1.0,0.99,0.9
potential,397.0,140.0,140.0,571.0
imputations,397.0,9.0,9.0,9.0
percent,100.0,6.43,6.43,1.58
corrected,0.0,2.0,2.0,2.0


In [50]:
s['TRB']

Unnamed: 0,ref,1.0,0.99,0.9
potential,397.0,167.0,167.0,571.0
imputations,397.0,27.0,27.0,27.0
percent,100.0,16.17,16.17,4.73
corrected,0.0,5.0,5.0,5.0


In [51]:
s['PEP']

Unnamed: 0,ref,1.0,0.99,0.9
potential,397.0,571.0,571.0,571.0
imputations,397.0,161.0,161.0,161.0
percent,100.0,28.2,28.2,28.2
corrected,0.0,15.0,15.0,15.0


### Annotate NAs in TCRs - Investigate effect of lower similarity threshold

In [72]:
filter_A = ((df.multiplets_mhc == 1) & (df.chain_count_TRA == 0) & (df.chain_count_TRB == 1 ))
filter_B = ((df.multiplets_mhc == 1) & (df.chain_count_TRA == 1) & (df.chain_count_TRB == 0 ))

for threshold in [1.0, 0.99, 0.9]:
    print(threshold)
    for chain, query_filter in [('TRA', filter_A), ('TRB', filter_B)]:
        #df['imputed_TRA'] = np.nan
        #df['imputed_TRB'] = np.nan
        #df['imputed_PEP'] = np.nan
        
        if chain == 'TRA':
            hit = df[query_filter].apply(lambda row: annotate_cdr3_TRA_NA(row, threshold), axis=1).dropna()
            
            #df.loc[hit.index, 'imputed_TRA'] = hit
            #df.loc[hit.index, 'imputed_TRB'] = df.loc[hit.index].cdr3_TRB

            s['TRA'][threshold] = compute_stats('cdr3_TRA')
            
        elif chain == 'TRB':
            hit = df[query_filter].apply(lambda row: annotate_cdr3_TRB_NA(row, threshold), axis=1).dropna()

            #df.loc[hit.index, 'imputed_TRA'] = df.loc[hit.index].cdr3_TRA
            #df.loc[hit.index, 'imputed_TRB'] = hit
            
            s['TRB'][threshold] = compute_stats('cdr3_TRB')

1.0
0.99
0.9


In [73]:
s['TRA']

Unnamed: 0,ref,1.0,0.99,0.9
potential,397.0,497.0,497.0,497.0
imputations,397.0,105.0,105.0,105.0
percent,100.0,21.13,21.13,21.13
corrected,0.0,105.0,105.0,105.0


In [74]:
s['TRB']

Unnamed: 0,ref,1.0,0.99,0.9
potential,397.0,83.0,83.0,83.0
imputations,397.0,7.0,7.0,7.0
percent,100.0,8.43,8.43,8.43
corrected,0.0,7.0,7.0,7.0


For every query filter, impute the ambiguous annotations.

In [167]:
filter_A = ((df.multiplets_mhc == 1) & (df.chain_count_TRA > 1) & (df.chain_count_TRB == 1 ))
filter_B = ((df.multiplets_mhc == 1) & (df.chain_count_TRA == 1) & (df.chain_count_TRB > 1 ))
filter_P = ((df.multiplets_mhc > 1) & (df.chain_count_TRA == 1) & (df.chain_count_TRB == 1 ))

threshold = 1

hit = df[filter_A].apply(lambda row: annotate_cdr3_TRA(row, threshold), axis=1).dropna()

df.loc[hit.index, colR + ['cdr3_lst_TRA']].merge(hit.to_frame(), left_index=True, right_index=True).sort_values(by='ct')


Unnamed: 0,gem,ct,cdr3_TRA,cdr3_TRB,peptide_HLA,cdr3_lst_TRA,0
1460,GCCAAATCATGTAAGA-1,2.0,CAVRSAYSGAGSYQLTF,CASRPRVAGGRNEQFF,NLVPMVATV A0201,"[CAIRLNRDDKIIF, CAVRSAYSGAGSYQLTF]",CAVRSAYSGAGSYQLTF
922,CGAGAAGTCATATCGG-1,2.0,CAVRSAYSGAGSYQLTF,CASRPRVAGGRNEQFF,YSEHPTFTSQY A0101,"[CATDGNARLMF, CAVRSAYSGAGSYQLTF]",CAVRSAYSGAGSYQLTF
2178,TGGCGCACATCCAACA-1,3.0,CAAKSDSGGGADGLTF,CASSAWTSNRDEQFF,YSEHPTFTSQY A0101,"[CAGASDSWGKLQF, CAAKSDSGGGADGLTF]",CAAKSDSGGGADGLTF
1970,TCAGCTCCAAGAGGCT-1,3.0,CAAKSDSGGGADGLTF,CASSAWTSNRDEQFF,YSEHPTFTSQY A0101,"[CAYSFSGTYKYIF, CAAKSDSGGGADGLTF]",CAAKSDSGGGADGLTF
973,CGCTATCAGTATGACA-1,3.0,CAAKSDSGGGADGLTF,CASSAWTSNRDEQFF,SLAAYIPRL A0201,"[CAVRSAYSGAGSYQLTF, CAAKSDSGGGADGLTF]",CAAKSDSGGGADGLTF
89,AAGGAGCAGCTAAGAT-1,3.0,CAAKSDSGGGADGLTF,CASSAWTSNRDEQFF,VTEHDTLLY A0101,"[CAVGGSGYSTLTF, CAAKSDSGGGADGLTF]",CAAKSDSGGGADGLTF
412,AGGTCATTCCCAGGTG-1,3.0,CAAKSDSGGGADGLTF,CASSAWTSNRDEQFF,SLAAYIPRL A0201,"[CALSPITQGGSEKLVF, CAAKSDSGGGADGLTF]",CAAKSDSGGGADGLTF
636,CACATTTAGTACCGGA-1,6.0,CAGARSYQLTF,CASSPLSLNTEAFF,VLEETSVML A0201,"[CAYSFSGTYKYIF, CAGARSYQLTF]",CAGARSYQLTF
2039,TCTATTGTCTGCTGTC-1,6.0,CAGARSYQLTF,CASSPLSLNTEAFF,AGYLMELCC A0201,"[CAASYSGTYKYIF, CAGARSYQLTF]",CAGARSYQLTF
1935,TATTACCAGATAGTCA-1,6.0,CAGARSYQLTF,CASSPLSLNTEAFF,VTEHDTLLY A0101,"[CTTASWGLKGMNRDDKIIF, CAGARSYQLTF]",CAGARSYQLTF


In [149]:
reference[(reference.cdr3_TRA == 'CAERDSWGKFQF') & (reference.cdr3_TRB == 'CSGDLGRQNTEAFF')]

Unnamed: 0,gem,ct,cdr3_TRA,cdr3_TRB,peptide_HLA
225,ACGGGCTTCGCCATAA-1,15.0,CAERDSWGKFQF,CSGDLGRQNTEAFF,NLVPMVATV A0201


In [73]:
reference[reference.peptide_HLA == 'YSEHPTFTSQY A0101']

Unnamed: 0,gem,ct,cdr3_TRA,cdr3_TRB,peptide_HLA
1,AAACCTGAGTTCGATC-1,24.0,CALNTGGFKTIF,CASSPPFLAGSGSSYEQYF,YSEHPTFTSQY A0101
62,AACTCTTCATCTATGG-1,153.0,CAVSTYGGSQGNLIF,CASSYSRQGMNEQFF,YSEHPTFTSQY A0101
142,ACAGCTACACGTGAGA-1,2.0,CAVRSAYSGAGSYQLTF,CASRPRVAGGRNEQFF,YSEHPTFTSQY A0101
145,ACAGCTATCCGCATCT-1,3.0,CAAKSDSGGGADGLTF,CASSAWTSNRDEQFF,YSEHPTFTSQY A0101
244,ACTGAACAGTGAAGAG-1,817.0,CAESETGANNLFF,CATGIVGQTYEQYF,YSEHPTFTSQY A0101
255,ACTGATGCACCTCGGA-1,3285.0,CISDGGSQGNLIF,CASSLTGLFTGELFF,YSEHPTFTSQY A0101
460,ATAAGAGCATGTCGAT-1,206.0,CVVNTARGYSTLTF,CATANPSGSTDTQYF,YSEHPTFTSQY A0101
497,ATCTACTAGGTGACCA-1,2145.0,CALSEYTGGFKTIF,CASRATSGRAPVGTDTQYF,YSEHPTFTSQY A0101
611,CACACTCGTGACTCAT-1,745.0,CAYRSFYGNNRLAF,CATSSLSTGYTDTQYF,YSEHPTFTSQY A0101
648,CACCACTCATAGAAAC-1,2514.0,CAMNSNSGYALNF,CSARVLGDTSYEQYF,YSEHPTFTSQY A0101


In [120]:
df.loc[(df.ct == 15), colR + ['cdr3_lst_TRA','cdr3_lst_TRB', 'peptide_HLA_lst']].head(50)

Unnamed: 0,gem,ct,cdr3_TRA,cdr3_TRB,peptide_HLA,cdr3_lst_TRA,cdr3_lst_TRB,peptide_HLA_lst
225,ACGGGCTTCGCCATAA-1,15.0,CAERDSWGKFQF,CSGDLGRQNTEAFF,NLVPMVATV A0201,[CAERDSWGKFQF],[CSGDLGRQNTEAFF],[NLVPMVATV A0201]
546,ATTCTACAGCGTGAAC-1,15.0,CAERDSWGKFQF,CSGDLGRQNTEAFF,NLVPMVATV A0201,[CAERDSWGKFQF],[CSGDLGRQNTEAFF],[NLVPMVATV A0201]
568,CAACCTCGTGAGGGTT-1,15.0,CAERDSWGKFQF,CSGDLGRQNTEAFF,NLVPMVATV A0201,[CAERDSWGKFQF],[CSGDLGRQNTEAFF],"[VTEHDTLLY A0101, SLAAYIPRL A0201, NLVPMVATV A..."
678,CAGAGAGTCACTCTTA-1,15.0,CAERDSWGKFQF,CSGDLGRQNTEAFF,YSEHPTFTSQY A0101,[CAERDSWGKFQF],[CSGDLGRQNTEAFF],"[FLRGRAYGL B0801, NLVPMVATV A0201, VTEHDTLLY A..."
805,CCAGCGACACATCCAA-1,15.0,CAERDSWGKFQF,CSGDLGRQNTEAFF,NLVPMVATV A0201,"[CAVGVRILSLVPEPDCPCCPGGKLIF, CAERDSWGKFQF]","[CASSAWTSNRDEQFF, CSGDLGRQNTEAFF]","[VSDGGPNLY A0101, NLVPMVATV A0201]"
1102,CTACATTGTCGTTGTA-1,15.0,CAERDSWGKFQF,CSGDLGRQNTEAFF,NLVPMVATV A0201,[CAERDSWGKFQF],[CSGDLGRQNTEAFF],"[VTEHDTLLY A0101, VSDGGPNLY A0101, NLVPMVATV A..."
1275,GAACCTACATCTATGG-1,15.0,CAERDSWGKFQF,CSGDLGRQNTEAFF,NLVPMVATV A0201,[CAERDSWGKFQF],"[CSVEDIRDEQYF, CSGDLGRQNTEAFF]","[VTEHDTLLY A0101, NLVPMVATV A0201]"
1287,GAATAAGCAAAGCGGT-1,15.0,CAERDSWGKFQF,CSGDLGRQNTEAFF,NLVPMVATV A0201,[CAERDSWGKFQF],"[CASRPRVAGGRNEQFF, CSGDLGRQNTEAFF]",[NLVPMVATV A0201]
1363,GATCAGTTCCGTCATC-1,15.0,CAERDSWGKFQF,CSGDLGRQNTEAFF,SLAAYIPRL A0201,"[CAAGARSASKIIF, CAERDSWGKFQF]","[CATWGAGGLEQYV, CSGDLGRQNTEAFF]",[SLAAYIPRL A0201]
1411,GCAAACTTCTTGCATT-1,15.0,CAERDSWGKFQF,CSGDLGRQNTEAFF,NLVPMVATV A0201,[CAERDSWGKFQF],[CSGDLGRQNTEAFF],[NLVPMVATV A0201]


## Annotate BC multiplets (unique TCRs)

In [372]:
query_filter = ((df.multiplets_mhc > 1) &
                (df.chain_count_TRA == 1) &
                (df.chain_count_TRB == 1 )) #(df.binding_concordance == 1)

In [381]:
for index, row in df[query_filter].iterrows():
    print(index, row.cdr3_TRA, row.cdr3_TRB, row.peptide_HLA_lst)
    hit = annotate_peptide(row, 0.9)
    print(hit)

6 CAARPGAQKLVF CASSLEGGGTPYEQYF ['VTEHDTLLY A0101', 'NLVPMVATV A0201']
VTEHDTLLY A0101
15 CALDVDTGGFKTIF CASGPADTQYF ['NLVPMVATV A0201', 'VTEHDTLLY A0101']
None
28 CAGARSYQLTF CASSPLSLNTEAFF ['SLAAYIPRL A0201', 'AKYLMELTM A0201']
None
40 CALRRGAGGGNKLTF CASSFEQDSESSYNSPLHF ['SLAAYIPRL A0201', 'VTEHDTLLY A0101']
None
53 CAVRDNNQGGKLIF CSAHPPGQGWEKLFF ['YSEHPTFTSQY A0101', 'VTEHDTLLY A0101']
VTEHDTLLY A0101
57 CAVRSAYSGAGSYQLTF CASRPRVAGGRNEQFF ['SLLMWITQA A0201', 'FRQKTNLIL C0702']
FRQKTNLIL C0702
60 CAARPGAQKLVF CASSLEGGGTPYEQYF ['YSEHPTFTSQY A0101', 'VTEHDTLLY A0101']
VTEHDTLLY A0101
67 CVVSDLRATSGTYKYIF CSASPRISYNEQFF ['VTEHDTLLY A0101', 'YSEHPTFTSQY A0101']
None
69 CAASFSGTYKYIF CASSLVAYNEQFF ['VTEHDTLLY A0101', 'YSEHPTFTSQY A0101', 'CLLWSFQTSA A0201']
VTEHDTLLY A0101
73 CAVSSTGGTSYGKLTF CASSFGTAYEQYF ['VVPCEPPEV A0201', 'YLEPGPVTA A0201']
None
76 CAAKGGSEKLVF CASSLQGVGAKNIQYF ['NLVPMVATV A0201', 'VTEHDTLLY A0101']
None
86 CAARPGAQKLVF CASSLEGGGTPYEQYF ['NLVPMVATV A0201', 'VTEHDTLLY

VTEHDTLLY A0101
940 CAVRDNNQGGKLIF CSAHPPGQGWEKLFF ['GLCTLVAML A0201', 'VTEHDTLLY A0101']
VTEHDTLLY A0101
952 CAAKSDSGGGADGLTF CASSAWTSNRDEQFF ['KTWGQYWQV A0201', 'SLAAYIPRL A0201']
SLAAYIPRL A0201
956 CAARGGAQKLVF CASSLEFDRRPYEQYF ['QIDVSQFGSY A0101', 'FLYALALLL A0201', 'VTEHDTLLY A0101']
VTEHDTLLY A0101
959 CAGARSYQLTF CASSPLSLNTEAFF ['ILLLFLAIFI A0201', 'YSEHPTFTSQY A0101', 'FILLLFLAIFI A0201', 'ELRSRYWAI B0801', 'VTEHDTLLY A0101']
                     gem   ct     cdr3_TRA        cdr3_TRB  \
81    AAGCCGCAGTGATCGG-1  6.0  CAGARSYQLTF  CASSPLSLNTEAFF   
1606  GGCTCGACAGGTGGAT-1  6.0  CAGARSYQLTF  CASSPLSLNTEAFF   

                      cdr3_comb      peptide_HLA   hit  
81    CAGARSYQLTFCASSPLSLNTEAFF  VTEHDTLLY A0101  True  
1606  CAGARSYQLTFCASSPLSLNTEAFF  ELRSRYWAI B0801  True  
VTEHDTLLY A0101
976 CAGPSYNTDKLIF CSGIVDYGYTF ['KTWGQYWQV A0201', 'RLAKEWQAF B1501', 'QIDVSQFGSY A0101']
QIDVSQFGSY A0101
981 CAAKSDSGGGADGLTF CASSAWTSNRDEQFF ['SLAAYIPRL A0201', 'LEKARGSTY B1501']
SLAAY

None
1369 CAAGGGGNKLTF CASSWRGSSSYEQYF ['FLYALALLL A0201']
None
1373 CAVSAPGEKIYNQGGKLIF CASSRPRDRSPYNSPLHF ['NLVPMVATV A0201', 'VSDGGPNLY A0101', 'VTEHDTLLY A0101']
None
1376 CAASFQGTYKYIF CASSSLNRREDHNEQFF ['SLAAYIPRL A0201', 'QIDVSQFGSY A0101', 'NLVPMVATV A0201', 'VTEHDTLLY A0101']
None
1382 CAYSFSGTYKYIF CASSLLGTSGTGNEQFF ['MLAVISCAV A0201', 'GPISGHVLK A1101', 'YSEHPTFTSQY A0101', 'CLLWSFQTSA A0201']
                     gem   ct       cdr3_TRA           cdr3_TRB  \
283   ACTTGTTCAATGTTGC-1  9.0  CAYSFSGTYKYIF  CASSLLGTSGTGNEQFF   
1579  GGAGCAAAGGGCACTA-1  9.0  CAYSFSGTYKYIF  CASSLLGTSGTGNEQFF   

                           cdr3_comb        peptide_HLA   hit  
283   CAYSFSGTYKYIFCASSLLGTSGTGNEQFF   CLLWSFQTSA A0201  True  
1579  CAYSFSGTYKYIFCASSLLGTSGTGNEQFF  YSEHPTFTSQY A0101  True  
CLLWSFQTSA A0201
1386 CAAKSDSGGGADGLTF CASSAWTSNRDEQFF ['GPISGHVLK A1101', 'SLAAYIPRL A0201']
                     gem   ct          cdr3_TRA         cdr3_TRB  \
243   ACTGAACAGGGTGTTG-1  3.0  CAAKS

904  CAGARSYQLTFCASSPLSLNTEAFF  AMLGTHTMEV A0201  True  
AMLGTHTMEV A0201
1902 CAVRDNNQGGKLIF CSAHPPGQGWEKLFF ['GILGFVFTL A0201', 'VTEHDTLLY A0101']
VTEHDTLLY A0101
1905 CAVVNRDDKIIF CASSPGGPQSQQFF ['SLPPPGTRV A0201', 'KTWGQYWQV A0201']
KTWGQYWQV A0201
1913 CAGARSYQLTF CASSPLSLNTEAFF ['ALVDAGVPM A0201', 'GPISGHVLK A1101']
                    gem   ct     cdr3_TRA        cdr3_TRB  \
442  AGTTGGTAGCTAGGCA-1  6.0  CAGARSYQLTF  CASSPLSLNTEAFF   
782  CCAATCCGTCCGCTGA-1  6.0  CAGARSYQLTF  CASSPLSLNTEAFF   

                     cdr3_comb      peptide_HLA   hit  
442  CAGARSYQLTFCASSPLSLNTEAFF  ALVDAGVPM A0201  True  
782  CAGARSYQLTFCASSPLSLNTEAFF  GPISGHVLK A1101  True  
GPISGHVLK A1101
1915 CAARPGAQKLVF CASSLEGGGTPYEQYF ['YSEHPTFTSQY A0101', 'VTEHDTLLY A0101']
VTEHDTLLY A0101
1927 CAARPGAQKLVF CASSLEGGGTPYEQYF ['GILGFVFTL A0201', 'YSEHPTFTSQY A0101', 'VTEHDTLLY A0101']
VTEHDTLLY A0101
1931 CAAKSDSGGGADGLTF CASSAWTSNRDEQFF ['GQGGSPTAM B1501', 'ASCMGLIY B3501', 'SLAAYIPRL A0201']
SLAAYIPRL 