In [47]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [48]:
# Apply matplotlib's 'ggplot' style sheet to figures created after this call.
plt.style.use('ggplot')

# Args

In [49]:
# Run identifiers that are concatenated into the data paths defined further down.
PLATFORM = "IONTORRENT"  # platform tag; part of the "_CAT_" directory name
EXP = "exp3"  # experiment tag; prefix of every data directory used in this notebook
PRJ = "specificity_matrix"  # NOTE(review): not referenced in the visible part of the notebook

In [50]:
# Tags that are concatenated into the "_CAT_" directory name of the merged-annotation
# input and of the output table.
MAPPING = 'KMA' # alternative value noted by the author: 'BLAST'
BARCODE_SYSTEM = 'AKB' # alternative value noted by the author: '10x'

## Input data

**Note:** the input paths below point to version 2.2 of the data (directories suffixed `_2.2`).

In [None]:
# Path of the library workbook inside the "<EXP>_TCR_2.2" data directory.
library = "".join([
    "/Volumes/tuba/herpov/tcr-pmhc-sc-project/data/",
    EXP,
    "_TCR_2.2/library/CDR3_beta1_29_20.xlsx",
])

In [51]:
# Path of the merged TCR/barcode table; the directory name is assembled from
# EXP, PLATFORM, MAPPING and BARCODE_SYSTEM and carries the "_2.2" suffix.
merged_annotations = "".join([
    "/Volumes/tuba/herpov/tcr-pmhc-sc-project/data/",
    EXP,
    "_CAT_",
    PLATFORM,
    "_",
    MAPPING,
    "_",
    BARCODE_SYSTEM,
    "_2.2/tables/tcr_barcode.xlsx",
])

## Output

In [52]:
# Destination of the cleaned table written at the end of the notebook.
# NOTE(review): unlike the two input paths, this directory name carries no "_2.2"
# suffix — confirm that writing to the unversioned directory is intended.
output = "".join([
    "/Volumes/tuba/herpov/tcr-pmhc-sc-project/data/",
    EXP,
    "_CAT_",
    PLATFORM,
    "_",
    MAPPING,
    "_",
    BARCODE_SYSTEM,
    "/tables/tcr_barcode.augmented.csv",
])

# Load

In [None]:
# Read the library workbook into a DataFrame.
# NOTE(review): `lib` is not referenced again in the visible part of the notebook and
# this cell has no execution count — confirm whether it is still needed.
lib = pd.read_excel(library)

In [53]:
# Load the merged annotation table. The single_TRA / single_TRB columns are read as
# generic objects instead of letting pandas infer their dtype.
clonotype_barcode_specificity_df = pd.read_excel(
    merged_annotations,
    dtype=dict(single_TRA=object, single_TRB=object),
)

In [54]:
# Report, one per line: shape of the raw table, number of distinct GEMs, and shape of
# the subset that has a gem, a clonotype and a template_id_mhc value.
print(
    clonotype_barcode_specificity_df.shape,
    clonotype_barcode_specificity_df.gem.unique().shape,
    clonotype_barcode_specificity_df.dropna(subset=['gem', 'clonotype', 'template_id_mhc']).shape,
    sep='\n',
)

(7876, 144)
(7876,)
(3430, 144)


# Preprocessing

In [55]:
# Keep only the rows flagged as a credible MHC alignment. The explicit comparison with
# True also excludes rows where the flag is missing. .copy() detaches the subset so that
# the column assignments in later cells modify an independent frame.
credible_df = clonotype_barcode_specificity_df[
    clonotype_barcode_specificity_df.credible_alignment_mhc == True
].copy()

# Numeric part of the clonotype label ('clonotype42' -> 42). Missing or unparseable
# labels become 0 so the cleaning cell can drop them via `num_clonotype == 0`.
# This derivation must stay active: the cleaning cell reads `credible_df.num_clonotype`
# and fails on a fresh kernel if the column is absent.
credible_df['num_clonotype'] = (
    pd.to_numeric(
        credible_df['clonotype'].fillna('None').str.split('clonotype').str[1],
        errors='coerce',
    )
    .fillna(0)
    .astype(int)
)

In [56]:
# Report, one per line: shape of the credible subset, its number of distinct GEMs, and
# the same two figures for rows that have a gem, a clonotype and a template_id_mhc value.
print(
    credible_df.shape,
    credible_df.gem.unique().shape,
    credible_df.dropna(subset=['gem', 'clonotype', 'template_id_mhc']).shape,
    credible_df.dropna(subset=['gem', 'clonotype', 'template_id_mhc']).gem.unique().shape,
    sep='\n',
)

(7166, 145)
(7166,)
(3430, 145)
(3430,)


## Cleaning dataframe
Remove rows without a usable clonotype (NaN/None, i.e. `num_clonotype == 0`) and rows whose epitope is `'0'` or `'Unnamed: 39'`.

In [57]:
# Drop, in place, every row that has num_clonotype equal to 0 or whose epitope is one of
# the placeholder strings '0' / 'Unnamed: 39'.
# NOTE(review): 'Unnamed: 39' looks like a stray spreadsheet column header — confirm.
credible_df.drop(
    index=credible_df.index[
        credible_df.num_clonotype.eq(0)
        | credible_df.epitope.isin(['0', 'Unnamed: 39'])
    ],
    inplace=True,
)

In [58]:
# Shape of the cleaned table and its number of distinct GEMs, one per line.
print(credible_df.shape, credible_df.gem.unique().shape, sep='\n')

(2670, 145)
(2670,)


## Annotate GEMs with single/multiple T-cells

In [59]:
# A GEM is marked single_tcell only when both the single_TRA and the single_TRB flag hold.
credible_df['single_tcell'] = np.where(
    credible_df.single_TRA & credible_df.single_TRB,
    True,
    False,
)

## Computing sum of TRA and TRB UMIs

In [60]:
# Element-wise sum of the TRA and TRB UMI counts, computed on the underlying arrays
# (positional, no index alignment).
credible_df['umis_tcr'] = np.add(
    credible_df.umis_TRA.values,
    credible_df.umis_TRB.values,
)

## Produce y-tick labels

In [61]:
# Label = text of `peptide` before its first underscore, a space, then the HLA value.
credible_df['peptide_HLA'] = (
    credible_df.peptide.str.split('_', n=1).str.get(0)
    + ' '
    + credible_df.HLA
)

## Binding concordance

In [62]:
def calc_binding_concordance(df):
    """Annotate every row with how consistently its clonotype pairs with its epitope.

    Three columns are added:

    * ``gems_per_specificity`` - number of non-null ``gem`` values that share the
      row's (clonotype, epitope) pair.
    * ``gems_per_clonotype`` - number of non-null ``gem`` values that share the
      row's clonotype.
    * ``binding_concordance`` - ``gems_per_specificity / gems_per_clonotype``.

    The function is safe to call repeatedly: derived columns left over from an
    earlier call are discarded and recomputed, so re-running the notebook cell
    does not produce ``_x`` / ``_y`` suffixed duplicates.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``gem``, ``clonotype`` and ``epitope``.
        The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame in the input's row order with a fresh default index (a
        consequence of the merges) and the three derived columns appended.
        Rows whose clonotype or epitope is missing get NaN in the derived
        columns, because groupby skips missing keys by default.

    Raises
    ------
    AssertionError
        If ``df`` has no elements.
    """
    assert df.size > 0, "df empty"

    # Discard results of a previous call; otherwise the merges below would emit
    # suffixed duplicates and the attribute access on the derived columns would fail.
    derived_columns = ['gems_per_specificity', 'gems_per_clonotype', 'binding_concordance']
    df = df.drop(columns=derived_columns, errors='ignore')

    gems_per_specificity_df = (
        df.groupby(['clonotype', 'epitope']).gem.count()
        .rename('gems_per_specificity')
        .reset_index()
    )
    gems_per_clonotype_df = (
        df.groupby(['clonotype']).gem.count()
        .rename('gems_per_clonotype')
        .reset_index()
    )

    # Left merges keep every input row, in order, and attach the group counts.
    df = (
        df.merge(gems_per_specificity_df, on=['clonotype', 'epitope'], how='left')
        .merge(gems_per_clonotype_df, on='clonotype', how='left')
    )
    df['binding_concordance'] = df.gems_per_specificity / df.gems_per_clonotype

    return df

In [63]:
# Rebind credible_df to the frame returned by calc_binding_concordance, which carries the
# added gems_per_specificity, gems_per_clonotype and binding_concordance columns.
credible_df = calc_binding_concordance(credible_df)

## Sanity check

In [64]:
# Every GEM is expected to map to exactly one clonotype. This cell only reports:
# if any GEM violates that, the per-GEM True/False table is printed; nothing is filtered.
if credible_df.groupby(['gem']).clonotype.nunique().ne(1).any():
    print(credible_df.groupby(['gem']).clonotype.nunique() == 1)

In [65]:
# Shape of the annotated table and its number of distinct GEMs, one per line.
print(credible_df.shape, credible_df.gem.unique().shape, sep='\n')

(2670, 151)
(2670,)


In [66]:
# Show the annotated table as the cell output.
credible_df

Unnamed: 0,gem,clonotype,cdr3_TRA,cdr3_nt_TRA,cdr3_TRB,cdr3_nt_TRB,umis_TRA,umis_diff_TRA,single_TRA,umis_lst_TRA,...,v7,v8,v9,num_clonotype,single_tcell,umis_tcr,peptide_HLA,gems_per_specificity,gems_per_clonotype,binding_concordance
0,AAACCTGAGTTCGATC-1,clonotype9,CALNTGGFKTIF,TGTGCGCTGAATACTGGAGGCTTCAAAACTATCTTT,CASSPPFLAGSGSSYEQYF,TGTGCCAGCAGCCCCCCCTTTTTAGCTGGTAGCGGGAGCTCCTACG...,4.0,1.000,True,[4],...,,,15.0,9,True,8.0,YSEHPTFTSQY A0101,2,11,0.181818
1,AAACCTGCAGCATGAG-1,clonotype520,,,CASSLEGRDTEAFF,TGTGCCAGCAGCTTGGAGGGAAGGGACACTGAAGCTTTCTTT,,,,,...,,,,520,False,,VTEHDTLLY A0101,1,1,1.000000
2,AAACCTGGTCTTGTCC-1,clonotype2,CAAKSDSGGGADGLTF,TGTGCAGCAAAATCGGATTCAGGAGGAGGTGCTGACGGACTCACCTTT,CASSAWTSNRDEQFF,TGTGCCAGCAGCGCCTGGACTAGTAATCGGGATGAGCAGTTCTTC,3.0,1.000,True,[3],...,,,,2,True,8.0,IPSINVHHY B3501,1,203,0.004926
3,AAACGGGAGTTCCACA-1,clonotype177,CAMREGETSYDKVIF,TGTGCAATGAGAGAGGGGGAAACCTCCTACGACAAGGTGATATTT,CASRYGLLGGATDTQYF,TGTGCCAGCAGATACGGCCTGTTGGGGGGGGCGACAGATACGCAGT...,17.0,1.000,True,[17],...,,,,177,True,97.0,VTEHDTLLY A0101,1,1,1.000000
4,AAACGGGCAGGTCTCG-1,clonotype4,CAARPGAQKLVF,TGTGCAGCAAGACCGGGAGCCCAGAAGCTGGTATTT,CASSLEGGGTPYEQYF,TGTGCCAGCAGCTTAGAGGGAGGGGGGACCCCCTACGAGCAGTACTTC,5.0,1.000,True,[5],...,,,,4,False,16.0,NLVPMVATV A0201,2,107,0.018692
5,AAACGGGTCATCATTC-1,clonotype525,CAVIQGDSWGKLQF,TGTGCCGTGATTCAGGGTGACAGCTGGGGGAAATTGCAGTTT,CASSLGFSAYAGELFF,TGTGCCAGCAGTCTAGGTTTTTCGGCATACGCCGGGGAGCTGTTTTTT,5.0,1.000,True,[5],...,,,,525,False,17.0,ITDQVPFSV A0201,1,1,1.000000
6,AAACGGGTCCAGAGGA-1,clonotype42,CVYNQGGKLIF,TGTGTTTATAACCAGGGAGGAAAGCTTATCTTC,CASSQARDPTGELFF,TGTGCCAGCAGCCAAGCCCGGGACCCGACCGGGGAGCTGTTTTTT,3.0,1.000,True,[3],...,,,26.0,42,True,12.0,YSEHPTFTSQY A0101,1,3,0.333333
7,AAAGATGAGGGCTTGA-1,clonotype527,CLVGYNTDKLIF,TGCCTCGTGGGGTATAACACCGACAAGCTCATCTTT,CASSEDRGPYNGETQYF,TGTGCCAGCAGCGAGGACAGGGGTCCCTACAATGGGGAGACCCAGT...,2.0,0.000,False,[2 2],...,,,68.0,527,False,19.0,YSEHPTFTSQY A0101,1,1,1.000000
8,AAAGATGAGGTGACCA-1,clonotype528,CARNTGNQFYF,TGTGCCCGGAACACCGGTAACCAGTTCTATTTT,CASSYQTGAAYGYTF,TGTGCCAGCAGTTATCAGACAGGGGCTGCCTATGGCTACACCTTC,4.0,0.250,False,[3 4],...,,,,528,False,9.0,NLVPMVATV A0201,1,1,1.000000
9,AAAGATGAGTGTGAAT-1,clonotype95,,,CASSLVLLGDNEQFF,TGTGCCAGCAGCTTAGTGTTACTAGGGGACAATGAGCAGTTCTTC,,,,,...,,,,95,False,,RAKFKQLL B0801,2,3,0.666667


# Write clean table

In [67]:
# Write the cleaned, annotated table as CSV to the `output` path.
# to_csv is called with its defaults, so the DataFrame index is written as the first,
# unnamed column.
credible_df.to_csv(output)

# Extra?
Should I only include GEMs with both a TRA and a TRB?

Should I exclude GEMs where a TCR chain could not be annotated unambiguously?