In [1]:
import os
import re
from ast import literal_eval

import numpy as np
import pandas as pd

In [2]:
def HLA_cd8_converter(x):
    """Parse a stringified list from the CSV into a list of strings.

    Deletes every '[', ']', ',' and "'" character and splits what is left on
    single spaces. An empty input therefore yields [''].
    """
    drop_punctuation = str.maketrans("", "", "[],'")
    return x.translate(drop_punctuation).split(" ")

def cdr3_lst_converter(x):
    """Parse a stringified list from the CSV into a list of strings.

    Brackets, quotes and commas are removed, then the remainder is split on
    single spaces (so consecutive spaces produce empty entries).
    """
    cleaned = x
    for char in ("[", "]", "'", ","):
        cleaned = cleaned.replace(char, "")
    return cleaned.split(" ")

def epitope_converter(x):
    """Split a stringified list on its single quotes and keep the quoted items.

    Brackets and newlines are removed first. Pieces that are empty or a lone
    space (the gaps between quoted items) are discarded; any other separator
    text, such as ', ', is kept as its own element.
    """
    stripped = x.translate(str.maketrans("", "", "[]\n"))
    return [piece for piece in stripped.split("'") if piece not in ("", " ")]

def peptide_hla_converter(x):
    """Extract 'PEPTIDE HLA' entries from a stringified list.

    Brackets, newlines and single quotes are removed, then every match of
    "<word chars> <one word char><digits>" (e.g. 'NLVPMVATV A0201') is
    returned, in order of appearance. Returns [] when nothing matches.
    """
    cleaned = x.replace("[", "").replace("]", "").replace("\n", "").replace("'", "")
    # Raw string: in a normal literal "\w", "\s" and "\d" are invalid escape
    # sequences, which newer Python versions warn about at compile time.
    return re.findall(r"\w+\s{1}\w{1}\d+", cleaned)

def literal_converter(val):
    """Evaluate a cell as a Python literal; an empty cell becomes an empty list."""
    if val == '':
        return []
    return literal_eval(val)

# Per-column parsers handed to pd.read_csv so list-like columns are read as lists.
converters = {
    'peptide_HLA_lst': peptide_hla_converter,
    'umi_count_lst_mhc': literal_eval,
    'umi_count_lst_cd8': literal_converter,
    'umi_count_lst_TRA': literal_converter,
    'umi_count_lst_TRB': literal_converter,
    'cdr3_lst_TRA': cdr3_lst_converter,
    'cdr3_lst_TRB': cdr3_lst_converter,
    'genes_lst_TRA': epitope_converter,
    'genes_lst_TRB': epitope_converter,
    'HLA_lst_mhc': cdr3_lst_converter,
    'HLA_cd8': HLA_cd8_converter,
    'sample_id_lst': epitope_converter,
}

# Input

In [84]:
# Experiment / run identifiers; they are interpolated into the input and output paths below.
exp = 'exp13'  # alternative: 'exp10x'
run = 'run3'  # alternative: 'run1'

In [85]:
# Input files for the selected experiment/run.
# DATA is the table read into `df`; THR is the threshold table read into `thr`.
DATA = f"../../experiments/{exp}/{run}/cat/eval_clonotypes/valid_ct.csv"
THR = f"../../experiments/{exp}/{run}/cat/eval_clonotypes/threshold/opt.csv"

In [86]:
OUT_DIR = f"../tmp_files/publication_data/{exp}/"

# Variables

In [87]:
# Columns selected for every exported table (raw / flt / hla / tcr), in output order.
OUT_COL = (
    ['gem', 'clonotype', 'ct']
    # TCR gene annotations
    + ['genes_TRA', 'genes_lst_TRA', 'genes_TRB', 'genes_lst_TRB']
    # CDR3 sequences
    + ['cdr3_TRA', 'cdr3_lst_TRA', 'cdr3_TRB', 'cdr3_lst_TRB']
    # TCR UMI counts (not exported: 'cell_flag')
    + ['umi_count_TRA', 'umi_count_lst_TRA', 'umi_count_TRB', 'umi_count_lst_TRB']
    # pMHC barcodes
    + ['peptide_HLA', 'peptide_HLA_lst', 'umi_count_mhc', 'umi_count_lst_mhc']
    # Sample hashing / HLA (not exported: 'HLA_lst_cd8', 'HLA_pool_cd8')
    + ['sample_id', 'sample_id_lst', 'umi_count_cd8', 'umi_count_lst_cd8', 'HLA_cd8', 'HLA_match']
    # Clonotype validity and database checks
    + ['valid_ct', 'ct_pep', 'VDJdb_pep', 'VDJdb_check']
    + ['delta_umi_mhc']
)

# Load

In [88]:
df = pd.read_csv(DATA, converters=converters)

In [89]:
thr = pd.read_csv(THR, header=None, index_col=0, names=['thr'])

In [90]:
def _hla_is_matched(row):
    """True when the row's HLA_mhc value is contained in its HLA_cd8 value."""
    return row.HLA_mhc in row.HLA_cd8

# Evaluated row by row on the parsed (list-valued) HLA_cd8 column.
hla_match = df.apply(_hla_is_matched, axis=1)
df['HLA_match'] = hla_match # exp10x #OBS!

In [91]:
# exp13 only (OBS): manually override the delta-UMI thresholds loaded from THR.
# Write through .loc on the frame itself. `thr.thr.<label> = value` is chained
# assignment on an intermediate Series, which is not guaranteed to propagate
# back to `thr` (and never does under pandas Copy-on-Write).
thr.loc['delta_umi_mhc', 'thr'] = 1
thr.loc['delta_umi_TRA', 'thr'] = 0
thr.loc['delta_umi_TRB', 'thr'] = 0

In [92]:
thr.thr

umi_count_mhc        2.0
delta_umi_mhc        1.0
umi_count_mhc_rel    NaN
umi_count_cd8        NaN
delta_umi_cd8        NaN
umi_count_TRA        0.0
delta_umi_TRA        0.0
umi_count_TRB        0.0
delta_umi_TRB        0.0
Name: thr, dtype: float64

## Convert list-variables to strings

In [93]:
# Every column whose name contains 'lst', plus HLA_cd8 (also list-valued after loading).
lst_cols = [name for name in df.columns if 'lst' in name] + ['HLA_cd8']

In [94]:
# Flatten each list-valued column to a '|'-separated string; empty results become NaN.
for col in lst_cols:
    joined = df[col].fillna('').apply(lambda items: "|".join(map(str, items)))
    df[col] = joined.replace('', np.nan)

In [95]:
raw = df[OUT_COL]

In [96]:
# Row mask: every listed UMI metric must reach its threshold from `thr`.
# NOTE(review): umi_count_TRB is not part of this filter although umi_count_TRA is — confirm this is intended.
umi_filter = (
    df.umi_count_mhc.ge(thr.thr.umi_count_mhc)
    & df.delta_umi_mhc.ge(thr.thr.delta_umi_mhc)
    & df.umi_count_TRA.ge(thr.thr.umi_count_TRA)
    & df.delta_umi_TRA.ge(thr.thr.delta_umi_TRA)
    & df.delta_umi_TRB.ge(thr.thr.delta_umi_TRB)
)

In [97]:
flt = df.loc[umi_filter, OUT_COL]

In [98]:
hla = df.loc[umi_filter & hla_match, OUT_COL]

In [99]:
# Keep only GEMs that pass the UMI and HLA filters and have both an alpha and a beta CDR3.
has_both_chains = df.cdr3_TRA.notna() & df.cdr3_TRB.notna()
tcr = df.loc[umi_filter & hla_match & has_both_chains, OUT_COL]

In [100]:
# Row count and number of distinct GEMs after each filtering stage.
for stage in (raw, flt, hla, tcr):
    print(len(stage), stage.gem.unique().shape)

6073 (6073,)
4986 (4986,)
4135 (4135,)
2833 (2833,)


In [101]:
tcr[tcr.peptide_HLA.str.startswith('RVR')].groupby('ct').size().value_counts()[1]

605

In [102]:
raw.ct.unique().shape

(2441,)

In [103]:
flt.ct.unique().shape

(2060,)

In [104]:
hla.ct.unique().shape

(1494,)

In [105]:
tcr.ct.unique().shape

(1031,)

# Write output

In [172]:
# Create the output folder on first use; to_csv raises if the directory is missing.
os.makedirs(OUT_DIR, exist_ok=True)
raw.to_csv(OUT_DIR + 'raw.csv', index=False)

In [173]:
flt.to_csv(OUT_DIR + 'opt_thr.csv', index=False)

In [174]:
hla.to_csv(OUT_DIR + 'hla_match.csv', index=False)

In [175]:
tcr.to_csv(OUT_DIR + 'tcr.csv', index=False)

# Counts per pMHC

In [28]:
tcr.groupby(['peptide_HLA']).ct.unique().apply(len)

peptide_HLA
CLGGLLTMV A0201       26
FLYALALLL A0201       18
GILGFVFTL A0201        2
GLCTLVAML A0201        5
NLVPMVATV A0201        4
RPHERNGFTV B0702      14
RPHERNGFTVL B0702     40
RPPIFIRRL B0702       21
RVRAYTYSK A0301      691
TPRVTGGGAM B0702     135
TPSVSSSISSL B0702     19
VLEETSVML A0201       20
YVLDHLIVV A0201       89
Name: ct, dtype: int64

In [36]:
tcr.groupby('peptide_HLA').gem.size()

peptide_HLA
CLGGLLTMV A0201      106
FLYALALLL A0201       33
GILGFVFTL A0201        2
GLCTLVAML A0201        5
NLVPMVATV A0201        4
RPHERNGFTV B0702      14
RPHERNGFTVL B0702    274
RPPIFIRRL B0702      110
RVRAYTYSK A0301      965
TPRVTGGGAM B0702     603
TPSVSSSISSL B0702     37
VLEETSVML A0201      402
YVLDHLIVV A0201      278
Name: gem, dtype: int64

In [60]:
tcr[tcr.peptide_HLA == "TPRVTGGGAM B0702"].genes_TRA.str.split(';', expand=True)[0].value_counts()

TRAV14/DV4      194
TRAV17          174
TRAV3            78
TRAV21           26
TRAV8-2          16
TRAV19           12
TRAV2            12
TRAV12-3         10
TRAV1-2           8
TRAV38-2/DV8      7
TRAV12-2          5
TRAV13-1          5
TRAV25            5
TRAV8-3           5
TRAV12-1          5
TRAV29/DV5        4
TRAV30            4
TRAV13-2          4
TRAV5             3
TRAV9-2           3
TRAV10            2
TRAV20            2
TRAV26-2          2
TRAV8-6           2
TRAV22            2
TRAV16            2
TRAV26-1          2
TRAV35            2
TRAV34            1
TRAV36/DV7        1
TRAV38-1          1
TRAV41            1
TRAV1-1           1
TRAV24            1
TRAV27            1
Name: 0, dtype: int64

In [64]:
clones = tcr.loc[tcr.peptide_HLA == "TPRVTGGGAM B0702", ["ct","genes_TRA", "genes_TRB"]].drop_duplicates(subset=["genes_TRA", "genes_TRB"])
clones.sort_values(by='genes_TRA')

Unnamed: 0,ct,genes_TRA,genes_TRB
3392,3.0,TRAV1-1;TRAJ10;TRAC,TRBV5-1;;TRBJ2-5;TRBC2
3406,2229.0,TRAV1-2;TRAJ17;TRAC,TRBV2;;TRBJ2-5;TRBC2
696,77.0,TRAV1-2;TRAJ17;TRAC,TRBV20-1;;TRBJ1-2;TRBC1
2919,288.0,TRAV1-2;TRAJ24;TRAC,TRBV6-1;TRBD1;TRBJ1-1;TRBC1
3613,421.0,TRAV1-2;TRAJ40;TRAC,TRBV30;TRBD1;TRBJ1-1;TRBC1
...,...,...,...
3174,833.0,TRAV8-6;TRAJ30;TRAC,TRBV6-1;;TRBJ2-7;TRBC2
870,5.0,TRAV8-6;TRAJ45;TRAC,TRBV28;;TRBJ2-7;TRBC2
2139,4335.0,TRAV9-2;TRAJ13;TRAC,TRBV6-5;;TRBJ2-1;TRBC2
3445,329.0,TRAV9-2;TRAJ17;TRAC,TRBV5-6;TRBD1;TRBJ1-5;TRBC1


In [65]:
f = clones.apply(lambda row: ('TRBJ2-7' in row.genes_TRB), axis=1)
clones[f]

Unnamed: 0,ct,genes_TRA,genes_TRB
71,32.0,TRAV17;TRAJ12;TRAC,TRBV7-9;TRBD1;TRBJ2-7;TRBC2
176,34.0,TRAV17;TRAJ12;TRAC,TRBV7-9;;TRBJ2-7;TRBC2
320,1053.0,TRAV34;TRAJ30;TRAC,TRBV7-9;;TRBJ2-7;TRBC2
463,876.0,TRAV12-2;TRAJ5;TRAC,TRBV5-1;TRBD1;TRBJ2-7;TRBC2
548,1137.0,TRAV13-1;TRAJ11;TRAC,TRBV19;TRBD1;TRBJ2-7;TRBC2
649,164.0,TRAV25;TRAJ35;TRAC,TRBV27;;TRBJ2-7;TRBC2
665,259.0,TRAV12-3;TRAJ44;TRAC,TRBV27;;TRBJ2-7;TRBC2
870,5.0,TRAV8-6;TRAJ45;TRAC,TRBV28;;TRBJ2-7;TRBC2
1246,366.0,TRAV19;TRAJ53;TRAC,TRBV27;;TRBJ2-7;TRBC2
1291,13.0,TRAV16;TRAJ18;TRAC,TRBV14;;TRBJ2-7;TRBC2


In [52]:
tcr[tcr.peptide_HLA == "RPHERNGFTVL B0702"].groupby('ct').size()

ct
3.0       146
12.0       36
16.0       14
33.0       18
35.0        1
61.0        8
62.0        3
74.0        5
122.0       1
126.0       3
166.0       3
175.0       5
186.0       1
194.0       2
197.0       1
212.0       2
240.0       1
361.0       2
388.0       1
606.0       1
794.0       1
846.0       1
905.0       1
907.0       1
914.0       1
1215.0      1
1403.0      1
1582.0      1
1755.0      1
1756.0      1
1761.0      1
1813.0      1
1825.0      1
2126.0      1
2552.0      1
2709.0      1
3538.0      1
4610.0      1
4713.0      1
5225.0      1
dtype: int64

# Negative GEM count

### Database matches

In [44]:
# NOTE(review): this path is hard-coded to exp13/run5 instead of being built from `exp`/`run`
# (set to exp13/run3 in the Input section) — confirm the different run is intended.
db = pd.read_csv("../../experiments/exp13/run5/res/tables/tcr_barcode.valid.csv")

In [51]:
db.loc[
    db.peptide_HLA.isin(['NLVPMVATV A0201', 'GILGFVFTL A0201','GLCTLVAML A0201']),
    ["ct", "peptide_HLA", 'tcrdb_pep', 'tcrdb_check']
].dropna()

Unnamed: 0,ct,peptide_HLA,tcrdb_pep,tcrdb_check
2166,1984.0,NLVPMVATV A0201,['NLVPMVATV'],True
2219,478.0,GLCTLVAML A0201,['GLCTLVAML'],True
2706,278.0,GLCTLVAML A0201,['GLCTLVAML'],True
3671,1985.0,NLVPMVATV A0201,['NLVPMVATV'],True


In [53]:
raw.loc[~raw.VDJdb_check.isna(), ['ct','peptide_HLA','VDJdb_pep','VDJdb_check','umi_count_mhc']]

Unnamed: 0,ct,peptide_HLA,VDJdb_pep,VDJdb_check,umi_count_mhc
543,1140.0,FLYALALLL A0201,['FLYALALLL'],True,17.0
1015,76.0,FLYALALLL A0201,['FLYALALLL'],True,38.0
1696,76.0,FLYALALLL A0201,['FLYALALLL'],True,22.0
1870,1984.0,NLVPMVATV A0201,['NLVPMVATV'],True,26.0
1894,76.0,FLYALALLL A0201,['FLYALALLL'],True,46.0
1916,478.0,GLCTLVAML A0201,['GLCTLVAML'],True,23.0
2332,278.0,GLCTLVAML A0201,['GLCTLVAML'],True,23.0
2947,76.0,FLYALALLL A0201,['FLYALALLL'],True,33.0
3131,1985.0,NLVPMVATV A0201,['NLVPMVATV'],True,24.0
3337,574.0,RVRAYTYSK A0301,['GLCTLVAML'],False,11.0


In [41]:
raw.loc[
    raw.peptide_HLA.isin(['NLVPMVATV A0201', 'GILGFVFTL A0201','GLCTLVAML A0201']),
    ['ct','peptide_HLA','VDJdb_pep','VDJdb_check','umi_count_mhc']
]

Unnamed: 0,ct,peptide_HLA,VDJdb_pep,VDJdb_check,umi_count_mhc
492,28.0,NLVPMVATV A0201,,,1.0
521,848.0,GILGFVFTL A0201,,,26.0
722,472.0,NLVPMVATV A0201,,,9.0
795,604.0,GILGFVFTL A0201,,,9.0
837,2302.0,GLCTLVAML A0201,,,21.0
1024,573.0,GLCTLVAML A0201,,,15.0
1269,2190.0,GLCTLVAML A0201,,,1.0
1478,225.0,GLCTLVAML A0201,,,4.0
1870,1984.0,NLVPMVATV A0201,['NLVPMVATV'],True,26.0
1916,478.0,GLCTLVAML A0201,['GLCTLVAML'],True,23.0


In [42]:
flt.loc[
    flt.peptide_HLA.isin(['NLVPMVATV A0201', 'GILGFVFTL A0201','GLCTLVAML A0201']),
    ['ct','peptide_HLA','VDJdb_pep','VDJdb_check']
]

Unnamed: 0,ct,peptide_HLA,VDJdb_pep,VDJdb_check
521,848.0,GILGFVFTL A0201,,
722,472.0,NLVPMVATV A0201,,
795,604.0,GILGFVFTL A0201,,
837,2302.0,GLCTLVAML A0201,,
1024,573.0,GLCTLVAML A0201,,
1478,225.0,GLCTLVAML A0201,,
1870,1984.0,NLVPMVATV A0201,['NLVPMVATV'],True
1916,478.0,GLCTLVAML A0201,['GLCTLVAML'],True
2332,278.0,GLCTLVAML A0201,['GLCTLVAML'],True
3029,22.0,GLCTLVAML A0201,,


In [43]:
hla.loc[
    hla.peptide_HLA.isin(['NLVPMVATV A0201', 'GILGFVFTL A0201','GLCTLVAML A0201']),
    ['ct','peptide_HLA','VDJdb_pep','VDJdb_check']
]

Unnamed: 0,ct,peptide_HLA,VDJdb_pep,VDJdb_check
521,848.0,GILGFVFTL A0201,,
722,472.0,NLVPMVATV A0201,,
795,604.0,GILGFVFTL A0201,,
837,2302.0,GLCTLVAML A0201,,
1024,573.0,GLCTLVAML A0201,,
1478,225.0,GLCTLVAML A0201,,
1870,1984.0,NLVPMVATV A0201,['NLVPMVATV'],True
1916,478.0,GLCTLVAML A0201,['GLCTLVAML'],True
2332,278.0,GLCTLVAML A0201,['GLCTLVAML'],True
3029,22.0,GLCTLVAML A0201,,


In [40]:
tcr.loc[
    tcr.peptide_HLA.isin(['NLVPMVATV A0201', 'GILGFVFTL A0201','GLCTLVAML A0201']),
    ['ct','peptide_HLA','VDJdb_pep','VDJdb_check']
].dropna()

Unnamed: 0,ct,peptide_HLA,VDJdb_pep,VDJdb_check
1870,1984.0,NLVPMVATV A0201,['NLVPMVATV'],True
1916,478.0,GLCTLVAML A0201,['GLCTLVAML'],True
2332,278.0,GLCTLVAML A0201,['GLCTLVAML'],True
3131,1985.0,NLVPMVATV A0201,['NLVPMVATV'],True


### Counting GEMs

In [106]:
raw.peptide_HLA.unique()

array(['RPHERNGFTVL B0702', 'RVRAYTYSK A0301', 'CLGGLLTMV A0201',
       'TPRVTGGGAM B0702', 'YVLDHLIVV A0201', 'TPSVSSSISSL B0702',
       'VLEETSVML A0201', 'RPHERNGFTV B0702', 'RPPIFIRRL B0702',
       'FLYALALLL A0201', 'NLVPMVATV A0201', 'GILGFVFTL A0201',
       'GLCTLVAML A0201'], dtype=object)

In [117]:
# Peptides to look for among the non-assigned pMHC barcodes of each GEM.
# Only the last assignment takes effect: the first list is overwritten immediately
# and acts as an alternative query that can be re-enabled by reordering the lines.
query_peps = ['NLVPMVATV A0201', 'GILGFVFTL A0201','GLCTLVAML A0201']
query_peps = ['CLGGLLTMV A0201']

In [118]:
f1 = raw.peptide_HLA_lst.apply(lambda x: any([p in x for p in query_peps]))
f2 = raw.peptide_HLA.isin(query_peps)

In [119]:
def get_idx(list1, list2):
    """Return the highest position in list1 whose value also occurs in list2.

    Raises ValueError when no element of list1 is found in list2.
    """
    positions = []
    for pos, item in enumerate(list1):
        for candidate in list2:
            if item == candidate:
                positions.append(pos)
    return max(positions)

In [120]:
def get_val(list1, list2):
    """Join the elements of list1 that occur in list2, plus list1's last element, with ', '.

    An element is repeated once per equal entry in list2. The last element of
    list1 is always appended, even if it was already matched.
    """
    hits = []
    for item in list1:
        hits.extend(item for candidate in list2 if item == candidate)
    hits.append(list1[-1])
    return ", ".join(hits)

In [121]:
neg = raw[~f2 & f1].copy()

In [122]:
neg['pep_idx'] = neg.peptide_HLA_lst.str.split('|').apply(lambda x: get_idx(x, query_peps))

In [123]:
# 'pep': the query peptides present in this GEM's peptide list, followed by the list's last entry.
neg['pep'] = neg.peptide_HLA_lst.str.split('|').apply(lambda x: get_val(x, query_peps))
# 'pep_umi': the UMI counts at position pep_idx and at the last position of umi_count_lst_mhc, as "a, b".
neg['pep_umi'] = neg.apply(lambda row: ", ".join([row.umi_count_lst_mhc.split('|')[i] for i in [int(row.pep_idx), -1]]), axis=1)
# 'pep_del': last count divided by (query-peptide count + 0.25).
# NOTE(review): presumably the lists are ordered by ascending UMI count so the last entry is the
# assigned pMHC, and 0.25 is a pseudocount — confirm against the upstream script.
neg['pep_del'] = neg.pep_umi.str.split(", ").apply(lambda x: float(x[-1])/(float(x[0])+0.25))

In [131]:
# `neg` is already raw[~f2 & f1]; indexing it again with the raw-length mask selected the same
# rows but forced pandas to reindex the boolean key (UserWarning), so the mask is dropped here.
neg.apply(lambda row: row.umi_count_lst_mhc.split('|')[int(row.pep_idx)], axis=1).value_counts()

  """Entry point for launching an IPython kernel.


1.0     523
2.0      78
3.0       8
5.0       2
9.0       2
8.0       2
10.0      1
6.0       1
16.0      1
dtype: int64

In [147]:
neg.loc[neg.pep_del > 0.89, ['pep','pep_umi','pep_del', 'HLA_cd8']].shape #.sort_values(by='pep_del').head(20)

(565, 4)

In [148]:
565+53

618

In [130]:
raw[f2]#.ct.unique())

Unnamed: 0,gem,clonotype,ct,genes_TRA,genes_lst_TRA,genes_TRB,genes_lst_TRB,cdr3_TRA,cdr3_lst_TRA,cdr3_TRB,...,sample_id_lst,umi_count_cd8,umi_count_lst_cd8,HLA_cd8,HLA_match,valid_ct,ct_pep,VDJdb_pep,VDJdb_check,delta_umi_mhc
3,AAACCTGCAGCCAGAA-1,clonotype79,79.0,TRAV21;TRAJ33;TRAC,TRAV21;TRAJ33;TRAC,TRBV10-2;;TRBJ1-1;TRBC1,TRBV10-2;;TRBJ1-1;TRBC1,CAVLMDSNYQLIW,CAVLMDSNYQLIW,CASSADGMNTEAFF,...,"6, 7, 2, 8, 10, 9, 1",4645.0,4.0|4.0|5.0|7.0|15.0|16.0|4645.0,A0201|B0702,True,False,,,,15.200000
27,AAAGATGGTGCAGGTA-1,clonotype6,6.0,TRAV25;TRAJ28;TRAC,TRAV25;TRAJ28;TRAC,TRBV5-1;TRBD1;TRBJ2-7;TRBC2,TRBV5-1;TRBD1;TRBJ2-7;TRBC2,CAVSGAGSYQLTF,CAVSGAGSYQLTF,CASSLEGQASSYEQYF,...,"3, 5, 7, 2, 6, 1, 8, 9, 4, 10",1492.0,2.0|5.0|11.0|12.0|12.0|15.0|17.0|18.0|49.0|1492.0,A0201|A0301|B0702,True,True,CLGGLLTMV A0201,,,10.153846
44,AAATGCCAGAACAACT-1,clonotype36,36.0,TRAV38-1;TRAJ7;TRAC,TRAV38-1;TRAJ7;TRAC,TRBV20-1;;TRBJ2-1;TRBC2,TRBV20-1;;TRBJ2-1;TRBC2,CAFNAVDGNNRLAF,CAFNAVDGNNRLAF,CSAEEWTSGYNEQFF,...,"3, 1, 2, 5, 7, 9, 8, 6, 10, 4",473.0,1.0|2.0|2.0|2.0|3.0|3.0|4.0|5.0|16.0|473.0,A0301|B0702,False,False,CLGGLLTMV A0201,,,3.200000
91,AACTCCCCATGGATGG-1,clonotype6,6.0,TRAV25;TRAJ28;TRAC,TRAV25;TRAJ28;TRAC,TRBV5-1;TRBD1;TRBJ2-7;TRBC2,TRBV5-1;TRBD1;TRBJ2-7;TRBC2,CAVSGAGSYQLTF,CAVSGAGSYQLTF,CASSLEGQASSYEQYF,...,"3, 4, 5, 2, 9, 7, 6, 8, 1, 10",1415.0,1.0|1.0|1.0|3.0|3.0|4.0|5.0|5.0|10.0|1415.0,A0201|A0301|B0702,True,True,CLGGLLTMV A0201,,,16.444444
132,AAGACCTCAACACCTA-1,clonotype6,6.0,TRAV25;TRAJ28;TRAC,TRAV25;TRAJ28;TRAC,TRBV5-1;TRBD1;TRBJ2-7;TRBC2,TRBV5-1;TRBD1;TRBJ2-7;TRBC2,CAVSGAGSYQLTF,CAVSGAGSYQLTF,CASSLEGQASSYEQYF,...,"5, 3, 6, 7, 8, 9, 2, 1, 10",2461.0,1.0|2.0|3.0|4.0|4.0|6.0|11.0|12.0|2461.0,A0201|A0301|B0702,True,True,CLGGLLTMV A0201,,,26.222222
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5898,TGCTGCTAGGTGCTAG-1,,4744.0,,,TRBV4-2;;TRBJ2-7;TRBC2,TRBV4-2;;TRBJ2-7;TRBC2,,,CASSQDVASGSYEQYF,...,"9, 2, 3, 5, 4, 6, 7, 8, 1, 10",36.0,1.0|2.0|2.0|2.0|3.0|4.0|6.0|6.0|9.0|36.0,A0201|A0301|B0702,True,False,,,,4.000000
5914,TGGCGCAGTGGACGAT-1,clonotype6,6.0,,,TRBV5-1;TRBD1;TRBJ2-7;TRBC2,TRBV5-1;TRBD1;TRBJ2-7;TRBC2,,,CASSLEGQASSYEQYF,...,"3, 9, 7, 2, 6, 1, 8, 10",909.0,1.0|2.0|3.0|4.0|6.0|8.0|10.0|909.0,A0201|A0301|B0702,True,True,CLGGLLTMV A0201,,,15.111111
5992,TTCTACAAGTTTAGGA-1,clonotype6,6.0,,,TRBV5-1;TRBD1;TRBJ2-7;TRBC2,TRBV5-1;TRBD1;TRBJ2-7;TRBC2,,,CASSLEGQASSYEQYF,...,"4, 5, 3, 6, 7, 8, 9, 1, 2, 10",881.0,1.0|1.0|2.0|2.0|2.0|2.0|2.0|3.0|3.0|881.0,A0201|A0301|B0702,True,True,CLGGLLTMV A0201,,,18.400000
5993,TTCTACACAAACGCGA-1,clonotype6,6.0,,,TRBV5-1;TRBD1;TRBJ2-7;TRBC2,TRBV5-1;TRBD1;TRBJ2-7;TRBC2,,,CASSLEGQASSYEQYF,...,"2, 6, 4, 8, 1, 3, 9, 10",1404.0,2.0|3.0|4.0|4.0|6.0|6.0|8.0|1404.0,A0201|A0301|B0702,True,True,CLGGLLTMV A0201,,,20.800000


In [125]:
91/107

0.8504672897196262

In [72]:
raw[raw.peptide_HLA.isin(['NLVPMVATV A0201', 'GILGFVFTL A0201','GLCTLVAML A0201'])].groupby('peptide_HLA').gem.size()

peptide_HLA
GILGFVFTL A0201     4
GLCTLVAML A0201    17
NLVPMVATV A0201    12
Name: gem, dtype: int64

## Distribution of outliers

In [33]:
# Make the project's scripts folder importable (path is relative to the notebook's working directory).
import sys  
sys.path.insert(0, '../../scripts')

# NOTE(review): a function with this same name is defined again in the following cell; on a
# top-to-bottom run that later definition replaces the one imported here.
from D_plot_specificity_matrix_utils import calc_binding_concordance

In [27]:
def calc_binding_concordance(df, identifier='ct'):
    """Annotate each row with how concordant its peptide is within its clonotype.

    Adds three columns to `df` (in place) and returns the same object:
      - clonotype_size: number of rows sharing the row's `identifier` value
      - peptide_dist: number of rows sharing both `identifier` and peptide_HLA
      - binding_concordance: peptide_dist / clonotype_size

    Parameters
    ----------
    df : DataFrame with a `peptide_HLA` column and the `identifier` column.
    identifier : name of the grouping column. Defaults to 'ct', so existing
        one-argument calls behave as before, while calls that pass the
        grouping column explicitly (e.g. `calc_binding_concordance(tcr, 'ct')`)
        no longer fail with a TypeError.
    """
    df['clonotype_size'] = df[identifier].map(df.groupby(identifier).size())
    # Look up each row's (identifier, peptide) pair in the per-pair size table.
    pair_sizes = df.groupby([identifier, 'peptide_HLA']).size()
    df['peptide_dist'] = df.set_index([identifier, 'peptide_HLA']).index.map(pair_sizes)
    df['binding_concordance'] = df.peptide_dist / df.clonotype_size
    return df

In [35]:
cbc = calc_binding_concordance(tcr, 'ct')

In [40]:
cbc['label'] = np.where(cbc.binding_concordance > 0.5, 'normal','outlier')

In [30]:
cbc.groupby(['peptide_HLA','label']).size()

peptide_HLA        label  
CLGGLLTMV A0201    normal     103
                   outlier      3
FLYALALLL A0201    normal      32
                   outlier      1
GILGFVFTL A0201    normal       2
GLCTLVAML A0201    normal       3
                   outlier      2
NLVPMVATV A0201    normal       4
RPHERNGFTV B0702   normal      13
                   outlier      1
RPHERNGFTVL B0702  normal     272
                   outlier      2
RPPIFIRRL B0702    normal     107
                   outlier      3
RVRAYTYSK A0301    normal     936
                   outlier     29
TPRVTGGGAM B0702   normal     580
                   outlier     23
TPSVSSSISSL B0702  normal      30
                   outlier      7
VLEETSVML A0201    normal     401
                   outlier      1
YVLDHLIVV A0201    normal     273
                   outlier      5
dtype: int64

In [41]:
sum(cbc.label == 'outlier')

77

In [51]:
sum(cbc[cbc.label == 'outlier'].peptide_HLA_lst.str.split('|').apply(len) > 1)

72

In [54]:
cbc['max_conc'] = cbc.ct.map(cbc.groupby(['ct']).binding_concordance.max())

In [82]:
dct = cbc[cbc.binding_concordance == cbc.max_conc].set_index('ct').peptide_HLA.to_dict()
cbc['pep_conc'] = cbc.ct.map(dct)

In [86]:
cbc[cbc.label=='outlier'].apply(lambda row: row.pep_conc in row.peptide_HLA_lst.split('|'), axis=1).sum()

67

In [87]:
67/72

0.9305555555555556

In [92]:
cbc[cbc.label=='outlier'].apply(lambda row: all([row.pep_conc.split()[-1] in pep_hla for pep_hla in row.peptide_HLA_lst.split('|')]), axis=1).sum()

5

In [93]:
5/72

0.06944444444444445

In [94]:
cbc[cbc.label=='outlier'].apply(lambda row: all([pep_hla.split()[-1] in row.HLA_cd8 for pep_hla in row.peptide_HLA_lst.split('|')]), axis=1).sum()

59

In [95]:
59/72

0.8194444444444444

In [46]:
len(cbc[(cbc.peptide_HLA == 'TPRVTGGGAM B0702') & (cbc.label=='normal')])

580

In [42]:
len(cbc[(cbc.peptide_HLA == 'RVRAYTYSK A0301') & (cbc.label=='normal')])

936

In [43]:
sum(cbc.label=='normal')

2756

In [96]:
len(cbc[(cbc.peptide_HLA == 'RVRAYTYSK A0301') & (cbc.label=='outlier')])/sum(cbc.label=='outlier')

0.37662337662337664

In [97]:
len(cbc[(cbc.peptide_HLA == 'TPRVTGGGAM B0702') & (cbc.label=='outlier')])/sum(cbc.label=='outlier')

0.2987012987012987

# Count singlet tail

In [313]:
# Alternative peptide selection (was assigned and immediately overwritten):
# peps = ['RVR','YVL','RPHERNGFTVL', 'RPP','TPR']
peps = raw.peptide_HLA.unique()
summary = pd.DataFrame(index=peps, columns=['singlet freq (fig. 6a)','singlet freq (fig. 6b)','singlet freq (fig. 6c)'])
for pep in peps:
    # The loop variable is deliberately not called `df`: that would rebind the
    # notebook-level DataFrame to `tcr` once the loop finishes.
    for subset, name in zip([raw, hla, tcr], ['fig. 6a', 'fig. 6b', 'fig. 6c']):
        # counts: index = clonotype size (rows per ct), value = number of clonotypes of that size
        counts = subset[subset.peptide_HLA == pep].groupby('ct').size().value_counts()
        n_clonotypes = counts.sum()
        # .get avoids a KeyError when the peptide has no clonotype of size 1
        n_singlets = counts.get(1, 0)
        # Peptide absent from this table: leave the fraction undefined.
        summary.loc[pep, f"singlet freq ({name})"] = round(n_singlets/n_clonotypes, 2) if n_clonotypes else np.nan
summary

Unnamed: 0,singlet freq (fig. 6a),singlet freq (fig. 6b),singlet freq (fig. 6c)
RPHERNGFTVL B0702,0.73,0.66,0.68
RVRAYTYSK A0301,0.83,0.86,0.88
CLGGLLTMV A0201,0.78,0.76,0.77
TPRVTGGGAM B0702,0.79,0.74,0.77
YVLDHLIVV A0201,0.7,0.66,0.66
TPSVSSSISSL B0702,0.87,0.78,0.74
VLEETSVML A0201,0.83,0.77,0.7
RPHERNGFTV B0702,0.95,1.0,1.0
RPPIFIRRL B0702,0.89,0.82,0.81
FLYALALLL A0201,0.76,0.68,0.72


# Inspect clonotypes of multiplet chains

In [232]:
# Distinct alpha-chain CDR3s per clonotype, counted over GEMs where both chains are present;
# `cts` keeps the clonotypes with more than one.
all_cts = raw.dropna(subset=['cdr3_TRA','cdr3_TRB']).groupby('ct').cdr3_TRA.nunique()
cts = all_cts.loc[all_cts.gt(1)].index
cts

Int64Index([    2,     7,    19,    22,    26,    55,    60,    66,    68,
               73,
            ...
            69484, 69495, 69536, 69573, 69617, 69704, 69714, 69719, 69747,
            69768],
           dtype='int64', name='ct', length=323)

In [233]:
# For each selected clonotype, list its distinct alpha and beta CDR3 sequences.
for ct in cts:
    members = raw[raw.ct == ct]
    a = members.cdr3_TRA.unique()
    b = members.cdr3_TRB.unique()
    print(a,b)

['CAVGDNFNKFYF' 'CAARVRGFGNVLHC'] ['CASSLYSATGELFF']
['CAVFLYGNNRLAF' 'CAASGYDYKLSF'] ['CSVSASGGDEQYF']
['CAAWDMEYGNKLVF' 'CAASVSIWTGTASKLTF'] ['CAISDPGLAGGGGEQFF']
['CAENGGGGADGLTF' 'CAASGNHDMRF'] ['CASSTTAGDTEAFF']
['CAASMDSNYQLIW' 'CAVGEYDYKLSF'] ['CASSESISYEQYF']
['CAFMTNAGGTSYGKLTF' 'CAVAVGVSGGGADGLTF'] ['CASSQGAYGYTF']
['CAALSHASKIIF' 'CAVEDGGFQKLVF'] ['CASRPQQGHNSPLHF']
['CVVSANSDGQKLLF' 'CASIAGTYKYIF'] ['CASSVVSGANVLTF']
['CAVKGRDDKIIF' 'CAMSAHNYGQNFVF'] ['CSARDPPAGRDGYTF']
['CAVRNYGGATNKLIF' 'CALSEANTGGFKTIF'] ['CASSLTTGGRNEQFF']
['CAAPRGGQNFVF' 'CAVGALNNDMRF'] ['CASSLRTGGFPSGTSGSTDTQYF']
['CAPHKAAGNKLTF' 'CAVGASGGSYIPTF'] ['CAAGGGGTEKLFF']
['CDPPNAGKSTF' 'CAESSGGSYIPTF'] ['CASRGGQISYEQYF']
['CATEGASGGSYIPTF' 'CAVRDMSGARGAGSYQLTF'] ['CAWVLGPAGDTQYF']
['CLVGDMRSGAGSYQLTF' 'CVVSRQEQQTRF'] ['CSARDINRGSYEQYF']
['CAMREGIGNAGNMLTF' 'CAVGAETSYDKVIF'] ['CASTQGGGTGSFYEQYF']
['CAVGSSGGSYIPTF' 'CLSPMDTGRRALTF'] ['CATSTGDSNQPQHF']
['CAVSRYNNNDMRF' 'CAMSDLNSGGYQKVTF'] ['CASSHQDLGTEGTQYF']
