This notebook investigates whether clustering the hashing UMI counts can be used to identify valid thresholds per hash ID. The inquiry was motivated by the concern that some hash IDs are present at very high levels and would therefore also contribute a very high level of noise, which might overshadow IDs with a generally lower presence. As it turns out, however, the hashing counts are clearly separated. 2488 GEMs had been annotated with credible peptides. Applying thresholds to the hashing counts improved the annotation of only 2 GEMs, while 7 GEMs were lost. That loss might nevertheless be acceptable, since those GEMs had fewer than 24 hashing UMIs, which is quite low. Both methods were wrong in 241 cases (and in those cases they agreed with each other on the wrong annotation).

In [1]:
import pandas as pd
import numpy as np
from ast import literal_eval
import re
import matplotlib.pyplot as plt

In [2]:
def HLA_cd8_converter(x):
    """Turn the string form of an HLA list into a list of allele names.

    Square brackets, commas and single quotes are removed from ``x`` and the
    remaining text is split on single spaces, e.g. "['A0201', 'A0301']"
    becomes ['A0201', 'A0301']. An empty string yields [''].
    """
    cleaned = x
    for unwanted in ("[", "]", ",", "'"):
        cleaned = cleaned.replace(unwanted, "")
    return cleaned.split(" ")

def cdr3_lst_converter(x):
    """Turn the string form of a space-separated list into a list of tokens.

    Square brackets and single quotes are removed from ``x`` and the rest is
    split on single spaces. Commas are NOT removed, so a comma-separated
    repr keeps the comma attached to the preceding token.
    """
    without_markup = x.translate(str.maketrans("", "", "[]'"))
    return without_markup.split(" ")

def epitope_converter(x):
    """Extract the single-quoted entries from the string form of a list.

    Square brackets and newlines are removed, the text is split on single
    quotes, and pieces that are empty or a lone space are discarded.
    """
    stripped = x.replace("[", "").replace("]", "").replace("\n", "")
    entries = []
    for piece in stripped.split("'"):
        if piece in ('', ' '):
            continue
        entries.append(piece)
    return entries

def peptide_hla_converter(x):
    """Extract every "<peptide> <HLA>" pair from the string form of a list.

    Square brackets, newlines and single quotes are removed first. A pair is
    then one or more word characters, exactly one whitespace character, one
    word character and one or more digits (e.g. "VLEETSVML A0201").

    Returns a list of the matched strings, empty when nothing matches.
    """
    flattened = x.replace("[", "").replace("]", "").replace("\n", "").replace("'", "")
    # Raw string: "\w", "\s" and "\d" are not valid Python string escapes, so the
    # non-raw form triggers an invalid-escape-sequence warning on recent Pythons.
    return re.findall(r"\w+\s{1}\w{1}\d+", flattened)

def literal_converter(val):
    """Parse a CSV cell as a Python literal, mapping an empty cell to [].

    Any non-empty value is handed to ``ast.literal_eval`` unchanged.
    """
    if val == '':
        return []
    return literal_eval(val)

# Column -> parser mapping passed to pd.read_csv(..., converters=...).
# An earlier, smaller version of this dict used to be assigned immediately before
# this one; it was overwritten right away and every entry it held is kept here.
# Note: 'umi_count_lst_mhc' uses literal_eval directly, so an empty cell in that
# column raises instead of becoming [] as it does for the literal_converter columns.
converters = {
    'peptide_HLA_lst': peptide_hla_converter,
    'umi_count_lst_mhc': literal_eval,
    'umi_count_lst_cd8': literal_converter,
    'umi_count_lst_TRA': literal_converter,
    'umi_count_lst_TRB': literal_converter,
    'cdr3_lst_TRA': cdr3_lst_converter,
    'cdr3_lst_TRB': cdr3_lst_converter,
    'HLA_lst_mhc': cdr3_lst_converter,
    'HLA_pool_cd8': cdr3_lst_converter,
    'HLA_cd8': HLA_cd8_converter,
    'HLA_lst_cd8': literal_converter,
    'sample_id_lst': literal_converter,
}

In [3]:
import sys  
sys.path.insert(0, '../scripts')

from D_plot_specificity_matrix_utils import (peptide_per_clonotype_by_gem_size,
                                             multiple_peptides_per_gem_w_filtering,
                                             calc_binding_concordance,
                                             epitope_sorter_index,
                                             peptides_per_gem)

In [4]:
# Load the two cluster tables from the working directory; the first CSV column
# becomes the index (it is joined against the 'gem' column further down).
cluster_peptides = pd.read_csv('peptide_clusters.csv', index_col=0)
cluster_hashing = pd.read_csv('hashing_clusters.csv', index_col=0)

In [5]:
# Paths to the valid-clonotype tables of the two exp13 runs, relative to this notebook.
OS1 = '../experiments/exp13/run1/cat/eval_clonotypes/valid_ct.csv'
OS2 = '../experiments/exp13/run2/cat/eval_clonotypes/valid_ct.csv'#cleaned.

In [6]:
# Read both runs, parsing the list-like columns with the converters defined above.
# NOTE(review): only os1 is used in the cells visible below; os2 appears unused here.
os1 = pd.read_csv(OS1, converters=converters)
os2 = pd.read_csv(OS2, converters=converters)

In [7]:
# Rename the 'rank' column to 'epitope_rank' in both runs
# (presumably to avoid clashing with the DataFrame.rank method - confirm).
os1 = os1.rename(columns={'rank': 'epitope_rank'})
os2 = os2.rename(columns={'rank': 'epitope_rank'})

In [8]:
# Attach the peptide clusters and then the hashing clusters to run 1, matching the
# 'gem' column against each cluster table's index. Both merges use pandas' default
# inner join, so GEMs missing from either cluster table are dropped. Columns of the
# hashing table that collide with existing names get the '_hsh' suffix.
lol = (
    os1
    .merge(cluster_peptides, left_on='gem', right_index=True)
    .merge(cluster_hashing, left_on='gem', right_index=True, suffixes=['', '_hsh'])
)

In [9]:
#lol = lol[~lol.clusters.isin([5,8])].copy()

In [10]:
# Hashing barcode number (251..260) -> sample number (1..10).
hsh_conversion = {barcode: barcode - 250 for barcode in range(251, 261)}
# Translate each GEM's hsh_label through hsh_conversion; labels that are not
# keys of the dict (or are missing) become NaN in hash_label.
lol['hash_label'] = lol.hsh_label.map(hsh_conversion)

In [11]:
# Flag rows whose annotated HLA (the last space-separated token of the annotation)
# is contained in the row's HLA_cd8 value.
# Per GEM: uses the GEM-level annotation in peptide_HLA.
lol['HLA_match_per_gem'] = lol.apply(lambda row: row.peptide_HLA.split(' ')[-1] in row.HLA_cd8, axis=1)
# Per clonotype: uses ct_pep. NaNs are replaced by '' first so .split works on
# every row; a missing ct_pep is therefore tested as the empty token ''.
lol['HLA_match_per_ct'] = lol.fillna('').apply(lambda row: row.ct_pep.split(' ')[-1] in row.HLA_cd8, axis=1)

In [12]:
# Collapse each HLA list into one space-separated string (e.g. 'A0201 A0301').
# Values that are already strings are left untouched, so re-running this cell
# no longer splits them into single characters ('A0201' -> 'A 0 2 0 1').
lol['HLA_cd8'] = lol.HLA_cd8.apply(lambda x: x if isinstance(x, str) else ' '.join(x))

In [36]:
# Build a sample_id -> HLA_cd8 lookup from the first distinct HLA_cd8 value seen
# within each sample_id group, then look up every row's hash_label in it. The
# result, HLA_hsh, is the HLA_cd8 value of the sample the hash label points to
# (NaN when hash_label is missing or not a known sample_id).
dct = lol.groupby(['sample_id']).HLA_cd8.unique().apply(lambda x: x[0])
lol['HLA_hsh'] = lol.hash_label.map(dct)
#lol['HLA_hsh'] = lol.top_cluster_peptide_hsh.astype(int).apply(lambda x: x-250).map(dct)

In [35]:
# Preview: top hashing cluster barcode shifted down by 250 (as float).
lol.top_cluster_peptide_hsh.astype(float).sub(250)

0        6.0
1       10.0
2        8.0
3        1.0
4        7.0
        ... 
7105     6.0
7106    10.0
7108     9.0
7109     1.0
7110    10.0
Name: top_cluster_peptide_hsh, Length: 5107, dtype: float64

In [37]:
def get_matching_hla(row, var='HLA_hsh'):
    """Return 1 if the row's annotated HLA is contained in ``row[var]``, else 0.

    The annotation is taken from ``row.ct_pep`` when it is present and from
    ``row.peptide_HLA`` otherwise; its last whitespace-separated token is the
    HLA that gets tested.

    Parameters
    ----------
    row : mapping with attribute access (e.g. a DataFrame row)
        Must provide ``ct_pep``, ``peptide_HLA`` and the column named by ``var``.
    var : str, default 'HLA_hsh'
        Column holding the HLA value(s) to test against. The test is ``in``,
        i.e. substring containment for a string and membership for a list.

    Returns
    -------
    int
        1 on a match; 0 on a mismatch, when ``row[var]`` is missing, or when
        no usable annotation exists (both annotations missing, or empty).
    """
    def _is_missing(value):
        # NaN is the only float that differs from itself; None is treated alike.
        return value is None or (isinstance(value, float) and value != value)

    allowed = row[var]
    if _is_missing(allowed):
        return 0

    # Prefer the clonotype-level annotation, fall back to the GEM-level one.
    annotation = row.peptide_HLA if _is_missing(row.ct_pep) else row.ct_pep
    if _is_missing(annotation):
        return 0

    tokens = annotation.split()
    if not tokens:
        return 0
    return 1 if tokens[-1] in allowed else 0

In [38]:
# Number of clonotype-annotated GEMs whose HLA agrees with the hashing-derived HLA.
lol.dropna(subset=['ct_pep']).apply(get_matching_hla, axis=1).sum()

2029

In [39]:
# Same count, but tested against the HLA_cd8 column instead of HLA_hsh.
lol.dropna(subset=['ct_pep']).apply(get_matching_hla, axis=1, var='HLA_cd8').sum()

2245

In [40]:
# Size of the subset that has a clonotype-level peptide annotation (ct_pep not NaN).
lol[lol.ct_pep.notna()].shape

(2488, 123)

In [41]:
# GEMs in hashing cluster 0 whose annotated HLA does not match the hashing-derived
# HLA; show the 60 rows with the highest clonotype id.
lol.loc[
    lol.clusters_hsh.eq(0) & lol.apply(get_matching_hla, axis=1).eq(0),
    ['ct', 'sample_id', 'sample_id_lst', 'umi_count_lst_cd8', 'HLA_cd8', 'peptide_HLA',
     'umi_count_lst_mhc', 'ct_pep', 'ct_hla', 'clusters_hsh',
     'top_cluster_peptide_hsh', 'hash_label', 'HLA_hsh'],
].sort_values(by='ct').tail(60)

Unnamed: 0,ct,sample_id,sample_id_lst,umi_count_lst_cd8,HLA_cd8,peptide_HLA,umi_count_lst_mhc,ct_pep,ct_hla,clusters_hsh,top_cluster_peptide_hsh,hash_label,HLA_hsh
6061,4547.0,7.0,"[2, 3, 9, 6, 1, 10, 8, 7]","[1.0, 2.0, 4.0, 7.0, 13.0, 18.0, 66.0, 520.0]",B0702,TPRVTGGGAM B0702,[2.0],,,0,260,7.0,A0201 A0301
4041,4595.0,10.0,"[3, 5, 4, 9, 2, 6, 7, 1, 8, 10]","[2.0, 2.0, 3.0, 3.0, 4.0, 5.0, 8.0, 14.0, 416....",A0201 A0301,TPRVTGGGAM B0702,[2.0],,,0,260,10.0,A0201 A0301
2858,4961.0,6.0,"[3, 2, 4, 5, 9, 1, 8, 7, 10, 6]","[1.0, 5.0, 5.0, 5.0, 5.0, 8.0, 8.0, 12.0, 15.0...",A0201 B0702,RPHERNGFTVL B0702,[2.0],,,0,260,6.0,A0201 A0301
5668,5151.0,10.0,"[3, 5, 2, 7, 8, 6, 1, 10]","[1.0, 1.0, 2.0, 2.0, 2.0, 3.0, 4.0, 364.0]",A0201 A0301,TPRVTGGGAM B0702,"[1.0, 5.0]",TPRVTGGGAM B0702,,0,260,10.0,A0201 A0301
6031,5151.0,10.0,"[4, 8, 9, 1, 7, 10]","[1.0, 1.0, 1.0, 2.0, 2.0, 290.0]",A0201 A0301,TPRVTGGGAM B0702,[13.0],TPRVTGGGAM B0702,,0,260,10.0,A0201 A0301
6230,5151.0,10.0,"[2, 4, 5, 9, 1, 6, 8, 10]","[1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 133.0]",A0201 A0301,TPRVTGGGAM B0702,[7.0],TPRVTGGGAM B0702,,0,260,10.0,A0201 A0301
7074,5151.0,10.0,"[3, 4, 9, 2, 7, 6, 1, 10]","[1.0, 1.0, 1.0, 4.0, 6.0, 9.0, 12.0, 461.0]",A0201 A0301,TPRVTGGGAM B0702,"[1.0, 25.0]",TPRVTGGGAM B0702,,0,260,10.0,A0201 A0301
6473,5151.0,10.0,"[4, 8, 9, 1, 2, 6, 7, 10]","[2.0, 2.0, 2.0, 3.0, 6.0, 6.0, 6.0, 312.0]",A0201 A0301,TPRVTGGGAM B0702,"[1.0, 20.0]",TPRVTGGGAM B0702,,0,260,10.0,A0201 A0301
5581,5164.0,6.0,"[7, 9, 1, 8, 2, 10, 6]","[1.0, 1.0, 2.0, 5.0, 6.0, 7.0, 885.0]",A0201 B0702,TPRVTGGGAM B0702,[14.0],,,0,260,6.0,A0201 A0301
6953,5167.0,9.0,"[4, 7, 8, 2, 1, 10, 6, 9]","[2.0, 2.0, 5.0, 7.0, 9.0, 32.0, 112.0, 711.0]",B0702,TPRVTGGGAM B0702,"[5.0, 47.0]",TPRVTGGGAM B0702,B0702,0,260,9.0,A0201 A0301


In [24]:
# Clonotype-annotated GEMs where neither the hashing-derived HLA nor HLA_cd8
# contains the annotated HLA; show the 60 rows with the highest clonotype id.
lol.loc[
    lol.apply(get_matching_hla, axis=1).eq(0)
    & lol.apply(get_matching_hla, axis=1, var='HLA_cd8').eq(0)
    & lol.ct_pep.notna(),
    ['ct', 'sample_id', 'sample_id_lst', 'umi_count_lst_cd8', 'HLA_cd8', 'peptide_HLA',
     'ct_pep', 'ct_hla', 'clusters_hsh',
     'top_cluster_peptide_hsh', 'hash_label', 'HLA_hsh'],
].sort_values(by='ct').tail(60)

Unnamed: 0,ct,sample_id,sample_id_lst,umi_count_lst_cd8,HLA_cd8,peptide_HLA,ct_pep,ct_hla,clusters_hsh,top_cluster_peptide_hsh,hash_label,HLA_hsh
3985,101.0,2.0,"[3, 9, 8, 5, 4, 1, 6, 10, 7, 2]","[2.0, 2.0, 4.0, 6.0, 7.0, 9.0, 12.0, 25.0, 143...",A0201,TPRVTGGGAM B0702,TPRVTGGGAM B0702,,4,252,2.0,A0201
182,102.0,10.0,"[3, 9, 7, 8, 2, 1, 6, 10]","[2.0, 2.0, 4.0, 4.0, 10.0, 11.0, 11.0, 792.0]",A0201 A0301,TPRVTGGGAM B0702,TPRVTGGGAM B0702,,2,260,10.0,A0201 A0301
980,102.0,10.0,"[7, 1, 3, 5, 6, 8, 2, 9, 10]","[1.0, 2.0, 2.0, 2.0, 2.0, 2.0, 4.0, 7.0, 615.0]",A0201 A0301,TPRVTGGGAM B0702,TPRVTGGGAM B0702,,0,260,10.0,A0201 A0301
1088,102.0,10.0,"[5, 1, 2, 3, 7, 8, 6, 10]","[1.0, 4.0, 4.0, 4.0, 7.0, 10.0, 15.0, 1051.0]",A0201 A0301,TPRVTGGGAM B0702,TPRVTGGGAM B0702,,2,260,10.0,A0201 A0301
2800,102.0,10.0,"[4, 6, 7, 2, 9, 1, 8, 10]","[2.0, 2.0, 2.0, 4.0, 4.0, 6.0, 9.0, 535.0]",A0201 A0301,TPRVTGGGAM B0702,TPRVTGGGAM B0702,,0,260,10.0,A0201 A0301
816,102.0,10.0,"[3, 5, 4, 6, 8, 9, 2, 7, 1, 10]","[1.0, 2.0, 4.0, 7.0, 9.0, 9.0, 11.0, 11.0, 29....",A0201 A0301,TPRVTGGGAM B0702,TPRVTGGGAM B0702,,0,260,10.0,A0201 A0301
3632,102.0,10.0,"[3, 9, 1, 7, 6, 2, 8, 10]","[1.0, 1.0, 2.0, 5.0, 7.0, 8.0, 27.0, 1286.0]",A0201 A0301,TPRVTGGGAM B0702,TPRVTGGGAM B0702,,2,260,10.0,A0201 A0301
2682,102.0,10.0,"[4, 3, 2, 9, 8, 7, 1, 6, 10]","[1.0, 2.0, 5.0, 7.0, 10.0, 12.0, 13.0, 20.0, 1...",A0201 A0301,TPRVTGGGAM B0702,TPRVTGGGAM B0702,,2,260,10.0,A0201 A0301
1889,102.0,10.0,"[3, 5, 9, 7, 4, 8, 1, 2, 6, 10]","[2.0, 2.0, 3.0, 5.0, 6.0, 7.0, 11.0, 13.0, 18....",A0201 A0301,TPRVTGGGAM B0702,TPRVTGGGAM B0702,,2,260,10.0,A0201 A0301
4883,122.0,10.0,"[3, 4, 5, 9, 2, 7, 6, 1, 8, 10]","[2.0, 2.0, 2.0, 2.0, 4.0, 6.0, 7.0, 12.0, 17.0...",A0201 A0301,RVRAYTYSK A0301,TPRVTGGGAM B0702,,0,260,10.0,A0201 A0301


In [66]:
# GEMs without a hash label that nevertheless have an HLA match at clonotype
# or GEM level; show the 60 rows with the lowest clonotype id.
lol.loc[
    lol.hash_label.isna() & (lol.HLA_match_per_ct.eq(True) | lol.HLA_match_per_gem.eq(True)),
    ['ct', 'sample_id', 'sample_id_lst', 'umi_count_lst_cd8', 'HLA_cd8', 'peptide_HLA',
     'ct_pep', 'umi_count_lst_mhc', 'clusters_hsh',
     'top_cluster_peptide_hsh', 'hash_label'],
].sort_values(by='ct').head(60)

Unnamed: 0,ct,sample_id,sample_id_lst,umi_count_lst_cd8,HLA_cd8,peptide_HLA,ct_pep,umi_count_lst_mhc,clusters_hsh,top_cluster_peptide_hsh,hash_label
4140,1.0,10.0,"[5, 2, 4, 7, 8, 9, 6, 1, 10]","[1.0, 2.0, 3.0, 4.0, 4.0, 4.0, 9.0, 10.0, 632.0]","[A0201, A0301]",VLEETSVML A0201,VLEETSVML A0201,"[1.0, 2.0, 12.0]",0,260,
4791,1.0,10.0,"[3, 5, 7, 2, 6, 9, 1, 4, 8, 10]","[1.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 5.0, 36.0,...","[A0201, A0301]",VLEETSVML A0201,VLEETSVML A0201,[14.0],0,260,
4705,1.0,10.0,"[5, 7, 2, 9, 1, 8, 6, 10]","[1.0, 1.0, 2.0, 2.0, 5.0, 6.0, 39.0, 576.0]","[A0201, A0301]",VLEETSVML A0201,VLEETSVML A0201,[16.0],0,260,
4608,1.0,10.0,"[5, 3, 4, 6, 9, 8, 1, 7, 2, 10]","[1.0, 2.0, 2.0, 4.0, 4.0, 5.0, 6.0, 6.0, 7.0, ...","[A0201, A0301]",VLEETSVML A0201,VLEETSVML A0201,"[4.0, 11.0]",0,260,
4507,1.0,10.0,"[4, 2, 8, 9, 1, 6, 7, 10]","[1.0, 3.0, 3.0, 5.0, 6.0, 6.0, 11.0, 630.0]","[A0201, A0301]",VLEETSVML A0201,VLEETSVML A0201,[22.0],0,260,
4463,1.0,10.0,"[1, 3, 4, 7, 2, 6, 8, 9, 10]","[1.0, 1.0, 3.0, 3.0, 5.0, 5.0, 5.0, 5.0, 275.0]","[A0201, A0301]",VLEETSVML A0201,VLEETSVML A0201,"[1.0, 2.0]",0,260,
4351,1.0,10.0,"[3, 5, 8, 4, 2, 9, 7, 6, 1, 10]","[1.0, 2.0, 2.0, 4.0, 5.0, 6.0, 7.0, 9.0, 12.0,...","[A0201, A0301]",VLEETSVML A0201,VLEETSVML A0201,[19.0],0,260,
4813,1.0,9.0,"[2, 5, 1, 4, 6, 8, 7, 10, 9]","[2.0, 2.0, 4.0, 4.0, 7.0, 8.0, 98.0, 162.0, 32...",[B0702],TPRVTGGGAM B0702,VLEETSVML A0201,"[1.0, 2.0, 11.0]",0,260,
4326,1.0,10.0,"[4, 7, 2, 5, 8, 9, 6, 1, 10]","[2.0, 2.0, 3.0, 3.0, 3.0, 5.0, 6.0, 7.0, 595.0]","[A0201, A0301]",VLEETSVML A0201,VLEETSVML A0201,"[1.0, 1.0, 7.0]",0,260,
4234,1.0,10.0,"[9, 2, 6, 5, 7, 8, 1, 10]","[3.0, 4.0, 4.0, 5.0, 5.0, 5.0, 16.0, 606.0]","[A0201, A0301]",VLEETSVML A0201,VLEETSVML A0201,"[1.0, 2.0, 17.0]",0,260,


In [46]:
# GEMs without a GEM-level HLA match whose hash label exists but differs from
# sample_id; show the 60 rows with the lowest clonotype id.
lol.loc[
    lol.HLA_match_per_gem.ne(True) & lol.sample_id.ne(lol.hash_label) & lol.hash_label.notna(),
    ['ct', 'sample_id', 'sample_id_lst', 'umi_count_lst_cd8', 'HLA_cd8', 'peptide_HLA',
     'ct_pep', 'umi_count_lst_mhc', 'clusters_hsh',
     'top_cluster_peptide_hsh', 'hash_label'],
].sort_values(by='ct').head(60)

Unnamed: 0,ct,sample_id,sample_id_lst,umi_count_lst_cd8,HLA_cd8,peptide_HLA,ct_pep,umi_count_lst_mhc,clusters_hsh,top_cluster_peptide_hsh,hash_label
2959,5.0,7.0,"[5, 8, 2, 6, 1, 9, 10, 3, 7]","[1.0, 3.0, 6.0, 6.0, 8.0, 8.0, 28.0, 414.0, 94...",[B0702],VLEETSVML A0201,VLEETSVML A0201,"[4.0, 15.0]",0,260,3.0
6412,10.0,10.0,"[2, 9, 6, 1, 7, 8, 5, 10]","[1.0, 1.0, 2.0, 4.0, 5.0, 15.0, 187.0, 558.0]","[A0201, A0301]",RPPIFIRRL B0702,RPPIFIRRL B0702,"[1.0, 4.0, 5.0]",0,260,5.0
3859,15.0,6.0,"[5, 4, 9, 8, 2, 1, 7, 10, 3, 6]","[3.0, 5.0, 6.0, 9.0, 12.0, 13.0, 16.0, 42.0, 2...","[A0201, B0702]",RVRAYTYSK A0301,,[2.0],0,260,3.0
5546,42.0,2.0,"[5, 3, 7, 8, 4, 9, 1, 6, 10, 2]","[1.0, 2.0, 5.0, 5.0, 6.0, 8.0, 9.0, 16.0, 864....",[A0201],RVRAYTYSK A0301,RVRAYTYSK A0301,"[1.0, 2.0]",2,260,10.0
1581,70.0,10.0,"[4, 8, 2, 3, 9, 1, 6, 7, 5, 10]","[1.0, 1.0, 2.0, 3.0, 3.0, 4.0, 5.0, 5.0, 223.0...","[A0201, A0301]",RPPIFIRRL B0702,,"[1.0, 5.0]",0,260,5.0
1891,414.0,2.0,"[1, 4, 7, 8, 9, 6, 3, 10, 2]","[1.0, 2.0, 4.0, 4.0, 4.0, 12.0, 214.0, 421.0, ...",[A0201],RVRAYTYSK A0301,,"[1.0, 1.0, 1.0, 6.0]",0,260,3.0
3729,432.0,2.0,"[5, 3, 4, 1, 6, 7, 8, 10, 9, 2]","[1.0, 2.0, 2.0, 6.0, 6.0, 6.0, 12.0, 22.0, 149...",[A0201],RVRAYTYSK A0301,,"[1.0, 11.0]",5,259,9.0
4250,1313.0,2.0,"[5, 3, 6, 9, 7, 1, 8, 4, 10, 2]","[2.0, 3.0, 10.0, 10.0, 11.0, 12.0, 21.0, 669.0...",[A0201],RVRAYTYSK A0301,,"[1.0, 49.0]",2,260,10.0
7050,1394.0,10.0,"[3, 4, 6, 7, 8, 9, 2, 1, 5, 10]","[1.0, 1.0, 2.0, 2.0, 2.0, 4.0, 6.0, 10.0, 255....","[A0201, A0301]",RPPIFIRRL B0702,,"[3.0, 4.0]",0,260,5.0
439,2088.0,9.0,"[5, 7, 2, 4, 6, 8, 1, 3, 10, 9]","[1.0, 2.0, 3.0, 3.0, 4.0, 6.0, 19.0, 22.0, 810...",[B0702],RVRAYTYSK A0301,,"[1.0, 1.0, 5.0]",2,260,10.0


In [49]:
# GEMs where sample_id and the hash label agree; show the 60 rows with the
# lowest clonotype id.
lol.loc[
    lol.sample_id.eq(lol.hash_label),
    ['ct', 'sample_id', 'HLA_cd8', 'peptide_HLA', 'peptide_HLA_lst', 'umi_count_lst_mhc',
     'clusters', 'top_cluster_peptide',
     'top_cluster_peptide_hsh', 'hash_label'],
].sort_values(by='ct').head(60)

Unnamed: 0,ct,sample_id,HLA_cd8,peptide_HLA,peptide_HLA_lst,umi_count_lst_mhc,clusters,top_cluster_peptide,top_cluster_peptide_hsh,hash_label
5822,1.0,10.0,"[A0201, A0301]",VLEETSVML A0201,"[YVLDHLIVV A0201, TPSVSSSISSL B0702, RPHERNGFT...","[1.0, 1.0, 1.0, 9.0]",2,VLEETSVML,260,10.0
2102,1.0,10.0,"[A0201, A0301]",RVRAYTYSK A0301,"[YVLDHLIVV A0201, TPSVSSSISSL B0702, VLEETSVML...","[1.0, 1.0, 18.0, 58.0]",12,RVRAYTYSK,260,10.0
5718,1.0,10.0,"[A0201, A0301]",VLEETSVML A0201,"[RVRAYTYSK A0301, VLEETSVML A0201]","[1.0, 13.0]",2,VLEETSVML,260,10.0
922,1.0,10.0,"[A0201, A0301]",VLEETSVML A0201,"[RVRAYTYSK A0301, TPSVSSSISSL B0702, VLEETSVML...","[1.0, 1.0, 19.0]",2,VLEETSVML,260,10.0
921,1.0,10.0,"[A0201, A0301]",VLEETSVML A0201,"[RVRAYTYSK A0301, VLEETSVML A0201]","[4.0, 7.0]",2,VLEETSVML,260,10.0
3595,1.0,10.0,"[A0201, A0301]",VLEETSVML A0201,"[YVLDHLIVV A0201, RVRAYTYSK A0301, TPRVTGGGAM ...","[1.0, 1.0, 1.0, 13.0]",2,VLEETSVML,260,10.0
2142,1.0,10.0,"[A0201, A0301]",VLEETSVML A0201,"[RVRAYTYSK A0301, VLEETSVML A0201]","[2.0, 12.0]",2,VLEETSVML,260,10.0
2143,1.0,10.0,"[A0201, A0301]",VLEETSVML A0201,"[TPSVSSSISSL B0702, RVRAYTYSK A0301, VLEETSVML...","[2.0, 5.0, 26.0]",2,VLEETSVML,260,10.0
3570,1.0,9.0,[B0702],TPRVTGGGAM B0702,"[VLEETSVML A0201, TPRVTGGGAM B0702]","[13.0, 22.0]",1,TPRVTGGGAM,259,9.0
4308,1.0,2.0,[A0201],VLEETSVML A0201,"[TPRVTGGGAM B0702, RVRAYTYSK A0301, VLEETSVML ...","[2.0, 5.0, 9.0]",2,VLEETSVML,252,2.0


In [63]:
# GEMs whose 'peptide' value differs from the top peptide of their cluster
# (first 60 rows in frame order).
lol.loc[
    lol.peptide.ne(lol.top_cluster_peptide),
    ['ct', 'sample_id', 'HLA_cd8', 'peptide_HLA', 'peptide_HLA_lst', 'umi_count_lst_mhc',
     'clusters', 'top_cluster_peptide', 'top_cluster_peptide_hsh'],
].head(60)

Unnamed: 0,ct,sample_id,HLA_cd8,peptide_HLA,peptide_HLA_lst,umi_count_lst_mhc,clusters,top_cluster_peptide,top_cluster_peptide_hsh
654,1.0,9.0,[B0702],TPRVTGGGAM B0702,"[RVRAYTYSK A0301, VLEETSVML A0201, TPRVTGGGAM ...","[2.0, 18.0, 19.0]",2,VLEETSVML,259
1965,2.0,10.0,"[A0201, A0301]",RVRAYTYSK A0301,"[YVLDHLIVV A0201, TPRVTGGGAM B0702, RVRAYTYSK ...","[1.0, 1.0, 2.0]",3,YVLDHLIVV,260
2155,5316.0,10.0,"[A0201, A0301]",NLVPMVATV A0201,"[CLGGLLTMV A0201, RVRAYTYSK A0301, NLVPMVATV A...","[2.0, 21.0, 26.0]",12,RVRAYTYSK,260
2441,2125.0,7.0,[B0702],YVLDHLIVV A0201,"[RPHERNGFTV B0702, RVRAYTYSK A0301, TPSVSSSISS...","[1.0, 2.0, 3.0, 12.0, 15.0]",10,RPHERNGFTVL,257
3452,364.0,6.0,"[A0201, B0702]",RVRAYTYSK A0301,"[CLGGLLTMV A0201, TPRVTGGGAM B0702, RPHERNGFTV...","[1.0, 1.0, 13.0, 17.0]",10,RPHERNGFTVL,256
3543,19.0,7.0,[B0702],RVRAYTYSK A0301,"[RPHERNGFTV B0702, RPHERNGFTVL B0702, RVRAYTYS...","[1.0, 8.0, 9.0]",10,RPHERNGFTVL,257
3565,5.0,10.0,"[A0201, A0301]",VLEETSVML A0201,"[YVLDHLIVV A0201, RVRAYTYSK A0301, VLEETSVML A...","[2.0, 6.0, 7.0]",12,RVRAYTYSK,260
3827,828.0,8.0,[B0702],VLEETSVML A0201,"[RVRAYTYSK A0301, TPRVTGGGAM B0702, RPHERNGFTV...","[1.0, 1.0, 5.0, 6.0]",10,RPHERNGFTVL,258
4789,67.0,10.0,"[A0201, A0301]",RVRAYTYSK A0301,"[YVLDHLIVV A0201, RVRAYTYSK A0301]","[1.0, 2.0]",3,YVLDHLIVV,260
5358,3154.0,10.0,"[A0201, A0301]",GILGFVFTL A0201,"[RPHERNGFTV B0702, RVRAYTYSK A0301, GILGFVFTL ...","[1.0, 7.0, 8.0]",12,RVRAYTYSK,260


In [64]:
# Make the cluster-derived peptide the active 'peptide_HLA' column and keep the
# original annotation as 'old_peptide_HLA'.
# Guarded so the cell can be re-run: without the guard a second run renames the
# new 'peptide_HLA' again and leaves two columns called 'old_peptide_HLA'.
# NOTE(review): per the outputs above, top_cluster_peptide holds only the peptide
# (no HLA token), so 'peptide_HLA' no longer has the "<peptide> <HLA>" form after
# this cell - confirm that calc_binding_concordance and any later use of
# get_matching_hla are fine with that.
if 'old_peptide_HLA' not in lol.columns:
    lol.rename(columns={'peptide_HLA':'old_peptide_HLA','top_cluster_peptide':'peptide_HLA'}, inplace=True)

In [65]:
# Recompute concordance with the helper imported from D_plot_specificity_matrix_utils,
# passing 'ct' as its second argument. The next cell reads a 'binding_concordance'
# column from the result.
# NOTE(review): this rebinds lol to the helper's return value, so the cell's effect
# on a re-run depends on the helper - confirm it is safe to run twice.
lol = calc_binding_concordance(lol, 'ct')

In [66]:
# GEMs with binding concordance below 0.5; show the 60 rows with the lowest
# clonotype id.
lol.loc[
    lol.binding_concordance.lt(0.5),
    ['ct', 'ct_pep', 'sample_id', 'HLA_cd8', 'old_peptide_HLA', 'peptide_HLA_lst',
     'umi_count_lst_mhc', 'clusters',
     'peptide_HLA', 'top_cluster_peptide_hsh'],
].sort_values(by='ct').head(60)

Unnamed: 0,ct,ct_pep,sample_id,HLA_cd8,old_peptide_HLA,peptide_HLA_lst,umi_count_lst_mhc,clusters,peptide_HLA,top_cluster_peptide_hsh
5028,1.0,VLEETSVML A0201,1.0,[A0201],YVLDHLIVV A0201,"[TPSVSSSISSL B0702, RVRAYTYSK A0301, YVLDHLIVV...","[1.0, 4.0, 19.0]",3,YVLDHLIVV,251
5626,1.0,VLEETSVML A0201,10.0,"[A0201, A0301]",YVLDHLIVV A0201,"[TPRVTGGGAM B0702, VLEETSVML A0201, YVLDHLIVV ...","[3.0, 23.0, 76.0]",3,YVLDHLIVV,260
5726,1.0,VLEETSVML A0201,10.0,"[A0201, A0301]",RVRAYTYSK A0301,[RVRAYTYSK A0301],[2.0],0,RVRAYTYSK,260
6137,1.0,VLEETSVML A0201,2.0,[A0201],RVRAYTYSK A0301,"[VLEETSVML A0201, RPHERNGFTVL B0702, RVRAYTYSK...","[1.0, 1.0, 3.0]",12,RVRAYTYSK,252
1375,1.0,VLEETSVML A0201,10.0,"[A0201, A0301]",RPPIFIRRL B0702,"[VLEETSVML A0201, RVRAYTYSK A0301, RPPIFIRRL B...","[1.0, 1.0, 2.0]",7,RPPIFIRRL,260
4813,1.0,VLEETSVML A0201,9.0,[B0702],TPRVTGGGAM B0702,"[VLEETSVML A0201, RVRAYTYSK A0301, TPRVTGGGAM ...","[1.0, 2.0, 11.0]",1,TPRVTGGGAM,259
733,1.0,VLEETSVML A0201,10.0,"[A0201, A0301]",RVRAYTYSK A0301,"[RPHERNGFTV B0702, TPRVTGGGAM B0702, VLEETSVML...","[1.0, 2.0, 30.0, 45.0]",12,RVRAYTYSK,260
5575,1.0,VLEETSVML A0201,7.0,[B0702],TPRVTGGGAM B0702,"[RVRAYTYSK A0301, TPRVTGGGAM B0702]","[1.0, 3.0]",1,TPRVTGGGAM,257
6752,1.0,VLEETSVML A0201,10.0,"[A0201, A0301]",YVLDHLIVV A0201,"[CLGGLLTMV A0201, RVRAYTYSK A0301, VLEETSVML A...","[1.0, 1.0, 9.0, 10.0]",3,YVLDHLIVV,260
6793,1.0,VLEETSVML A0201,10.0,"[A0201, A0301]",RVRAYTYSK A0301,[RVRAYTYSK A0301],[2.0],0,RVRAYTYSK,260
