In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from ast import literal_eval
import seaborn as sns

In [2]:
import sys  
sys.path.insert(0, '../scripts')

from D_plot_specificity_matrix_utils import (peptide_per_clonotype_by_gem_size,
                                             multiple_peptides_per_gem_w_filtering,
                                             calc_binding_concordance,
                                             epitope_sorter_index,
                                             peptides_per_gem)

In [3]:
# Plot styling: seaborn 'ticks' theme with axes edges and tick marks drawn in
# colour '0' (matplotlib grayscale notation).
sns.set_style(style='ticks',
              rc={'axes.edgecolor': '0',
                  'xtick.color': '0',
                  'ytick.color': '0'})

In [4]:
def HLA_cd8_converter(x):
    """Turn the text of an 'HLA_cd8' cell into a list of tokens.

    Square brackets, commas and single quotes are deleted and the remaining
    text is split on single spaces, e.g. "['A0201', 'B0702']" becomes
    ['A0201', 'B0702']. An input without any token yields [''].
    """
    without_punctuation = x.translate(str.maketrans('', '', "[],'"))
    return without_punctuation.split(" ")

def cdr3_lst_converter(x):
    """Turn the text of a space-separated list cell into a list of tokens.

    Square brackets and single quotes are deleted and the remaining text is
    split on single spaces, e.g. "['CAV' 'CAS']" becomes ['CAV', 'CAS'].
    Commas are NOT removed, so comma-separated input keeps its commas.
    """
    without_brackets = x.translate(str.maketrans('', '', "[]'"))
    return without_brackets.split(" ")

def epitope_converter(x):
    """Turn the text of a quoted-list cell into a list of its quoted items.

    Square brackets and newlines are deleted, the text is split on single
    quotes, and pieces that are empty or a single space (the separators
    between quoted items) are discarded. Items may contain spaces.
    """
    flattened = x.translate(str.maketrans('', '', "[]\n"))
    return [piece for piece in flattened.split("'") if piece not in ('', ' ')]

def peptide_hla_converter(x):
    """Extract all '<peptide> <HLA>' entries from the text of a list cell.

    Square brackets, newlines and single quotes are removed first. Every
    substring made of word characters, one whitespace character, one word
    character and one or more digits (e.g. 'NLVPMVATV A0201') is returned,
    in order of appearance.
    """
    cleaned = x.replace("[", "").replace("]", "").replace("\n", "").replace("'", "")
    # Raw string: '\w', '\s' and '\d' are invalid escapes in a normal string
    # literal and trigger a deprecation/syntax warning in recent Python versions.
    return re.findall(r"\w+\s{1}\w{1}\d+", cleaned)

def literal_converter(val):
    """Parse a cell with ``literal_eval``; an empty cell becomes an empty list."""
    if val == '':
        return []
    return literal_eval(val)

# Per-column parsers handed to pd.read_csv(..., converters=converters):
# each function receives the cell text of its column and returns the parsed value.
converters = {
    'peptide_HLA_lst': peptide_hla_converter,
    'umi_count_lst_mhc': literal_eval,
    'umi_count_lst_TRA': literal_converter,
    'umi_count_lst_TRB': literal_converter,
    'cdr3_lst_TRA': cdr3_lst_converter,
    'cdr3_lst_TRB': cdr3_lst_converter,
    'HLA_lst_mhc': cdr3_lst_converter,
    'HLA_cd8': HLA_cd8_converter,
}

In [5]:
def notnan(x):
    """Return the result of ``x == x``.

    NaN is the only float that does not equal itself, so for scalars this is
    False for NaN and True otherwise.
    """
    equals_itself = (x == x)
    return equals_itself

In [6]:
def get_multiplets(df):
    """Flag rows whose (ct, peptide_HLA) combination has more than one GEM.

    Counts the non-null ``gem`` values per (ct, peptide_HLA) group and maps
    the "count > 1" answer back onto every row of ``df``. Rows whose key is
    not found in the grouped result are flagged False.

    Returns an Index of booleans in the row order of ``df``.
    """
    keys = ['ct', 'peptide_HLA']
    gems_per_group = df.groupby(keys).gem.count()
    is_multiplet = gems_per_group > 1
    row_flags = df.set_index(keys).index.map(is_multiplet)
    return row_flags.fillna(False)

# Input

In [7]:
CLONOTYPES = '../experiments/exp13/run3/tcr/cellranger_tot/outs/per_sample_outs/cellranger_tot/vdj_t/consensus_annotations.csv'

In [8]:
clone_df = pd.read_csv(CLONOTYPES)

In [9]:
clone_df

Unnamed: 0,clonotype_id,consensus_id,length,chain,v_gene,d_gene,j_gene,c_gene,full_length,productive,...,fwr2_start,fwr2_end,cdr2_start,cdr2_end,fwr3_start,fwr3_end,cdr3_start,cdr3_end,fwr4_start,fwr4_end
0,clonotype1,clonotype1_consensus1,499,TRB,TRBV3-1,,TRBJ1-5,TRBC1,True,True,...,197,248,248,266,266,374,374,419,419,447
1,clonotype1,clonotype1_consensus2,613,TRA,TRAV30,,TRAJ38,TRAC,True,True,...,327,378,378,399,399,498,498,540,540,571
2,clonotype2,clonotype2_consensus1,576,TRB,TRBV7-9,,TRBJ1-1,TRBC1,True,True,...,265,316,316,334,334,445,445,496,496,524
3,clonotype2,clonotype2_consensus2,526,TRA,TRAV14/DV4,,TRAJ12,TRAC,True,True,...,234,285,285,309,309,408,408,453,453,484
4,clonotype3,clonotype3_consensus1,764,TRB,TRBV5-1,,TRBJ2-5,TRBC2,True,True,...,459,510,510,528,528,636,636,684,684,712
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4003,clonotype2268,clonotype2268_consensus2,474,TRA,TRAV38-1,,TRAJ20,TRAC,True,True,...,188,239,239,263,263,362,362,401,401,432
4004,clonotype2269,clonotype2269_consensus1,528,TRB,TRBV13,TRBD1,TRBJ2-4,TRBC2,True,True,...,223,274,274,292,292,400,400,448,448,476
4005,clonotype2269,clonotype2269_consensus2,526,TRA,TRAV14/DV4,,TRAJ21,TRAC,True,True,...,234,285,285,309,309,408,408,453,453,484
4006,clonotype2270,clonotype2270_consensus1,757,TRB,TRBV13,,TRBJ2-4,TRBC2,True,True,...,443,494,494,512,512,620,620,677,677,705


In [10]:
VALID = '../experiments/exp13/run3/cat/eval_clonotypes/valid_ct.csv'
#OS2 = '../experiments/exp13/run2/cat/eval_clonotypes/valid_ct.csv'

# Load

In [11]:
df = pd.read_csv(VALID, converters=converters)

In [12]:
# Missing UMI counts / deltas become 0 and missing CDR3 sequences become ''.
df.fillna({**dict.fromkeys(['umi_count_mhc', 'delta_umi_mhc', 'umi_count_mhc_rel',
                            'umi_count_cd8', 'delta_umi_cd8',
                            'umi_count_TRA', 'delta_umi_TRA',
                            'umi_count_TRB', 'delta_umi_TRB'], 0),
           **dict.fromkeys(['cdr3_TRA', 'cdr3_TRB'], '')},
          inplace=True)

# Clone df

In [13]:
# 'v_gene;j_gene;cdr3' identifier for every consensus sequence; missing values
# are rendered as empty strings. Built column-wise (same approach as the
# chain_a / chain_b strings of `df`) instead of a row-wise apply over the
# whole frame, which is slower and fails on an empty frame.
clone_df['genes'] = (clone_df['v_gene'].fillna('') + ';'
                     + clone_df['j_gene'].fillna('') + ';'
                     + clone_df['cdr3'].fillna(''))

In [14]:
clone_df.loc[clone_df.clonotype_id.isin(['clonotype9','clonotype5','clonotype29','clonotype191','clonotype4525']),
             ['clonotype_id','length','chain','full_length','productive','umis','genes']]

Unnamed: 0,clonotype_id,length,chain,full_length,productive,umis,genes
8,clonotype5,463,TRB,True,True,866,TRBV28;TRBJ2-7;CASSYVGYEQYF
9,clonotype5,668,TRA,True,True,115,TRAV8-6;TRAJ45;CAVSDRSGGGADGLTF
16,clonotype9,576,TRB,True,True,534,TRBV7-9;TRBJ1-6;CASSTHDSEGALSPLHF
17,clonotype9,621,TRA,True,True,223,TRAV3;TRAJ12;CAVRARMDSSYKLIF
55,clonotype29,835,TRB,True,True,165,TRBV5-1;TRBJ2-1;CASSATNEQFF
56,clonotype29,577,TRA,True,True,14,TRAV8-1;TRAJ39;CAIDGGDNAGNMLTF
409,clonotype191,496,TRB,True,True,10,TRBV12-4;TRBJ1-1;CARGREAEAFF
410,clonotype191,574,TRA,True,True,2,TRAV19;TRAJ17;CALSEAQFGAAGNKLTF


In [15]:
# Independent copies of the consensus annotations, one per chain type.
clone_a = clone_df.loc[clone_df['chain'] == 'TRA'].copy()
clone_b = clone_df.loc[clone_df['chain'] == 'TRB'].copy()

# Overview of clonotypes and their genes

In [19]:
# One row per clonotype_id with two columns, genes_TRA and genes_TRB, each
# holding the array of unique 'genes' strings of that chain. The outer merge
# keeps clonotypes that only have one of the two chains (other column is NaN).
clone1 = pd.merge(
    clone_a.groupby('clonotype_id')['genes'].unique().to_frame(),
    clone_b.groupby('clonotype_id')['genes'].unique().to_frame(),
    left_index=True,
    right_index=True,
    how='outer',
    suffixes=['_TRA', '_TRB'],
)
clone1

Unnamed: 0_level_0,genes_TRA,genes_TRB
clonotype_id,Unnamed: 1_level_1,Unnamed: 2_level_1
clonotype1,[TRAV30;TRAJ38;CGTEGAGNNRKLIW],[TRBV3-1;TRBJ1-5;CASSQVFGTPTPQHF]
clonotype10,[TRAV38-2/DV8;TRAJ54;CAYNQGAQKLVF],[TRBV9;TRBJ1-3;CASSYPTGGTSGNTIYF]
clonotype100,[TRAV17;TRAJ44;CATVASKLTF],[TRBV7-9;TRBJ2-1;CASTPTGLGVDEQFF]
clonotype1000,,[TRBV27;TRBJ2-3;CASSFGPLTDTQYF]
clonotype1001,,[TRBV5-6;TRBJ1-2;CASSSLVMGYGYTF]
...,...,...
clonotype995,,[TRBV27;TRBJ2-7;CASSFSTSSYEQYF]
clonotype996,,[TRBV27;TRBJ2-5;CASSPYRSGETQYF]
clonotype997,,[TRBV27;TRBJ1-4;CASSLRGSGEKLFF]
clonotype998,,[TRBV27;TRBJ2-2;CASSLGGSPGELFF]


In [20]:
clone1 = clone1.explode('genes_TRA').explode('genes_TRB')

In [22]:
clone1['chains'] = clone1.genes_TRA.fillna('') + '|' + clone1.genes_TRB.fillna('')

In [23]:
# Numeric clonotype number taken from the 'clonotype<N>' index labels.
# str.strip('clonotype') would strip any of the characters c/l/o/n/t/y/p/e from
# BOTH ends rather than the literal prefix, so remove the prefix explicitly.
clone1['clonotype'] = clone1.index.str.replace(r'^clonotype', '', regex=True).astype(int)

In [24]:
clone1.sort_values(by='clonotype', inplace=True)

# Only on most abundant chain

In [25]:
# Clonotype numbers as integers (0 is used below for GEMs without a clonotype).
df['num_clonotype'] = df['num_clonotype'].astype(int)

In [27]:
# 'V;J;CDR3' string per chain (missing parts become ''), and the combined
# 'alpha|beta' string used to compare complete TCRs.
df['chain_a'] = (df['v_gene_TRA'].fillna('') + ";"
                 + df['j_gene_TRA'].fillna('') + ";"
                 + df['cdr3_TRA'].fillna(''))
df['chain_b'] = (df['v_gene_TRB'].fillna('') + ";"
                 + df['j_gene_TRB'].fillna('') + ";"
                 + df['cdr3_TRB'].fillna(''))
df['chains'] = df['chain_a'] + '|' + df['chain_b']

In [28]:
def get_alpha_pairs(row):
    """Return 0/1 match flags on the alpha chain for ``row``.

    Reads the notebook globals ``c10`` and ``c00``. The result holds one flag
    per row of ``c10`` (1 when that row has the same ``chain_a`` as ``row``
    but a different ``clonotype``), followed by a single flag telling whether
    any row of ``c00`` has the same ``chain_a``.
    """
    same_alpha = c10.chain_a.isin([row.chain_a])
    other_clonotype = c10.clonotype != row.clonotype
    c10_flags = (same_alpha & other_clonotype).astype(int).to_list()
    c00_flag = int(any(c00.chain_a.isin([row.chain_a])))
    return c10_flags + [c00_flag]

In [29]:
def get_beta_pairs(row):
    """Return 0/1 match flags on the beta chain for ``row``.

    Reads the notebook globals ``c10`` and ``c00``. The result holds one flag
    per row of ``c10`` (1 when that row has the same ``chain_b`` as ``row``
    but a different ``clonotype``), followed by a single flag telling whether
    any row of ``c00`` has the same ``chain_b``.
    """
    same_beta = c10.chain_b.isin([row.chain_b])
    other_clonotype = c10.clonotype != row.clonotype
    c10_flags = (same_beta & other_clonotype).astype(int).to_list()
    c00_flag = int(any(c00.chain_b.isin([row.chain_b])))
    return c10_flags + [c00_flag]

In [30]:
def get_pairs(row):
    """Return 0/1 match flags on the full 'alpha|beta' string for ``row``.

    Reads the notebook globals ``c10`` and ``c00``. The result holds one flag
    per row of ``c10`` (1 when that row has the same ``chains`` as ``row``
    but a different ``clonotype``), followed by a single flag telling whether
    any row of ``c00`` has the same ``chains``.

    NOTE: a later cell defines ``get_pairs(row, ref, var, impute)``; once that
    cell has run, it replaces this one-argument version.
    """
    same_chains = c10.chains.isin([row.chains])
    other_clonotype = c10.clonotype != row.clonotype
    c10_flags = (same_chains & other_clonotype).astype(int).to_list()
    c00_flag = int(any(c00.chains.isin([row.chains])))
    return c10_flags + [c00_flag]

In [32]:
def get_pairs(row, ref, var='chains', impute=False):
    """Find the clonotypes in ``ref`` whose ``var`` value equals that of ``row``.

    Parameters
    ----------
    row : pandas.Series or mapping
        Query record; only ``row[var]`` is read.
    ref : pandas.DataFrame
        Reference table with a ``clonotype`` column and a ``var`` column
        (the notebook passes ``clone1``, ``c10`` or ``c00``).
    var : str, default 'chains'
        Column to match on ('chains', 'chain_a' or 'chain_b' in this notebook).
    impute : bool, default False
        Only used when ``var == 'chains'``: if truthy, a single match is
        returned as the clonotype to impute.

    Returns
    -------
    * ``var == 'chains'``, exactly one match and ``impute`` truthy: that clonotype.
    * ``var == 'chains'`` and more than one match: list of all matching
      clonotypes, in the row order of ``ref``.
    * ``var != 'chains'`` and exactly one match: that clonotype.
    * otherwise: ``np.nan``.
    """
    matches = ref.loc[ref[var].isin([row[var]]), 'clonotype'].to_list()
    n_matches = len(matches)

    if var == 'chains':
        # Short-circuit `and` instead of bitwise `&`, so any truthy `impute`
        # (not only True/1) enables imputation.
        if n_matches == 1 and impute:
            # Only unique matches are used when imputing.
            return matches[0]
        if n_matches > 1:
            # Replicas: several reference rows carry the same full TCR.
            return matches
    elif n_matches == 1:
        # Single-chain lookups are only trusted when the match is unique.
        return matches[0]
    return np.nan

### Merge c10 both chains

In [33]:
# Conversion table of clonotypes.
# Keys: "old" clonotype ID
# Vals: convert old cID to this clonotype ID
clonotype_dct = dict()

In [37]:
# Reference table of 10x clonotypes (num_clonotype != 0) that have both
# genes_TRA and genes_TRB: one row per clonotype with its chain strings and
# the number of GEMs carrying them ('gem' column). If a clonotype occurs with
# several chain combinations, only its last group row is kept.
tmp = df[df.num_clonotype != 0].dropna(subset=['genes_TRA', 'genes_TRB'])
c10 = (tmp.groupby(['num_clonotype', 'chain_a', 'chain_b', 'chains'])
          .gem.size()
          .reset_index()
          .drop_duplicates(subset='num_clonotype', keep='last')
          .rename(columns={'num_clonotype': 'clonotype'}))
c10

Unnamed: 0,clonotype,chain_a,chain_b,chains,gem
0,1,TRAV30;TRAJ38;CGTEGAGNNRKLIW,TRBV3-1;TRBJ1-5;CASSQVFGTPTPQHF,TRAV30;TRAJ38;CGTEGAGNNRKLIW|TRBV3-1;TRBJ1-5;C...,252
1,2,TRAV14/DV4;TRAJ12;CAMREGMDSSYKLIF,TRBV7-9;TRBJ1-1;CASSSHDRTGVRTEAFF,TRAV14/DV4;TRAJ12;CAMREGMDSSYKLIF|TRBV7-9;TRBJ...,171
2,3,TRAV1-1;TRAJ10;CAVRAITGGGNKLTF,TRBV5-1;TRBJ2-5;CASSTPSSGPQETQYF,TRAV1-1;TRAJ10;CAVRAITGGGNKLTF|TRBV5-1;TRBJ2-5...,156
3,4,TRAV8-3;TRAJ9;CAVVVRNTGGFKTIF,TRBV12-4;TRBJ2-1;CASTTGTSGRDYNEQFF,TRAV8-3;TRAJ9;CAVVVRNTGGFKTIF|TRBV12-4;TRBJ2-1...,124
4,5,TRAV8-6;TRAJ45;CAVSDRSGGGADGLTF,TRBV28;TRBJ2-7;CASSYVGYEQYF,TRAV8-6;TRAJ45;CAVSDRSGGGADGLTF|TRBV28;TRBJ2-7...,45
...,...,...,...,...,...
1474,2266,TRAV12-2;TRAJ30;CAVMNRDDKIIF,TRBV13;TRBJ1-2;CASSFYRTTSNYGYTF,TRAV12-2;TRAJ30;CAVMNRDDKIIF|TRBV13;TRBJ1-2;CA...,1
1475,2267,TRAV13-2;TRAJ41;CAENSNSGYALNF,TRBV13;TRBJ1-5;CASSLGTGTGNQPQHF,TRAV13-2;TRAJ41;CAENSNSGYALNF|TRBV13;TRBJ1-5;C...,1
1476,2268,TRAV38-1;TRAJ20;CAFRGPNDYKLSF,TRBV13;TRBJ1-6;CASSPKGTDGNSPLHF,TRAV38-1;TRAJ20;CAFRGPNDYKLSF|TRBV13;TRBJ1-6;C...,1
1477,2269,TRAV14/DV4;TRAJ21;CAMREGLGNFNKFYF,TRBV13;TRBJ2-4;CASSTGGAAAKNIQYF,TRAV14/DV4;TRAJ21;CAMREGLGNFNKFYF|TRBV13;TRBJ2...,1


### Merge 10x clonotypes

In [40]:
# Find 10x clonotypes that carry exactly the same 'alpha|beta' string.
# Only clonotypes with both chains present are queried.
query = clone1.sort_values(by='clonotype').dropna(subset=['genes_TRA','genes_TRB']).copy()
# With the defaults (var='chains', impute=False) get_pairs returns the list of
# matching clonotypes only when more than one row of clone1 matches, else NaN.
query['pairs'] = query.apply(lambda row: get_pairs(row, ref=clone1), axis=1)
# Keep one row per distinct chains string and map its clonotype to the
# remaining matches (pairs[1:]).
# NOTE(review): this presumes pairs[0] is the clonotype of the retained row,
# which relies on clone1 already being sorted by 'clonotype' - confirm.
dct = query.dropna().drop_duplicates(subset='chains').set_index('clonotype').apply(lambda row: row.pairs[1:], axis=1).to_dict()
# Invert to {duplicate clonotype: clonotype it should be converted to}.
dct = {i:x for x,y in dct.items() for i in y}
clonotype_dct.update(dct)
query.dropna(subset=['pairs']).head()

Unnamed: 0_level_0,genes_TRA,genes_TRB,chains,clonotype,pairs
clonotype_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
clonotype33,TRAV23/DV6;TRAJ48;CAASIGNFGNEKLTF,TRBV4-3;TRBJ1-1;CASSPSRNTEAFF,TRAV23/DV6;TRAJ48;CAASIGNFGNEKLTF|TRBV4-3;TRBJ...,33,"[33, 122, 212, 906, 911, 915]"
clonotype46,TRAV2;TRAJ20;CAVEADDYKLSF,TRBV30;TRBJ2-3;CAWSLERDRLKDTQYF,TRAV2;TRAJ20;CAVEADDYKLSF|TRBV30;TRBJ2-3;CAWSL...,46,"[46, 1095]"
clonotype46,TRAV3;TRAJ18;CAVRDLARGSTLGRLYF,TRBV30;TRBJ2-3;CAWSLERDRLKDTQYF,TRAV3;TRAJ18;CAVRDLARGSTLGRLYF|TRBV30;TRBJ2-3;...,46,"[46, 1095]"
clonotype61,TRAV23/DV6;TRAJ48;CAASIGNFGNEKLTF,TRBV4-3;TRBJ1-1;CASSPARNTEAFF,TRAV23/DV6;TRAJ48;CAASIGNFGNEKLTF|TRBV4-3;TRBJ...,61,"[61, 89, 908, 909]"
clonotype62,TRAV23/DV6;TRAJ48;CAASIGNFGNEKLTF,TRBV4-3;TRBJ1-1;CASSPQRNTEAFF,TRAV23/DV6;TRAJ48;CAASIGNFGNEKLTF|TRBV4-3;TRBJ...,62,"[62, 914]"


In [43]:
# Same lookup per GEM: GEMs with a 10x clonotype and both chains annotated,
# matched against clone1 on the full 'alpha|beta' string (inspection only).
query = (df.loc[df.num_clonotype != 0,
                ['gem', 'num_clonotype', 'chain_a', 'chain_b', 'chains', 'genes_TRA', 'genes_TRB']]
           .dropna(subset=['genes_TRA', 'genes_TRB'])
           .rename(columns={'num_clonotype': 'clonotype'}))
query['pairs'] = query.apply(lambda row: get_pairs(row, ref=clone1), axis=1)
query.dropna().head()

Unnamed: 0,gem,clonotype,chain_a,chain_b,chains,genes_TRA,genes_TRB,pairs
138,AAGCCGCGTCACCTAA-1,61,TRAV23/DV6;TRAJ48;CAASIGNFGNEKLTF,TRBV4-3;TRBJ1-1;CASSPARNTEAFF,TRAV23/DV6;TRAJ48;CAASIGNFGNEKLTF|TRBV4-3;TRBJ...,TRAV23/DV6;TRAJ48;TRAC,TRBV4-3;;TRBJ1-1;TRBC1,"[61, 89, 908, 909]"
168,AAGGTTCGTCAGAGGT-1,62,TRAV23/DV6;TRAJ48;CAASIGNFGNEKLTF,TRBV4-3;TRBJ1-1;CASSPQRNTEAFF,TRAV23/DV6;TRAJ48;CAASIGNFGNEKLTF|TRBV4-3;TRBJ...,TRAV23/DV6;TRAJ48;TRAC,TRBV4-3;;TRBJ1-1;TRBC1,"[62, 914]"
279,ACATGGTCAGGCAGTA-1,910,TRAV23/DV6;TRAJ48;CAASIGNFGNEKLTF,TRBV4-3;TRBJ1-1;CASSPNRNTEAFF,TRAV23/DV6;TRAJ48;CAASIGNFGNEKLTF|TRBV4-3;TRBJ...,TRAV23/DV6;TRAJ48;TRAC,TRBV4-3;;TRBJ1-1;TRBC1,"[175, 910, 912]"
483,ACTTTCAGTAGGGTAC-1,1133,TRAV38-2/DV8;TRAJ28;CAAGSYQLTF,TRBV7-9;TRBJ2-1;CASSGDSETYNEQFF,TRAV38-2/DV8;TRAJ28;CAAGSYQLTF|TRBV7-9;TRBJ2-1...,TRAV38-2/DV8;TRAJ28;TRAC,TRBV7-9;;TRBJ2-1;TRBC2,"[1133, 1780]"
558,AGCAGCCTCCTAGTGA-1,581,TRAV12-1;TRAJ49;CVVNMGGNQFYF,TRBV30;TRBJ2-5;CAWKPPGDQETQYF,TRAV12-1;TRAJ49;CVVNMGGNQFYF|TRBV30;TRBJ2-5;CA...,TRAV12-1;TRAJ49;TRAC,TRBV30;;TRBJ2-5;TRBC2,"[581, 582]"


In [44]:
#query = c10.copy()
#query['pairs'] = query.apply(lambda row: get_pairs(row, ref=c10), axis=1)
#dct = query.dropna().drop_duplicates(subset='chains').set_index('clonotype').apply(lambda row: row.pairs[1:], axis=1).to_dict()
#dct = {i:x for x,y in dct.items() for i in y}
#clonotype_dct.update(dct)
#query.dropna(subset=['pairs'])

In [45]:
#c10['pairs'] = c10.apply(lambda row: get_pairs(row), axis=1)

In [46]:
#c10['keep'] = c10.dropna(subset=['pairs']).apply(lambda row: row.clonotype == row.pairs[0], axis=1)

In [33]:
# Listing the duplicates
#clonotype_dct = c10[c10.keep == True].set_index('clonotype').apply(lambda row: row.pairs[1:], axis=1).to_dict()
# alternative to keep
#query[query.duplicated(subset=['chains'])].set_index('clonotype').apply(lambda row: row.pairs[1:], axis=1).to_dict()
#query.dropna().drop_duplicates(subset='chains').set_index('clonotype').apply(lambda row: row.pairs[1:], axis=1).to_dict()

In [47]:
c10.drop_duplicates(subset=['chains'], inplace=True) # New updated reference

In [48]:
c10

Unnamed: 0,clonotype,chain_a,chain_b,chains,gem
0,1,TRAV30;TRAJ38;CGTEGAGNNRKLIW,TRBV3-1;TRBJ1-5;CASSQVFGTPTPQHF,TRAV30;TRAJ38;CGTEGAGNNRKLIW|TRBV3-1;TRBJ1-5;C...,252
1,2,TRAV14/DV4;TRAJ12;CAMREGMDSSYKLIF,TRBV7-9;TRBJ1-1;CASSSHDRTGVRTEAFF,TRAV14/DV4;TRAJ12;CAMREGMDSSYKLIF|TRBV7-9;TRBJ...,171
2,3,TRAV1-1;TRAJ10;CAVRAITGGGNKLTF,TRBV5-1;TRBJ2-5;CASSTPSSGPQETQYF,TRAV1-1;TRAJ10;CAVRAITGGGNKLTF|TRBV5-1;TRBJ2-5...,156
3,4,TRAV8-3;TRAJ9;CAVVVRNTGGFKTIF,TRBV12-4;TRBJ2-1;CASTTGTSGRDYNEQFF,TRAV8-3;TRAJ9;CAVVVRNTGGFKTIF|TRBV12-4;TRBJ2-1...,124
4,5,TRAV8-6;TRAJ45;CAVSDRSGGGADGLTF,TRBV28;TRBJ2-7;CASSYVGYEQYF,TRAV8-6;TRAJ45;CAVSDRSGGGADGLTF|TRBV28;TRBJ2-7...,45
...,...,...,...,...,...
1474,2266,TRAV12-2;TRAJ30;CAVMNRDDKIIF,TRBV13;TRBJ1-2;CASSFYRTTSNYGYTF,TRAV12-2;TRAJ30;CAVMNRDDKIIF|TRBV13;TRBJ1-2;CA...,1
1475,2267,TRAV13-2;TRAJ41;CAENSNSGYALNF,TRBV13;TRBJ1-5;CASSLGTGTGNQPQHF,TRAV13-2;TRAJ41;CAENSNSGYALNF|TRBV13;TRBJ1-5;C...,1
1476,2268,TRAV38-1;TRAJ20;CAFRGPNDYKLSF,TRBV13;TRBJ1-6;CASSPKGTDGNSPLHF,TRAV38-1;TRAJ20;CAFRGPNDYKLSF|TRBV13;TRBJ1-6;C...,1
1477,2269,TRAV14/DV4;TRAJ21;CAMREGLGNFNKFYF,TRBV13;TRBJ2-4;CASSTGGAAAKNIQYF,TRAV14/DV4;TRAJ21;CAMREGLGNFNKFYF|TRBV13;TRBJ2...,1


### Impute c10 alpha
Only unique matches, i.e. only one match in the reference clonotypes!

In [51]:
# 10x clonotypes that lack genes_TRB, are not part of the paired reference c10
# and do have genes_TRA: one row per clonotype / chain combination with its
# GEM count.
query = (df[(df.num_clonotype != 0) & df.genes_TRB.isna() & ~df.num_clonotype.isin(c10.clonotype)]
         .dropna(subset=['genes_TRA'])
         .groupby(['num_clonotype', 'chain_a', 'chain_b', 'chains'])
         .size()
         .reset_index()
         .rename(columns={'num_clonotype': 'clonotype', 0: 'gem'}))
# Match on the alpha chain; get_pairs yields a clonotype only for a unique match.
query['pairs'] = query.apply(lambda row: get_pairs(row, ref=c10, var='chain_a'), axis=1)
# {alpha-only clonotype: paired clonotype it is converted to}
dct = query.dropna().set_index('clonotype')['pairs'].astype(int).to_dict()
clonotype_dct.update(dct)
query.dropna().head()

Unnamed: 0,clonotype,chain_a,chain_b,chains,gem,pairs
6,393,TRAV26-1;TRAJ20;CIAPDLYDYKLSF,;;,TRAV26-1;TRAJ20;CIAPDLYDYKLSF|;;,1,322.0
15,537,TRAV21;TRAJ26;CAVINYGQNFVF,;;,TRAV21;TRAJ26;CAVINYGQNFVF|;;,1,24.0
16,538,TRAV21;TRAJ26;CAVINYGQNFVF,;;,TRAV21;TRAJ26;CAVINYGQNFVF|;;,1,24.0
17,539,TRAV21;TRAJ26;CAVINYGQNFVF,;;,TRAV21;TRAJ26;CAVINYGQNFVF|;;,1,24.0
19,541,TRAV30;TRAJ47;CGTGEYGNKLVF,;;,TRAV30;TRAJ47;CGTGEYGNKLVF|;;,1,169.0


### Impute c10 beta
Only unique matches, i.e. only one match in the reference clonotypes!

In [54]:
# 10x clonotypes that lack genes_TRA, are not part of the paired reference c10
# and do have genes_TRB: one row per clonotype / chain combination with its
# GEM count.
query = (df[(df.num_clonotype != 0) & df.genes_TRA.isna() & ~df.num_clonotype.isin(c10.clonotype)]
         .dropna(subset=['genes_TRB'])
         .groupby(['num_clonotype', 'chain_a', 'chain_b', 'chains'])
         .size()
         .reset_index()
         .rename(columns={'num_clonotype': 'clonotype', 0: 'gem'}))
# Match on the beta chain; get_pairs yields a clonotype only for a unique match.
query['pairs'] = query.apply(lambda row: get_pairs(row, ref=c10, var='chain_b'), axis=1)
# {beta-only clonotype: paired clonotype it is converted to}
dct = query.dropna().set_index('clonotype')['pairs'].astype(int).to_dict()
clonotype_dct.update(dct)
query.dropna().head()

Unnamed: 0,clonotype,chain_a,chain_b,chains,gem,pairs
3,122,;;,TRBV4-3;TRBJ1-1;CASSPSRNTEAFF,;;|TRBV4-3;TRBJ1-1;CASSPSRNTEAFF,1,33.0
6,212,;;,TRBV4-3;TRBJ1-1;CASSPSRNTEAFF,;;|TRBV4-3;TRBJ1-1;CASSPSRNTEAFF,1,33.0
42,516,;;,TRBV6-2;TRBJ2-1;CASSWDHNEQFF,;;|TRBV6-2;TRBJ2-1;CASSWDHNEQFF,1,120.0
43,517,;;,TRBV6-2;TRBJ2-1;CASSWDHNEQFF,;;|TRBV6-2;TRBJ2-1;CASSWDHNEQFF,1,120.0
44,518,;;,TRBV6-2;TRBJ2-1;CASSWDHNEQFF,;;|TRBV6-2;TRBJ2-1;CASSWDHNEQFF,1,120.0


# Now impute clonotype None

In [55]:
# Conversion table
# GEM to imputed clonotype ID
clonotype_nll = dict()

### Impute c0 beta from 10x

In [57]:
# GEMs without a clonotype (num_clonotype == 0) that only have a beta chain.
# After the rename, 'clonotype' holds the GEM barcode and 'gem' the row count.
query = (df[(df.num_clonotype == 0) & df.genes_TRA.isna() & ~df.num_clonotype.isin(c10.clonotype)]
         .dropna(subset=['genes_TRB'])
         .groupby(['gem', 'chain_a', 'chain_b', 'chains'])
         .size()
         .reset_index()
         .rename(columns={'gem': 'clonotype', 0: 'gem'}))
# Match on the beta chain against c10; only unique matches give a clonotype.
query['pairs'] = query.apply(lambda row: get_pairs(row, ref=c10, var='chain_b'), axis=1)
# {GEM barcode: imputed clonotype}
dct = query.dropna().set_index('clonotype')['pairs'].astype(int).to_dict()
clonotype_nll.update(dct)
query.head()

Unnamed: 0,clonotype,chain_a,chain_b,chains,gem,pairs
0,AAACCTGAGCTGTTCA-1,;;,TRBV11-2;TRBJ2-3;CASSLQTGRTDTQYF,;;|TRBV11-2;TRBJ2-3;CASSLQTGRTDTQYF,1,19.0
1,AAACCTGTCGCTTAGA-1,;;,TRBV28;TRBJ2-7;CASSYVGYEQYF,;;|TRBV28;TRBJ2-7;CASSYVGYEQYF,1,5.0
2,AAACGGGGTACAGTGG-1,;;,TRBV27;TRBJ1-3;CASSSDIYSGNTIYF,;;|TRBV27;TRBJ1-3;CASSSDIYSGNTIYF,1,
3,AAACGGGGTCCGTGAC-1,;;,TRBV7-9;TRBJ1-6;CASSSHDWGGQGSPLHF,;;|TRBV7-9;TRBJ1-6;CASSSHDWGGQGSPLHF,1,40.0
4,AAACGGGGTCGACTGC-1,;;,TRBV6-1;TRBJ2-5;CASNHEYQETQYF,;;|TRBV6-1;TRBJ2-5;CASNHEYQETQYF,1,


### Impute c0 alpha from 10x

In [59]:
# GEMs without a clonotype (num_clonotype == 0) that only have an alpha chain.
# After the rename, 'clonotype' holds the GEM barcode and 'gem' the row count.
query = (df[(df.num_clonotype == 0) & df.genes_TRB.isna() & ~df.num_clonotype.isin(c10.clonotype)]
         .dropna(subset=['genes_TRA'])
         .groupby(['gem', 'chain_a', 'chain_b', 'chains'])
         .size()
         .reset_index()
         .rename(columns={'gem': 'clonotype', 0: 'gem'}))
# Match on the alpha chain against c10; only unique matches give a clonotype.
query['pairs'] = query.apply(lambda row: get_pairs(row, ref=c10, var='chain_a'), axis=1)
# {GEM barcode: imputed clonotype}
dct = query.dropna().set_index('clonotype')['pairs'].astype(int).to_dict()
clonotype_nll.update(dct)
query

Unnamed: 0,clonotype,chain_a,chain_b,chains,gem,pairs
0,AAACGGGCAGGCGATA-1,TRAV14/DV4;TRAJ12;CAMREGMDSSYKLIF,;;,TRAV14/DV4;TRAJ12;CAMREGMDSSYKLIF|;;,1,2.0
1,AAACGGGGTCATGCCG-1,TRAV29/DV5;TRAJ57;CAAKTQGGSEKLVF,;;,TRAV29/DV5;TRAJ57;CAAKTQGGSEKLVF|;;,1,
2,AAAGATGCATCATCCC-1,TRAV12-2;TRAJ47;CAVNSISGYGNKLVF,;;,TRAV12-2;TRAJ47;CAVNSISGYGNKLVF|;;,1,
3,AACACGTTCCATGAAC-1,TRAV2;TRAJ31;CAVEDYAARLMF,;;,TRAV2;TRAJ31;CAVEDYAARLMF|;;,1,
4,AACTCAGCAAGCCGCT-1,TRAV17;TRAJ58;CATTPETSGSRLTF,;;,TRAV17;TRAJ58;CATTPETSGSRLTF|;;,1,
...,...,...,...,...,...,...
295,TTTATGCCAAGCTGGA-1,TRAV23/DV6;TRAJ48;CAASIGSFGNEKLTF,;;,TRAV23/DV6;TRAJ48;CAASIGSFGNEKLTF|;;,1,
296,TTTGCGCAGTGCTGCC-1,TRAV19;TRAJ33;CALSEAGSNYQLIW,;;,TRAV19;TRAJ33;CALSEAGSNYQLIW|;;,1,42.0
297,TTTGTCAAGGGTCTCC-1,TRAV12-1;TRAJ29;CYPDTPLVF,;;,TRAV12-1;TRAJ29;CYPDTPLVF|;;,1,
298,TTTGTCATCAAACCAC-1,TRAV21;TRAJ26;CAVKATNYGQNFVF,;;,TRAV21;TRAJ26;CAVKATNYGQNFVF|;;,1,12.0


### Impute c0 both chains from 10x

In [61]:
# GEMs without a clonotype (num_clonotype == 0) that have both chains.
# After the rename, 'clonotype' holds the GEM barcode and 'gem' the row count.
query = (df[(df.num_clonotype == 0) & ~df.num_clonotype.isin(c10.clonotype)]
         .dropna(subset=['genes_TRA', 'genes_TRB'])
         .groupby(['gem', 'chain_a', 'chain_b', 'chains'])
         .size()
         .reset_index()
         .rename(columns={'gem': 'clonotype', 0: 'gem'}))
# Match on the full 'alpha|beta' string against c10; with impute=True a single
# match is returned as the clonotype to assign.
query['pairs'] = query.apply(lambda row: get_pairs(row, ref=c10, impute=True), axis=1)
# {GEM barcode: imputed clonotype}
dct = query.dropna().set_index('clonotype')['pairs'].astype(int).to_dict()
clonotype_nll.update(dct)
query.head()

Unnamed: 0,clonotype,chain_a,chain_b,chains,gem,pairs
0,AACACGTAGTGGTAGC-1,TRAV38-2/DV8;TRAJ43;CAYQWGGDMRF,TRBV5-1;TRBJ2-1;CASSATNEQFF,TRAV38-2/DV8;TRAJ43;CAYQWGGDMRF|TRBV5-1;TRBJ2-...,1,
1,AACTCCCGTGATAAAC-1,TRAV9-2;TRAJ45;CALSESMYSGGGADGLTF,TRBV11-3;TRBJ2-1;CASSLGPYNEQFF,TRAV9-2;TRAJ45;CALSESMYSGGGADGLTF|TRBV11-3;TRB...,1,
2,AACTCTTCAGGTTTCA-1,TRAV14/DV4;TRAJ12;CAMREGMDSSYKLIF,TRBV7-9;TRBJ1-1;CASSSHDRTGVRTEAFF,TRAV14/DV4;TRAJ12;CAMREGMDSSYKLIF|TRBV7-9;TRBJ...,1,2.0
3,AACTCTTTCCACGAAT-1,TRAV22;TRAJ16;CAVLTRFSDGQKLLF,TRBV11-3;TRBJ1-2;CASSLDPGGYGYTF,TRAV22;TRAJ16;CAVLTRFSDGQKLLF|TRBV11-3;TRBJ1-2...,1,
4,AACTGGTGTAGCGTCC-1,TRAV38-2/DV8;TRAJ58;CAYSWETSGSRLTF,TRBV7-8;TRBJ2-1;CASTLSSGLAGGRGNEQFF,TRAV38-2/DV8;TRAJ58;CAYSWETSGSRLTF|TRBV7-8;TRB...,1,


# Remaining Null clonotypes

In [62]:
# Dictionary of all GEMs that contain TCR replicas.
# Key: a GEM barcode; value: the list of GEM barcodes it is grouped with.
# TODO: later replace the keys with a novel clonotype ID and invert the dict?
gem_nll = dict()

### Merge c0 both

In [64]:
# c00: GEMs without a clonotype that were not imputed above (not a key of
# clonotype_nll) and have both chains. 'clonotype' holds the GEM barcode.
c00 = (df[(df.num_clonotype == 0) & ~df.gem.isin(clonotype_nll.keys())]
       .dropna(subset=['genes_TRA', 'genes_TRB'])
       .groupby(['gem', 'chain_a', 'chain_b', 'chains'])
       .size()
       .reset_index()
       .rename(columns={'gem': 'clonotype', 0: 'gem'}))
query = c00.copy()
# Without imputation get_pairs returns a list only when several c00 rows share
# the same 'alpha|beta' string, i.e. for replicas of the same TCR.
query['pairs'] = query.apply(lambda row: get_pairs(row, ref=c00, impute=False), axis=1)
# {GEM barcode: list of GEM barcodes carrying the same chains}
dct = query.dropna().set_index('clonotype')['pairs'].to_dict()
gem_nll.update(dct)
query.head()

Unnamed: 0,clonotype,chain_a,chain_b,chains,gem,pairs
0,AACACGTAGTGGTAGC-1,TRAV38-2/DV8;TRAJ43;CAYQWGGDMRF,TRBV5-1;TRBJ2-1;CASSATNEQFF,TRAV38-2/DV8;TRAJ43;CAYQWGGDMRF|TRBV5-1;TRBJ2-...,1,
1,AACTCCCGTGATAAAC-1,TRAV9-2;TRAJ45;CALSESMYSGGGADGLTF,TRBV11-3;TRBJ2-1;CASSLGPYNEQFF,TRAV9-2;TRAJ45;CALSESMYSGGGADGLTF|TRBV11-3;TRB...,1,
2,AACTCTTTCCACGAAT-1,TRAV22;TRAJ16;CAVLTRFSDGQKLLF,TRBV11-3;TRBJ1-2;CASSLDPGGYGYTF,TRAV22;TRAJ16;CAVLTRFSDGQKLLF|TRBV11-3;TRBJ1-2...,1,
3,AACTGGTGTAGCGTCC-1,TRAV38-2/DV8;TRAJ58;CAYSWETSGSRLTF,TRBV7-8;TRBJ2-1;CASTLSSGLAGGRGNEQFF,TRAV38-2/DV8;TRAJ58;CAYSWETSGSRLTF|TRBV7-8;TRB...,1,
4,AAGCCGCTCGCCTGAG-1,TRAV1-2;TRAJ33;CAVRDSNYQLIW,TRBV5-1;TRBJ2-7;CASSLEGQASSYEQYF,TRAV1-2;TRAJ33;CAVRDSNYQLIW|TRBV5-1;TRBJ2-7;CA...,1,


### Merge c0 both chains

In [65]:
# Same same but different..
#lst = query[query.pairs.isna() & query.chains.duplicated(keep=False)].sort_values(by='chains').groupby('chains').clonotype.apply(list).values
#dct = {x[0]:x for x in lst}
##gem_nll.update(dct)
#dct

### Impute c0 beta
Find complete c0 TCRs that match on the beta chain

In [66]:
# Beta-only GEMs without a clonotype that were not imputed from 10x clonotypes.
# 'clonotype' holds the GEM barcode after the rename.
query = (df[(df.num_clonotype == 0) & df.genes_TRA.isna() & ~df.gem.isin(clonotype_nll.keys())]
         .dropna(subset=['genes_TRB'])
         .groupby(['gem', 'chain_a', 'chain_b', 'chains'])
         .size()
         .reset_index()
         .rename(columns={'gem': 'clonotype', 0: 'gem'}))
# Unique beta-chain match among the complete c0 TCRs (c00), else NaN.
query['pairs'] = query.apply(lambda row: get_pairs(row, ref=c00, var='chain_b'), axis=1)
# {beta-only GEM: [beta-only GEM, matching complete GEM]}
lst = query.dropna().apply(lambda row: [row.clonotype, row.pairs], axis=1).values
dct = {gems[0]: gems for gems in lst}
gem_nll.update(dct)
query.dropna().head()

Unnamed: 0,clonotype,chain_a,chain_b,chains,gem,pairs
5,AACACGTCACCTATCC-1,;;,TRBV28;TRBJ2-7;CASSLNIGSSYEQYF,;;|TRBV28;TRBJ2-7;CASSLNIGSSYEQYF,1,CTCGAGGTCAGCACAT-1
26,ACCAGTATCAGTTAGC-1,;;,TRBV29-1;TRBJ1-2;CSVEGYWPGAGYGYTF,;;|TRBV29-1;TRBJ1-2;CSVEGYWPGAGYGYTF,1,CCTAAAGGTCCGTTAA-1
27,ACCCACTGTCACTGGC-1,;;,TRBV27;TRBJ2-2;CASSPDRSRANTGELFF,;;|TRBV27;TRBJ2-2;CASSPDRSRANTGELFF,1,CCACGGAAGGCGTACA-1
44,ACTTACTGTATCTGCA-1,;;,TRBV27;TRBJ2-2;CASSPDRSRANTGELFF,;;|TRBV27;TRBJ2-2;CASSPDRSRANTGELFF,1,CCACGGAAGGCGTACA-1
58,AGGGATGAGTCATGCT-1,;;,TRBV7-9;TRBJ2-7;CASREGLQYEQYF,;;|TRBV7-9;TRBJ2-7;CASREGLQYEQYF,1,CTGTTTAAGAACTCGG-1


### Merge c0 beta

In [67]:
# Beta-only GEMs (from the previous cell's `query`) without a c00 match whose
# beta chain occurs more than once: group them by beta chain and register
# each group under its first GEM.
lst = (query[query.pairs.isna() & query.chain_b.duplicated(keep=False)]
       .sort_values(by='chain_b')
       .groupby('chain_b')['clonotype']
       .apply(list)
       .values)
dct = {gems[0]: gems for gems in lst}
gem_nll.update(dct)

### Impute c0 alpha

In [68]:
# Alpha-only GEMs without a clonotype that were not imputed from 10x clonotypes.
# 'clonotype' holds the GEM barcode after the rename.
query = (df[(df.num_clonotype == 0) & df.genes_TRB.isna() & ~df.gem.isin(clonotype_nll.keys())]
         .dropna(subset=['genes_TRA'])
         .groupby(['gem', 'chain_a', 'chain_b', 'chains'])
         .size()
         .reset_index()
         .rename(columns={'gem': 'clonotype', 0: 'gem'}))
# Unique alpha-chain match among the complete c0 TCRs (c00), else NaN.
query['pairs'] = query.apply(lambda row: get_pairs(row, ref=c00, var='chain_a'), axis=1)
# {alpha-only GEM: [alpha-only GEM, matching complete GEM]}
lst = query.dropna().apply(lambda row: [row.clonotype, row.pairs], axis=1).values
dct = {gems[0]: gems for gems in lst}
gem_nll.update(dct)

### merge replicas of c0 alpha

In [69]:
# Alpha-only GEMs (from the previous cell's `query`) without a c00 match whose
# alpha chain occurs more than once: group them by alpha chain and register
# each group under its first GEM.
lst = (query[query.pairs.isna() & query.chain_a.duplicated(keep=False)]
       .sort_values(by='chain_a')
       .groupby('chain_a')['clonotype']
       .apply(list)
       .values)
dct = {gems[0]: gems for gems in lst}
gem_nll.update(dct)

# Comparing PAIRS vs single chain

In [117]:
# For every reference clonotype: a 0/1 array over all rows of c10 marking the
# OTHER clonotypes that carry exactly the same 'alpha|beta' string.
c10['pairs'] = c10.apply(
    lambda row: ((c10.clonotype != row.clonotype)
                 & c10.chains.isin([row.chains])).values.astype(int),
    axis=1)

In [91]:
#c10['pairs'] = c10.apply(lambda row: c10.loc[c10.chain_a.isin([row.chain_a]) &
#                                             (c10.clonotype != row.clonotype), 'clonotype'].to_list(), axis=1)

In [75]:
#c10['clones'] = c10.apply(lambda row: c10.clonotype.to_list() + ['c0'], axis=1)
c10['clones'] = c10.apply(lambda row: c10.clonotype.to_list(), axis=1)

In [81]:
tmp = pd.DataFrame(columns=['clonotype','pairs','clones'])

In [82]:
tmp['clonotype'] = ['c0']

In [83]:
# Count matches of clonotype 0 with the true clonotypes + itself
tmp['pairs'] =  [c10.chains.isin(c00.chains).astype(int).to_list() + [0]]
#[c10.chain_b.isin(c00.chain_b).astype(int).to_list() + [0]]

In [84]:
# List the true clonotypes
tmp['clones'] = [c10.clonotype.to_list() + ['c0']]

In [119]:
c = pd.concat([c10,tmp], ignore_index=True)

In [122]:
m = c.explode(['pairs','clones'])
m

Unnamed: 0,clonotype,chain_a,chain_b,chains,gem,clones,pairs
0,1,TRAV30;TRAJ38;CGTEGAGNNRKLIW,TRBV3-1;TRBJ1-5;CASSQVFGTPTPQHF,TRAV30;TRAJ38;CGTEGAGNNRKLIW|TRBV3-1;TRBJ1-5;C...,252.0,1,0
0,1,TRAV30;TRAJ38;CGTEGAGNNRKLIW,TRBV3-1;TRBJ1-5;CASSQVFGTPTPQHF,TRAV30;TRAJ38;CGTEGAGNNRKLIW|TRBV3-1;TRBJ1-5;C...,252.0,2,0
0,1,TRAV30;TRAJ38;CGTEGAGNNRKLIW,TRBV3-1;TRBJ1-5;CASSQVFGTPTPQHF,TRAV30;TRAJ38;CGTEGAGNNRKLIW|TRBV3-1;TRBJ1-5;C...,252.0,3,0
0,1,TRAV30;TRAJ38;CGTEGAGNNRKLIW,TRBV3-1;TRBJ1-5;CASSQVFGTPTPQHF,TRAV30;TRAJ38;CGTEGAGNNRKLIW|TRBV3-1;TRBJ1-5;C...,252.0,4,0
0,1,TRAV30;TRAJ38;CGTEGAGNNRKLIW,TRBV3-1;TRBJ1-5;CASSQVFGTPTPQHF,TRAV30;TRAJ38;CGTEGAGNNRKLIW|TRBV3-1;TRBJ1-5;C...,252.0,5,0
...,...,...,...,...,...,...,...
1433,c0,,,,,2267,0
1433,c0,,,,,2268,0
1433,c0,,,,,2269,0
1433,c0,,,,,2270,0


In [124]:
m[(m.pairs > 0)]

Unnamed: 0,clonotype,chain_a,chain_b,chains,gem,clones,pairs


In [125]:
# Wide match matrix: one row per source clonotype ('from'), one column per
# target clonotype; zeros are blanked out as NaN so only matches remain.
out = (c.explode(['pairs', 'clones'])
        .pivot(index='clonotype', columns='clones', values='pairs')
        .replace(0, np.nan)
        .reset_index()
        .rename(columns={'clonotype': 'from'}))

In [316]:
#out.to_csv('arc_data.b.csv', index=False)

In [127]:
out.dropna(how='all')

clones,from,1,2,3,4,5,6,7,8,9,...,2257,2258,2259,2265,2266,2267,2268,2269,2270,c0
0,1,,,,,,,,,,...,,,,,,,,,,
1,2,,,,,,,,,,...,,,,,,,,,,
2,3,,,,,,,,,,...,,,,,,,,,,
3,4,,,,,,,,,,...,,,,,,,,,,
4,5,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1429,2267,,,,,,,,,,...,,,,,,,,,,
1430,2268,,,,,,,,,,...,,,,,,,,,,
1431,2269,,,,,,,,,,...,,,,,,,,,,
1432,2270,,,,,,,,,,...,,,,,,,,,,
