In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from ast import literal_eval
import seaborn as sns

In [2]:
import sys  
sys.path.insert(0, '../scripts')

from D_plot_specificity_matrix_utils import (peptide_per_clonotype_by_gem_size,
                                             multiple_peptides_per_gem_w_filtering,
                                             calc_binding_concordance,
                                             epitope_sorter_index,
                                             peptides_per_gem)

In [3]:
sns.set_style('ticks', {'axes.edgecolor': '0',  
                        'xtick.color': '0',
                        'ytick.color': '0'})

In [4]:
def HLA_cd8_converter(x):
    #define format of datetime
    return x.replace("[","").replace("]","").replace(",", "").replace("'","").split(" ")

def cdr3_lst_converter(x):
    #define format of datetime
    return x.replace("[","").replace("]","").replace("'","").split(" ")

def epitope_converter(x):
    #define format of datetime
    return [y for y in x.replace("[","").replace("]","").replace("\n","").split("'") if (y != '') & (y != ' ')]

def peptide_hla_converter(x):
    return re.findall("\w+\s{1}\w{1}\d+", x.replace("[","").replace("]","").replace("\n","").replace("'",""))

def literal_converter(val):
    # replace NaN with '' and perform literal eval on the rest
    return [] if val == '' else literal_eval(val)

converters = {'peptide_HLA_lst': peptide_hla_converter,
              'umi_count_lst_mhc': literal_eval,
              'umi_count_lst_TRA': literal_converter,'umi_count_lst_TRB': literal_converter,
              'cdr3_lst_TRA': cdr3_lst_converter,
              'cdr3_lst_TRB': cdr3_lst_converter,
              'HLA_lst_mhc': cdr3_lst_converter,'HLA_cd8': HLA_cd8_converter} #

In [5]:
def notnan(x):
    return x == x

In [6]:
def get_multiplets(df):
    #tmp = df[idx1 & idx2]
    dct = df.groupby(['ct','peptide_HLA']).gem.count() > 1
    idx = df.set_index(['ct','peptide_HLA']).index.map(dct)
    return idx.fillna(False)

# Input

In [112]:
CLONOTYPES = '../experiments/exp13/run1_archive/tcr/cellranger_tot/outs/per_sample_outs/cellranger_tot/vdj_t/consensus_annotations.csv'

In [113]:
clone_df = pd.read_csv(CLONOTYPES)

In [114]:
clone_df

Unnamed: 0,clonotype_id,consensus_id,length,chain,v_gene,d_gene,j_gene,c_gene,full_length,productive,...,fwr2_start,fwr2_end,cdr2_start,cdr2_end,fwr3_start,fwr3_end,cdr3_start,cdr3_end,fwr4_start,fwr4_end
0,clonotype1,clonotype1_consensus1,499,TRB,TRBV3-1,,TRBJ1-5,TRBC1,True,True,...,197,248,248,266,266,374,374,419,419,447
1,clonotype1,clonotype1_consensus2,613,TRA,TRAV30,,TRAJ38,TRAC,True,True,...,327,378,378,399,399,498,498,540,540,571
2,clonotype2,clonotype2_consensus1,463,TRB,TRBV28,,TRBJ2-7,TRBC2,True,True,...,170,221,221,239,239,347,347,383,383,411
3,clonotype2,clonotype2_consensus2,668,TRA,TRAV8-6,,TRAJ45,TRAC,True,True,...,373,424,424,448,448,547,547,595,595,626
4,clonotype3,clonotype3_consensus1,576,TRB,TRBV7-9,,TRBJ1-1,TRBC1,True,True,...,265,316,316,334,334,445,445,496,496,524
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10441,clonotype6100,clonotype6100_consensus1,539,TRB,TRBV13,TRBD1,TRBJ1-2,TRBC1,True,True,...,225,276,276,294,294,402,402,459,459,487
10442,clonotype6100,clonotype6100_consensus2,645,TRA,TRAV8-6,,TRAJ39,TRAC,True,True,...,356,407,407,431,431,530,530,572,572,603
10443,clonotype6101,clonotype6101_consensus1,757,TRB,TRBV13,,TRBJ2-4,TRBC2,True,True,...,443,494,494,512,512,620,620,677,677,705
10444,clonotype6101,clonotype6101_consensus2,612,TRA,TRAV41,,TRAJ45,TRAC,True,True,...,320,371,371,386,386,485,485,539,539,570


In [7]:
VALID = '../experiments/exp13/run1_archive/cat/eval_clonotypes/valid_ct.csv'
#OS2 = '../experiments/exp13/run2/cat/eval_clonotypes/valid_ct.csv'

# Load

In [8]:
df = pd.read_csv(VALID, converters=converters)

In [9]:
df.fillna({'umi_count_mhc':0, 'delta_umi_mhc':0, 'umi_count_mhc_rel':0,
           'umi_count_cd8':0, 'delta_umi_cd8':0,
           'umi_count_TRA':0, 'delta_umi_TRA':0,
           'umi_count_TRB':0, 'delta_umi_TRB':0,
           'cdr3_TRA':'','cdr3_TRB':''}, inplace=True)

# Clone df

In [141]:
clone_df['genes'] = clone_df.replace([None], ['']).fillna('').apply(lambda x: ';'.join(x[['v_gene','j_gene','cdr3']]), axis=1)

In [142]:
clone_df.loc[clone_df.clonotype_id.isin(['clonotype9','clonotype5','clonotype29','clonotype191','clonotype4525']),
             ['clonotype_id','length','chain','full_length','productive','umis','genes']]

Unnamed: 0,clonotype_id,length,chain,full_length,productive,umis,genes
8,clonotype5,520,TRB,True,True,1281,TRBV12-4;TRBJ2-1;CASTTGTSGRDYNEQFF
9,clonotype5,751,TRA,True,True,417,TRAV8-3;TRAJ9;CAVVVRNTGGFKTIF
16,clonotype9,493,TRB,True,True,1270,TRBV7-2;TRBJ2-1;CASSFFSNEQFF
17,clonotype9,589,TRA,True,True,161,TRAV17;TRAJ35;CATVPGIGFGNVLHC
18,clonotype9,627,TRA,True,True,126,TRAV12-1;TRAJ29;CVVNMGTPLVF
58,clonotype29,565,TRB,True,True,374,TRBV7-9;TRBJ2-1;CASSLIGESGRNEQFF
59,clonotype29,589,TRA,True,True,99,TRAV17;TRAJ12;CATVNRMDSSYKLIF
417,clonotype191,485,TRB,True,True,9,TRBV27;TRBJ1-2;CASSLSYSTGNYGYTF
418,clonotype191,510,TRB,True,True,33,TRBV7-9;TRBJ2-7;CASSQNRDSRPYEQYF
419,clonotype191,506,TRA,True,True,7,TRAV38-2/DV8;TRAJ42;CAYRSTLNYGGSQGNLIF


In [143]:
clone_a = clone_df[clone_df.chain == 'TRA'].copy()
clone_b = clone_df[clone_df.chain == 'TRB'].copy()

In [144]:
# df index: clonotypes, 2 columns: genes_TRA & genes_TRB
clone1 = pd.merge(clone_a.groupby('clonotype_id').genes.unique().to_frame(),
                   clone_b.groupby('clonotype_id').genes.unique().to_frame(),
                   how='outer', left_index=True, right_index=True, suffixes=['_TRA','_TRB'])
clone1

Unnamed: 0_level_0,genes_TRA,genes_TRB
clonotype_id,Unnamed: 1_level_1,Unnamed: 2_level_1
clonotype1,[TRAV30;TRAJ38;CGTEGAGNNRKLIW],[TRBV3-1;TRBJ1-5;CASSQVFGTPTPQHF]
clonotype10,[TRAV24;TRAJ37;CACSSSNTGKLIF],[TRBV4-1;TRBJ1-2;CASSQDRLTGGYTF]
clonotype100,[TRAV26-1;TRAJ27;CIVTTNTNAGKSTF],[TRBV7-2;TRBJ1-2;CASSFLTGGNRDYGYTF]
clonotype1000,,[TRBV29-1;TRBJ2-3;CSVVGQEHTDTQYF]
clonotype1001,,[TRBV29-1;TRBJ2-7;CSFQEGGSSYEQYF]
...,...,...
clonotype995,,[TRBV29-1;TRBJ1-3;CSAETGPSGNTIYF]
clonotype996,,[TRBV29-1;TRBJ2-1;CSVRTSGDYNEQFF]
clonotype997,,[TRBV29-1;TRBJ1-1;CSVDDRQGNTEAFF]
clonotype998,,[TRBV29-1;TRBJ2-1;CSVSGTSLYNEQFF]


In [145]:
clone1 = clone1.explode('genes_TRA').explode('genes_TRB')

In [146]:
clone1.loc['clonotype9']

Unnamed: 0_level_0,genes_TRA,genes_TRB
clonotype_id,Unnamed: 1_level_1,Unnamed: 2_level_1
clonotype9,TRAV17;TRAJ35;CATVPGIGFGNVLHC,TRBV7-2;TRBJ2-1;CASSFFSNEQFF
clonotype9,TRAV12-1;TRAJ29;CVVNMGTPLVF,TRBV7-2;TRBJ2-1;CASSFFSNEQFF


In [147]:
clone1[~clone1.index.duplicated(keep=False) & clone1.genes_TRA.isna()]

Unnamed: 0_level_0,genes_TRA,genes_TRB
clonotype_id,Unnamed: 1_level_1,Unnamed: 2_level_1
clonotype1000,,TRBV29-1;TRBJ2-3;CSVVGQEHTDTQYF
clonotype1001,,TRBV29-1;TRBJ2-7;CSFQEGGSSYEQYF
clonotype1002,,TRBV20-1;TRBJ2-1;CSATYSGENNEQFF
clonotype1003,,TRBV20-1;TRBJ2-1;CSATYSGENNEQFF
clonotype1004,,TRBV20-1;TRBJ2-1;CSATYSGENNEQFF
...,...,...
clonotype995,,TRBV29-1;TRBJ1-3;CSAETGPSGNTIYF
clonotype996,,TRBV29-1;TRBJ2-1;CSVRTSGDYNEQFF
clonotype997,,TRBV29-1;TRBJ1-1;CSVDDRQGNTEAFF
clonotype998,,TRBV29-1;TRBJ2-1;CSVSGTSLYNEQFF


In [148]:
clone1['chains'] = clone1.genes_TRA.fillna('') + '|' + clone1.genes_TRB.fillna('')

In [149]:
clone1['clonotype'] = clone1.index.str.strip('clonotype').astype(int)

In [158]:
clone1.sort_values(by='clonotype', inplace=True)

# Only on most abundant chain

In [10]:
df.num_clonotype = df.num_clonotype.astype(int)

In [11]:
#df.num_clonotype = 'c' + df.num_clonotype.astype(int).astype(str)

In [12]:
df['chain_a'] = df.v_gene_TRA.fillna('') + ";" + df.j_gene_TRA.fillna('') + ";" + df.cdr3_TRA.fillna('')
df['chain_b'] = df.v_gene_TRB.fillna('') + ";" + df.j_gene_TRB.fillna('') + ";" + df.cdr3_TRB.fillna('')
df['chains'] = df.chain_a + '|' + df.chain_b

In [332]:
#df[df.genes_TRB.isna()].dropna(subset=['genes_TRA']).groupby(['gem','chain_a','chain_b','chains']).size().reset_index().rename(columns={'gem':'clonotype',0:'gem'})

In [333]:
#df[df.genes_TRA.isna()].dropna(subset=['genes_TRB']).groupby(['gem','chain_a','chain_b','chains']).size().reset_index().rename(columns={'gem':'clonotype',0:'gem'})

In [257]:
#c00 = (df
#       .dropna(subset=['genes_TRA','genes_TRB'])
#       .groupby(['gem','chain_a','chain_b','chains'])
#       .size().reset_index().rename(columns={'gem':'clonotype',0:'gem'}))
#c00

In [258]:
#c00 = df[df.num_clonotype=='c0'].groupby(['num_clonotype','chain_a','chain_b','chains']).gem.size().reset_index().rename(columns={'num_clonotype':'clonotype'})
#c00

In [46]:
#c10 = pd.concat([c10,c00], ignore_index=True)

In [47]:
def get_alpha_pairs(row):
    return ((c10.chain_a.isin([row.chain_a]) &
             (c10.clonotype != row.clonotype)).astype(int).to_list() +
            [int(any(c00.chain_a.isin([row.chain_a])))])

In [48]:
def get_beta_pairs(row):
    return ((c10.chain_b.isin([row.chain_b]) &
             (c10.clonotype != row.clonotype)).astype(int).to_list() +
            [int(any(c00.chain_b.isin([row.chain_b])))])

In [49]:
def get_pairs(row):
    return ((c10.chains.isin([row.chains]) &
             (c10.clonotype != row.clonotype)).astype(int).to_list() +
            [int(any(c00.chains.isin([row.chains])))])

In [72]:
def get_pairs(row, ref=c10, var='chains', impute=False):
    # Given data only
    matches = ref[ref[var].isin([row[var]])].clonotype.to_list()
    if (var == 'chains') & (len(matches) == 1) & impute:
        # Only unique matches when imputing c0 with missing chain
        return matches[0]
    elif (var == 'chains') & (len(matches) > 1): # reversed the expression (len(matches) == 1)
        # Len can be one when removing replicas from the reference set
        return matches # returns a list
    elif (var != 'chains') & (len(matches) == 1):
        # we are only interested in the unique matches
        return matches[0] # returns the match
    return np.nan
    #return ((c10.chains.isin([row.chains]) &
    #         (c10.clonotype != row.clonotype)).astype(int).to_list())

### Merge c10 both chains

In [167]:
# Conversion table of clonotypes.
# Keys: "old" clonotype ID
# Vals: convert old cID to this clonotype ID
clonotype_dct = dict()

In [30]:
c10 = df[df.num_clonotype!=0].dropna(subset=['genes_TRA','genes_TRB']).groupby(['num_clonotype','chain_a','chain_b','chains']).gem.size().reset_index().drop_duplicates(subset='num_clonotype', keep='last').rename(columns={'num_clonotype':'clonotype'})
c10#.head(60)

Unnamed: 0,clonotype,chain_a,chain_b,chains,gem
0,1,TRAV30;TRAJ38;CGTEGAGNNRKLIW,TRBV3-1;TRBJ1-5;CASSQVFGTPTPQHF,TRAV30;TRAJ38;CGTEGAGNNRKLIW|TRBV3-1;TRBJ1-5;C...,255
1,2,TRAV8-6;TRAJ45;CAVSDRSGGGADGLTF,TRBV28;TRBJ2-7;CASSYVGYEQYF,TRAV8-6;TRAJ45;CAVSDRSGGGADGLTF|TRBV28;TRBJ2-7...,54
2,3,TRAV14/DV4;TRAJ12;CAMREGMDSSYKLIF,TRBV7-9;TRBJ1-1;CASSSHDRTGVRTEAFF,TRAV14/DV4;TRAJ12;CAMREGMDSSYKLIF|TRBV7-9;TRBJ...,174
3,4,TRAV1-1;TRAJ10;CAVRAITGGGNKLTF,TRBV5-1;TRBJ2-5;CASSTPSSGPQETQYF,TRAV1-1;TRAJ10;CAVRAITGGGNKLTF|TRBV5-1;TRBJ2-5...,157
4,5,TRAV8-3;TRAJ9;CAVVVRNTGGFKTIF,TRBV12-4;TRBJ2-1;CASTTGTSGRDYNEQFF,TRAV8-3;TRAJ9;CAVVVRNTGGFKTIF|TRBV12-4;TRBJ2-1...,124
...,...,...,...,...,...
1909,6087,TRAV13-2;TRAJ41;CAENSNSGYALNF,TRBV13;TRBJ1-5;CASSLGTGTGNQPQHF,TRAV13-2;TRAJ41;CAENSNSGYALNF|TRBV13;TRBJ1-5;C...,1
1910,6088,TRAV38-1;TRAJ20;CAFRGPNDYKLSF,TRBV13;TRBJ1-6;CASSPKGTDGNSPLHF,TRAV38-1;TRAJ20;CAFRGPNDYKLSF|TRBV13;TRBJ1-6;C...,1
1911,6090,TRAV14/DV4;TRAJ21;CAMREGLGNFNKFYF,TRBV13;TRBJ2-4;CASSTGGAAAKNIQYF,TRAV14/DV4;TRAJ21;CAMREGLGNFNKFYF|TRBV13;TRBJ2...,1
1912,6091,TRAV14/DV4;TRAJ44;CAMREGWGGTASKLTF,TRBV13;TRBJ2-2;CASSLRGAANTGELFF,TRAV14/DV4;TRAJ44;CAMREGWGGTASKLTF|TRBV13;TRBJ...,1


### Merge 10x clonotypes

In [168]:
query = clone1.sort_values(by='clonotype').dropna(subset=['genes_TRA','genes_TRB']).copy()
query['pairs'] = query.apply(lambda row: get_pairs(row, ref=clone1), axis=1)
dct = query.dropna().drop_duplicates(subset='chains').set_index('clonotype').apply(lambda row: row.pairs[1:], axis=1).to_dict()
dct = {i:x for x,y in dct.items() for i in y}
clonotype_dct.update(dct)
query[query.clonotype.isin([71, 2707, 9, 191])].dropna(subset=['pairs'])

Unnamed: 0_level_0,genes_TRA,genes_TRB,chains,clonotype,pairs
clonotype_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
clonotype71,TRAV2;TRAJ20;CAVEADDYKLSF,TRBV30;TRBJ2-3;CAWSLERDRLKDTQYF,TRAV2;TRAJ20;CAVEADDYKLSF|TRBV30;TRBJ2-3;CAWSL...,71,"[71, 2707]"
clonotype71,TRAV3;TRAJ18;CAVRDLARGSTLGRLYF,TRBV30;TRBJ2-3;CAWSLERDRLKDTQYF,TRAV3;TRAJ18;CAVRDLARGSTLGRLYF|TRBV30;TRBJ2-3;...,71,"[71, 2707]"
clonotype2707,TRAV3;TRAJ18;CAVRDLARGSTLGRLYF,TRBV30;TRBJ2-3;CAWSLERDRLKDTQYF,TRAV3;TRAJ18;CAVRDLARGSTLGRLYF|TRBV30;TRBJ2-3;...,2707,"[71, 2707]"
clonotype2707,TRAV2;TRAJ20;CAVEADDYKLSF,TRBV30;TRBJ2-3;CAWSLERDRLKDTQYF,TRAV2;TRAJ20;CAVEADDYKLSF|TRBV30;TRBJ2-3;CAWSL...,2707,"[71, 2707]"


In [182]:
query = df.loc[df.num_clonotype!=0, ['gem','num_clonotype','chain_a','chain_b','chains','genes_TRA','genes_TRB']].dropna(subset=['genes_TRA','genes_TRB']).rename(columns={'num_clonotype':'clonotype'})
query['pairs'] = query.apply(lambda row: get_pairs(row, ref=clone1), axis=1)
query[query.clonotype.isin([71, 2707, 9, 191])].dropna()

Unnamed: 0,gem,clonotype,chain_a,chain_b,chains,genes_TRA,genes_TRB,pairs
719,AGGCCGTAGGAATGGA-1,71,TRAV3;TRAJ18;CAVRDLARGSTLGRLYF,TRBV30;TRBJ2-3;CAWSLERDRLKDTQYF,TRAV3;TRAJ18;CAVRDLARGSTLGRLYF|TRBV30;TRBJ2-3;...,TRAV3;TRAJ18;TRAC,TRBV30;;TRBJ2-3;TRBC2,"[71, 2707]"
770,AGGTCCGCAAACTGCT-1,71,TRAV3;TRAJ18;CAVRDLARGSTLGRLYF,TRBV30;TRBJ2-3;CAWSLERDRLKDTQYF,TRAV3;TRAJ18;CAVRDLARGSTLGRLYF|TRBV30;TRBJ2-3;...,TRAV3;TRAJ18;TRAC,TRBV30;;TRBJ2-3;TRBC2,"[71, 2707]"
810,AGTGGGACACCCTATC-1,71,TRAV3;TRAJ18;CAVRDLARGSTLGRLYF,TRBV30;TRBJ2-3;CAWSLERDRLKDTQYF,TRAV3;TRAJ18;CAVRDLARGSTLGRLYF|TRBV30;TRBJ2-3;...,TRAV3;TRAJ18;TRAC,TRBV30;;TRBJ2-3;TRBC2,"[71, 2707]"
1067,CAACCTCTCTTTAGTC-1,71,TRAV2;TRAJ20;CAVEADDYKLSF,TRBV30;TRBJ2-3;CAWSLERDRLKDTQYF,TRAV2;TRAJ20;CAVEADDYKLSF|TRBV30;TRBJ2-3;CAWSL...,TRAV2;TRAJ20;TRAC,TRBV30;;TRBJ2-3;TRBC2,"[71, 2707]"
1334,CAGTAACGTATAGTAG-1,71,TRAV3;TRAJ18;CAVRDLARGSTLGRLYF,TRBV30;TRBJ2-3;CAWSLERDRLKDTQYF,TRAV3;TRAJ18;CAVRDLARGSTLGRLYF|TRBV30;TRBJ2-3;...,TRAV3;TRAJ18;TRAC,TRBV30;;TRBJ2-3;TRBC2,"[71, 2707]"
1700,CCTTACGTCCCATTTA-1,71,TRAV2;TRAJ20;CAVEADDYKLSF,TRBV30;TRBJ2-3;CAWSLERDRLKDTQYF,TRAV2;TRAJ20;CAVEADDYKLSF|TRBV30;TRBJ2-3;CAWSL...,TRAV2;TRAJ20;TRAC,TRBV30;;TRBJ2-3;TRBC2,"[71, 2707]"
2743,GATCAGTAGTCACGCC-1,2707,TRAV3;TRAJ18;CAVRDLARGSTLGRLYF,TRBV30;TRBJ2-3;CAWSLERDRLKDTQYF,TRAV3;TRAJ18;CAVRDLARGSTLGRLYF|TRBV30;TRBJ2-3;...,TRAV3;TRAJ18;TRAC,TRBV30;;TRBJ2-3;TRBC2,"[71, 2707]"
3990,TCAATCTTCAGAGACG-1,71,TRAV3;TRAJ18;CAVRDLARGSTLGRLYF,TRBV30;TRBJ2-3;CAWSLERDRLKDTQYF,TRAV3;TRAJ18;CAVRDLARGSTLGRLYF|TRBV30;TRBJ2-3;...,TRAV3;TRAJ18;TRAC,TRBV30;;TRBJ2-3;TRBC2,"[71, 2707]"
4198,TCTGAGATCGGGAGTA-1,71,TRAV3;TRAJ18;CAVRDLARGSTLGRLYF,TRBV30;TRBJ2-3;CAWSLERDRLKDTQYF,TRAV3;TRAJ18;CAVRDLARGSTLGRLYF|TRBV30;TRBJ2-3;...,TRAV3;TRAJ18;TRAC,TRBV30;;TRBJ2-3;TRBC2,"[71, 2707]"
4700,TTGAACGCACACGCTG-1,71,TRAV3;TRAJ18;CAVRDLARGSTLGRLYF,TRBV30;TRBJ2-3;CAWSLERDRLKDTQYF,TRAV3;TRAJ18;CAVRDLARGSTLGRLYF|TRBV30;TRBJ2-3;...,TRAV3;TRAJ18;TRAC,TRBV30;;TRBJ2-3;TRBC2,"[71, 2707]"


In [183]:
df[df.num_clonotype.isin([71, 2707])]

Unnamed: 0,gem,clonotype,num_clonotype,ct,genes_TRA,genes_TRB,genes_lst_TRA,genes_lst_TRB,length_TRA,cdr1_TRA,...,sample_hla,pep_match,hla_match,ct_match,valid_ct,train_label,test_label,chain_a,chain_b,chains
719,AGGCCGTAGGAATGGA-1,clonotype71,71,71.0,TRAV3;TRAJ18;TRAC,TRBV30;;TRBJ2-3;TRBC2,['TRAV3;TRAJ18;TRAC'],['TRBV30;;TRBJ2-3;TRBC2'],627.0,VSGNPY,...,"['A0201', 'A0301']",True,True,True,True,True,True,TRAV3;TRAJ18;CAVRDLARGSTLGRLYF,TRBV30;TRBJ2-3;CAWSLERDRLKDTQYF,TRAV3;TRAJ18;CAVRDLARGSTLGRLYF|TRBV30;TRBJ2-3;...
770,AGGTCCGCAAACTGCT-1,clonotype71,71,71.0,TRAV3;TRAJ18;TRAC,TRBV30;;TRBJ2-3;TRBC2,['TRAV3;TRAJ18;TRAC'],['TRBV30;;TRBJ2-3;TRBC2'],627.0,VSGNPY,...,"['A0201', 'A0301']",True,True,True,True,True,True,TRAV3;TRAJ18;CAVRDLARGSTLGRLYF,TRBV30;TRBJ2-3;CAWSLERDRLKDTQYF,TRAV3;TRAJ18;CAVRDLARGSTLGRLYF|TRBV30;TRBJ2-3;...
810,AGTGGGACACCCTATC-1,clonotype71,71,71.0,TRAV3;TRAJ18;TRAC,TRBV30;;TRBJ2-3;TRBC2,['TRAV3;TRAJ18;TRAC'],['TRBV30;;TRBJ2-3;TRBC2'],627.0,VSGNPY,...,"['A0201', 'A0301']",False,False,True,True,False,False,TRAV3;TRAJ18;CAVRDLARGSTLGRLYF,TRBV30;TRBJ2-3;CAWSLERDRLKDTQYF,TRAV3;TRAJ18;CAVRDLARGSTLGRLYF|TRBV30;TRBJ2-3;...
1067,CAACCTCTCTTTAGTC-1,clonotype71,71,71.0,TRAV2;TRAJ20;TRAC,TRBV30;;TRBJ2-3;TRBC2,['TRAV3;TRAJ18;TRAC' 'TRAV2;TRAJ20;TRAC'],['TRBV30;;TRBJ2-3;TRBC2'],518.0,VSNAYN,...,"['A0201', 'A0301']",True,True,True,True,True,True,TRAV2;TRAJ20;CAVEADDYKLSF,TRBV30;TRBJ2-3;CAWSLERDRLKDTQYF,TRAV2;TRAJ20;CAVEADDYKLSF|TRBV30;TRBJ2-3;CAWSL...
1334,CAGTAACGTATAGTAG-1,clonotype71,71,71.0,TRAV3;TRAJ18;TRAC,TRBV30;;TRBJ2-3;TRBC2,['TRAV2;TRAJ20;TRAC' 'TRAV3;TRAJ18;TRAC'],['TRBV30;;TRBJ2-3;TRBC2'],627.0,VSGNPY,...,"['A0201', 'A0301']",True,True,True,True,True,True,TRAV3;TRAJ18;CAVRDLARGSTLGRLYF,TRBV30;TRBJ2-3;CAWSLERDRLKDTQYF,TRAV3;TRAJ18;CAVRDLARGSTLGRLYF|TRBV30;TRBJ2-3;...
1700,CCTTACGTCCCATTTA-1,clonotype71,71,71.0,TRAV2;TRAJ20;TRAC,TRBV30;;TRBJ2-3;TRBC2,['TRAV2;TRAJ20;TRAC'],['TRBV30;;TRBJ2-3;TRBC2'],511.0,VSNAYN,...,"['A0201', 'A0301']",True,True,True,True,True,True,TRAV2;TRAJ20;CAVEADDYKLSF,TRBV30;TRBJ2-3;CAWSLERDRLKDTQYF,TRAV2;TRAJ20;CAVEADDYKLSF|TRBV30;TRBJ2-3;CAWSL...
2743,GATCAGTAGTCACGCC-1,clonotype2707,2707,2707.0,TRAV3;TRAJ18;TRAC,TRBV30;;TRBJ2-3;TRBC2,['TRAV2;TRAJ20;TRAC' 'TRAV3;TRAJ18;TRAC'],['TRBV7-6;TRBD1;TRBJ1-2;TRBC1' 'TRBV30;;TRBJ2-...,627.0,VSGNPY,...,,,,,False,,True,TRAV3;TRAJ18;CAVRDLARGSTLGRLYF,TRBV30;TRBJ2-3;CAWSLERDRLKDTQYF,TRAV3;TRAJ18;CAVRDLARGSTLGRLYF|TRBV30;TRBJ2-3;...
3990,TCAATCTTCAGAGACG-1,clonotype71,71,71.0,TRAV3;TRAJ18;TRAC,TRBV30;;TRBJ2-3;TRBC2,['TRAV3;TRAJ18;TRAC'],['TRBV30;;TRBJ2-3;TRBC2'],627.0,VSGNPY,...,"['A0201', 'A0301']",True,True,True,True,True,True,TRAV3;TRAJ18;CAVRDLARGSTLGRLYF,TRBV30;TRBJ2-3;CAWSLERDRLKDTQYF,TRAV3;TRAJ18;CAVRDLARGSTLGRLYF|TRBV30;TRBJ2-3;...
4198,TCTGAGATCGGGAGTA-1,clonotype71,71,71.0,TRAV3;TRAJ18;TRAC,TRBV30;;TRBJ2-3;TRBC2,['TRAV2;TRAJ20;TRAC' 'TRAV3;TRAJ18;TRAC'],['TRBV30;;TRBJ2-3;TRBC2'],627.0,VSGNPY,...,"['A0201', 'A0301']",False,True,True,True,False,True,TRAV3;TRAJ18;CAVRDLARGSTLGRLYF,TRBV30;TRBJ2-3;CAWSLERDRLKDTQYF,TRAV3;TRAJ18;CAVRDLARGSTLGRLYF|TRBV30;TRBJ2-3;...
4700,TTGAACGCACACGCTG-1,clonotype71,71,71.0,TRAV3;TRAJ18;TRAC,TRBV30;;TRBJ2-3;TRBC2,['TRAV3;TRAJ18;TRAC'],['TRBV30;;TRBJ2-3;TRBC2'],627.0,VSGNPY,...,"['A0201', 'A0301']",True,True,True,True,True,True,TRAV3;TRAJ18;CAVRDLARGSTLGRLYF,TRBV30;TRBJ2-3;CAWSLERDRLKDTQYF,TRAV3;TRAJ18;CAVRDLARGSTLGRLYF|TRBV30;TRBJ2-3;...


In [31]:
#query = c10.copy()
#query['pairs'] = query.apply(lambda row: get_pairs(row, ref=c10), axis=1)
#dct = query.dropna().drop_duplicates(subset='chains').set_index('clonotype').apply(lambda row: row.pairs[1:], axis=1).to_dict()
#dct = {i:x for x,y in dct.items() for i in y}
#clonotype_dct.update(dct)
#query.dropna(subset=['pairs'])

Unnamed: 0,clonotype,chain_a,chain_b,chains,gem,pairs
56,54,TRAV23/DV6;TRAJ48;CAASIGNFGNEKLTF,TRBV4-3;TRBJ1-1;CASSPSRNTEAFF,TRAV23/DV6;TRAJ48;CAASIGNFGNEKLTF|TRBV4-3;TRBJ...,16,"[54, 364, 2118, 2123, 2127, 2128]"
72,71,TRAV3;TRAJ18;CAVRDLARGSTLGRLYF,TRBV30;TRBJ2-3;CAWSLERDRLKDTQYF,TRAV3;TRAJ18;CAVRDLARGSTLGRLYF|TRBV30;TRBJ2-3;...,7,"[71, 2707]"
139,136,TRAV23/DV6;TRAJ48;CAASIGNFGNEKLTF,TRBV4-3;TRBJ1-1;CASSPARNTEAFF,TRAV23/DV6;TRAJ48;CAASIGNFGNEKLTF|TRBV4-3;TRBJ...,5,"[136, 215, 2120]"
217,215,TRAV23/DV6;TRAJ48;CAASIGNFGNEKLTF,TRBV4-3;TRBJ1-1;CASSPARNTEAFF,TRAV23/DV6;TRAJ48;CAASIGNFGNEKLTF|TRBV4-3;TRBJ...,3,"[136, 215, 2120]"
332,364,TRAV23/DV6;TRAJ48;CAASIGNFGNEKLTF,TRBV4-3;TRBJ1-1;CASSPSRNTEAFF,TRAV23/DV6;TRAJ48;CAASIGNFGNEKLTF|TRBV4-3;TRBJ...,1,"[54, 364, 2118, 2123, 2127, 2128]"
333,365,TRAV23/DV6;TRAJ48;CAASIGNFGNEKLTF,TRBV4-3;TRBJ1-1;CASSPNRNTEAFF,TRAV23/DV6;TRAJ48;CAASIGNFGNEKLTF|TRBV4-3;TRBJ...,2,"[365, 2122, 2124]"
432,501,TRAV38-2/DV8;TRAJ28;CAAGSYQLTF,TRBV7-9;TRBJ2-1;CASSGDSETYNEQFF,TRAV38-2/DV8;TRAJ28;CAAGSYQLTF|TRBV7-9;TRBJ2-1...,1,"[501, 2827]"
615,1193,TRAV12-1;TRAJ49;CVVNMGGNQFYF,TRBV30;TRBJ2-5;CAWKPPGDQETQYF,TRAV12-1;TRAJ49;CVVNMGGNQFYF|TRBV30;TRBJ2-5;CA...,1,"[1193, 1194]"
616,1194,TRAV12-1;TRAJ49;CVVNMGGNQFYF,TRBV30;TRBJ2-5;CAWKPPGDQETQYF,TRAV12-1;TRAJ49;CVVNMGGNQFYF|TRBV30;TRBJ2-5;CA...,1,"[1193, 1194]"
629,1218,TRAV13-2;TRAJ39;CAENNAGNMLTF,TRBV30;TRBJ2-3;CAWSVRGRADTQYF,TRAV13-2;TRAJ39;CAENNAGNMLTF|TRBV30;TRBJ2-3;CA...,1,"[1218, 1219]"


In [133]:
#c10['pairs'] = c10.apply(lambda row: get_pairs(row), axis=1)

In [134]:
#c10['keep'] = c10.dropna(subset=['pairs']).apply(lambda row: row.clonotype == row.pairs[0], axis=1)

In [33]:
# Listing the duplicates
#clonotype_dct = c10[c10.keep == True].set_index('clonotype').apply(lambda row: row.pairs[1:], axis=1).to_dict()
# alternative to keep
#query[query.duplicated(subset=['chains'])].set_index('clonotype').apply(lambda row: row.pairs[1:], axis=1).to_dict()
#query.dropna().drop_duplicates(subset='chains').set_index('clonotype').apply(lambda row: row.pairs[1:], axis=1).to_dict()

In [177]:
clone1[clone1.clonotype.isin([71,2707])]#.drop_duplicates(subset=['chains']).head(60)

Unnamed: 0_level_0,genes_TRA,genes_TRB,chains,clonotype
clonotype_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
clonotype71,TRAV2;TRAJ20;CAVEADDYKLSF,TRBV30;TRBJ2-3;CAWSLERDRLKDTQYF,TRAV2;TRAJ20;CAVEADDYKLSF|TRBV30;TRBJ2-3;CAWSL...,71
clonotype71,TRAV3;TRAJ18;CAVRDLARGSTLGRLYF,TRBV30;TRBJ2-3;CAWSLERDRLKDTQYF,TRAV3;TRAJ18;CAVRDLARGSTLGRLYF|TRBV30;TRBJ2-3;...,71
clonotype2707,TRAV3;TRAJ18;CAVRDLARGSTLGRLYF,TRBV30;TRBJ2-3;CAWSLERDRLKDTQYF,TRAV3;TRAJ18;CAVRDLARGSTLGRLYF|TRBV30;TRBJ2-3;...,2707
clonotype2707,TRAV3;TRAJ18;CAVRDLARGSTLGRLYF,TRBV7-6;TRBJ1-2;CASSLEGTGGDGYTF,TRAV3;TRAJ18;CAVRDLARGSTLGRLYF|TRBV7-6;TRBJ1-2...,2707
clonotype2707,TRAV2;TRAJ20;CAVEADDYKLSF,TRBV30;TRBJ2-3;CAWSLERDRLKDTQYF,TRAV2;TRAJ20;CAVEADDYKLSF|TRBV30;TRBJ2-3;CAWSL...,2707
clonotype2707,TRAV2;TRAJ20;CAVEADDYKLSF,TRBV7-6;TRBJ1-2;CASSLEGTGGDGYTF,TRAV2;TRAJ20;CAVEADDYKLSF|TRBV7-6;TRBJ1-2;CASS...,2707


In [34]:
c10.drop_duplicates(subset=['chains'], inplace=True) # New updated reference

In [37]:
c10

Unnamed: 0,clonotype,chain_a,chain_b,chains,gem
0,1,TRAV30;TRAJ38;CGTEGAGNNRKLIW,TRBV3-1;TRBJ1-5;CASSQVFGTPTPQHF,TRAV30;TRAJ38;CGTEGAGNNRKLIW|TRBV3-1;TRBJ1-5;C...,255
1,2,TRAV8-6;TRAJ45;CAVSDRSGGGADGLTF,TRBV28;TRBJ2-7;CASSYVGYEQYF,TRAV8-6;TRAJ45;CAVSDRSGGGADGLTF|TRBV28;TRBJ2-7...,54
2,3,TRAV14/DV4;TRAJ12;CAMREGMDSSYKLIF,TRBV7-9;TRBJ1-1;CASSSHDRTGVRTEAFF,TRAV14/DV4;TRAJ12;CAMREGMDSSYKLIF|TRBV7-9;TRBJ...,174
3,4,TRAV1-1;TRAJ10;CAVRAITGGGNKLTF,TRBV5-1;TRBJ2-5;CASSTPSSGPQETQYF,TRAV1-1;TRAJ10;CAVRAITGGGNKLTF|TRBV5-1;TRBJ2-5...,157
4,5,TRAV8-3;TRAJ9;CAVVVRNTGGFKTIF,TRBV12-4;TRBJ2-1;CASTTGTSGRDYNEQFF,TRAV8-3;TRAJ9;CAVVVRNTGGFKTIF|TRBV12-4;TRBJ2-1...,124
...,...,...,...,...,...
1909,6087,TRAV13-2;TRAJ41;CAENSNSGYALNF,TRBV13;TRBJ1-5;CASSLGTGTGNQPQHF,TRAV13-2;TRAJ41;CAENSNSGYALNF|TRBV13;TRBJ1-5;C...,1
1910,6088,TRAV38-1;TRAJ20;CAFRGPNDYKLSF,TRBV13;TRBJ1-6;CASSPKGTDGNSPLHF,TRAV38-1;TRAJ20;CAFRGPNDYKLSF|TRBV13;TRBJ1-6;C...,1
1911,6090,TRAV14/DV4;TRAJ21;CAMREGLGNFNKFYF,TRBV13;TRBJ2-4;CASSTGGAAAKNIQYF,TRAV14/DV4;TRAJ21;CAMREGLGNFNKFYF|TRBV13;TRBJ2...,1
1912,6091,TRAV14/DV4;TRAJ44;CAMREGWGGTASKLTF,TRBV13;TRBJ2-2;CASSLRGAANTGELFF,TRAV14/DV4;TRAJ44;CAMREGWGGTASKLTF|TRBV13;TRBJ...,1


In [111]:
df.loc[df.num_clonotype.isin(c10.clonotype) & (df.tcr_category == 'multiple chains'),
       ['num_clonotype','ct','genes_lst_TRA','genes_lst_TRB','cdr3_TRA','cdr3_TRB','umi_count_lst_TRA','umi_count_lst_TRB']].sort_values(by='ct')

Unnamed: 0,num_clonotype,ct,genes_lst_TRA,genes_lst_TRB,cdr3_TRA,cdr3_TRB,umi_count_lst_TRA,umi_count_lst_TRB
2614,9,9.0,['TRAV17;TRAJ35;TRAC' 'TRAV12-1;TRAJ29;TRAC'],['TRBV7-2;;TRBJ2-1;TRBC2'],CVVNMGTPLVF,CASSFFSNEQFF,"[3, 4]",[25]
1318,9,9.0,['TRAV17;TRAJ35;TRAC' 'TRAV12-1;TRAJ29;TRAC'],['TRBV7-2;;TRBJ2-1;TRBC2'],CVVNMGTPLVF,CASSFFSNEQFF,"[2, 4]",[10]
1553,9,9.0,['TRAV17;TRAJ35;TRAC' 'TRAV12-1;TRAJ29;TRAC'],['TRBV7-2;;TRBJ2-1;TRBC2'],CVVNMGTPLVF,CASSFFSNEQFF,"[2, 2]",[5]
3353,9,9.0,['TRAV17;TRAJ35;TRAC' 'TRAV12-1;TRAJ29;TRAC'],['TRBV7-2;;TRBJ2-1;TRBC2'],CVVNMGTPLVF,CASSFFSNEQFF,"[1, 2]",[10]
4253,9,9.0,['TRAV12-1;TRAJ29;TRAC' 'TRAV17;TRAJ35;TRAC'],['TRBV7-2;;TRBJ2-1;TRBC2'],CATVPGIGFGNVLHC,CASSFFSNEQFF,"[2, 6]",[17]
...,...,...,...,...,...,...,...,...
4698,5864,5864.0,['TRAV8-2;TRAJ38;TRAC' 'TRAV12-1;TRAJ24;TRAC'],['TRBV19;;TRBJ1-4;TRBC1'],CVVLDGWGKFQF,CASSIPDWTESAGNEKLFF,"[2, 3]",[7]
1182,5867,5867.0,['TRAV25;TRAJ16;TRAC' 'TRAV3;TRAJ20;TRAC'],['TRBV7-8;;TRBJ2-1;TRBC2'],CAVKVRDDYKLSF,CASSLGAWGLMSYNEQFF,"[2, 6]",[28]
1053,5897,5897.0,['TRAV34;TRAJ52;TRAC' 'TRAV29/DV5;TRAJ23;TRAC'],['TRBV3-1;;TRBJ2-2;TRBC2'],CAANPRQGGKLIF,CASSPTSMGYPTNTGELFF,"[4, 5]",[8]
752,6034,6034.0,['TRAV12-2;TRAJ10;TRAC' 'TRAV4;TRAJ30;TRAC'],['TRBV7-7;;TRBJ2-1;TRBC2'],CLVGGRDDKIIF,CASSLSLAGPSSYNEQFF,"[2, 2]",[8]


### Impute c10 alpha
Only unique matches, i.e. only one match in the reference clonotypes!

In [44]:
query = df[(df.num_clonotype!=0) & df.genes_TRB.isna() & ~df.num_clonotype.isin(c10.clonotype)].dropna(subset=['genes_TRA']).groupby(['num_clonotype','chain_a','chain_b','chains']).size().reset_index().rename(columns={'num_clonotype':'clonotype',0:'gem'})
query['pairs'] = query.apply(lambda row: get_pairs(row, var='chain_a'), axis=1)
dct = query.dropna().set_index('clonotype').pairs.astype(int).to_dict()
clonotype_dct.update(dct)
query.dropna()

Unnamed: 0,clonotype,chain_a,chain_b,chains,gem,pairs
3,589,TRAV16;TRAJ18;CALYRLGRLYF,;;,TRAV16;TRAJ18;CALYRLGRLYF|;;,1,23.0
12,622,TRAV26-1;TRAJ20;CIAPDLYDYKLSF,;;,TRAV26-1;TRAJ20;CIAPDLYDYKLSF|;;,1,56.0
13,636,TRAV17;TRAJ43;CATVLDMRF,;;,TRAV17;TRAJ43;CATVLDMRF|;;,1,2609.0
17,662,TRAV12-1;TRAJ11;CVVRYSTLTF,;;,TRAV12-1;TRAJ11;CVVRYSTLTF|;;,1,255.0
18,700,TRAV41;TRAJ49;CALTGNQFYF,;;,TRAV41;TRAJ49;CALTGNQFYF|;;,1,84.0
20,761,TRAV26-1;TRAJ37;CIVRVDSSNTGKLIF,;;,TRAV26-1;TRAJ37;CIVRVDSSNTGKLIF|;;,1,90.0
29,1054,TRAV21;TRAJ26;CAVINYGQNFVF,;;,TRAV21;TRAJ26;CAVINYGQNFVF|;;,1,40.0
30,1055,TRAV21;TRAJ26;CAVINYGQNFVF,;;,TRAV21;TRAJ26;CAVINYGQNFVF|;;,1,40.0
31,1056,TRAV21;TRAJ26;CAVINYGQNFVF,;;,TRAV21;TRAJ26;CAVINYGQNFVF|;;,1,40.0
34,1060,TRAV30;TRAJ47;CGTGEYGNKLVF,;;,TRAV30;TRAJ47;CGTGEYGNKLVF|;;,1,152.0


### Impute c10 beta
Only unique matches, i.e. only one match in the reference clonotypes!

In [49]:
query = df[(df.num_clonotype!=0) & df.genes_TRA.isna() & ~df.num_clonotype.isin(c10.clonotype)].dropna(subset=['genes_TRB']).groupby(['num_clonotype','chain_a','chain_b','chains']).size().reset_index().rename(columns={'num_clonotype':'clonotype',0:'gem'})
query['pairs'] = query.apply(lambda row: get_pairs(row, var='chain_b'), axis=1)
dct = query.dropna().set_index('clonotype').pairs.astype(int).to_dict()
clonotype_dct.update(dct)
query.dropna()

Unnamed: 0,clonotype,chain_a,chain_b,chains,gem,pairs
2,586,;;,TRBV20-1;TRBJ2-3;CSLSGAGYF,;;|TRBV20-1;TRBJ2-3;CSLSGAGYF,1,588.0
4,625,;;,TRBV29-1;TRBJ2-5;CSVPPRETQYF,;;|TRBV29-1;TRBJ2-5;CSVPPRETQYF,1,30.0
7,630,;;,TRBV20-1;TRBJ2-1;CSAIPGNEQFF,;;|TRBV20-1;TRBJ2-1;CSAIPGNEQFF,1,201.0
14,673,;;,TRBV29-1;TRBJ1-2;CSVWDGSLGYTF,;;|TRBV29-1;TRBJ1-2;CSVWDGSLGYTF,1,290.0
18,680,;;,TRBV20-1;TRBJ2-1;CSASRGPDEQFF,;;|TRBV20-1;TRBJ2-1;CSASRGPDEQFF,1,735.0
...,...,...,...,...,...,...
1007,6011,;;,TRBV7-9;TRBJ1-3;CASSYTRQGSSLSFSGNTIYF,;;|TRBV7-9;TRBJ1-3;CASSYTRQGSSLSFSGNTIYF,1,197.0
1008,6012,;;,TRBV7-9;TRBJ1-3;CASSYTRQGSSLSFSGNTIYF,;;|TRBV7-9;TRBJ1-3;CASSYTRQGSSLSFSGNTIYF,1,197.0
1011,6059,;;,TRBV13;TRBJ2-1;CASSLGSSLYNEQFF,;;|TRBV13;TRBJ2-1;CASSLGSSLYNEQFF,1,6065.0
1012,6060,;;,TRBV13;TRBJ2-1;CASSLGSSLYNEQFF,;;|TRBV13;TRBJ2-1;CASSLGSSLYNEQFF,1,6065.0


# Now impute clonotype None

In [51]:
# Convertion table
# GEM to imputed clonotype ID
clonotype_nll = dict()

### Impute c0 beta from 10x

In [52]:
query = df[(df.num_clonotype==0) & df.genes_TRA.isna() & ~df.num_clonotype.isin(c10.clonotype)].dropna(subset=['genes_TRB']).groupby(['gem','chain_a','chain_b','chains']).size().reset_index().rename(columns={'gem':'clonotype',0:'gem'})
query['pairs'] = query.apply(lambda row: get_pairs(row, var='chain_b'), axis=1)
dct = query.dropna().set_index('clonotype').pairs.astype(int).to_dict()
clonotype_nll.update(dct)
query

Unnamed: 0,clonotype,chain_a,chain_b,chains,gem,pairs
0,AAACCTGAGCTGTTCA-1,;;,TRBV11-2;TRBJ2-3;CASSLQTGRTDTQYF,;;|TRBV11-2;TRBJ2-3;CASSLQTGRTDTQYF,1,34.0
1,AAACGGGGTACAGTGG-1,;;,TRBV27;TRBJ1-3;CASSSDIYSGNTIYF,;;|TRBV27;TRBJ1-3;CASSSDIYSGNTIYF,1,
2,AAACGGGGTCCGTGAC-1,;;,TRBV7-9;TRBJ1-6;CASSSHDWGGQGSPLHF,;;|TRBV7-9;TRBJ1-6;CASSSHDWGGQGSPLHF,1,77.0
3,AAACGGGGTCGACTGC-1,;;,TRBV6-1;TRBJ2-5;CASNHEYQETQYF,;;|TRBV6-1;TRBJ2-5;CASNHEYQETQYF,1,
4,AAACGGGTCCAGTAGT-1,;;,TRBV7-9;TRBJ2-1;CASSASGQGSYEQFF,;;|TRBV7-9;TRBJ2-1;CASSASGQGSYEQFF,1,
...,...,...,...,...,...,...
839,TTTGGTTAGAAGGGTA-1,;;,TRBV7-9;TRBJ1-6;CASSTHDSEGALSPLHF,;;|TRBV7-9;TRBJ1-6;CASSTHDSEGALSPLHF,1,13.0
840,TTTGGTTAGTCTTGCA-1,;;,TRBV6-2;TRBJ1-1;CASSYLGRQTNTEAFF,;;|TRBV6-2;TRBJ1-1;CASSYLGRQTNTEAFF,1,
841,TTTGTCAAGAATGTTG-1,;;,TRBV6-1;TRBJ2-7;CASSGAPGRNPFYEQYF,;;|TRBV6-1;TRBJ2-7;CASSGAPGRNPFYEQYF,1,
842,TTTGTCAAGCGTGAAC-1,;;,TRBV20-1;TRBJ2-1;CSASRQGGFGNEQFF,;;|TRBV20-1;TRBJ2-1;CSASRQGGFGNEQFF,1,2011.0


In [55]:
c10[c10.clonotype.isin([34,77])]

Unnamed: 0,clonotype,chain_a,chain_b,chains,gem
36,34,TRAV5;TRAJ10;CAEILTGGGNKLTF,TRBV11-2;TRBJ2-3;CASSLQTGRTDTQYF,TRAV5;TRAJ10;CAEILTGGGNKLTF|TRBV11-2;TRBJ2-3;C...,23
76,77,TRAV8-2;TRAJ8;CVVSGFQKLVF,TRBV7-9;TRBJ1-6;CASSSHDWGGQGSPLHF,TRAV8-2;TRAJ8;CVVSGFQKLVF|TRBV7-9;TRBJ1-6;CASS...,1


### Impute c0 alpha from 10x

In [56]:
query = df[(df.num_clonotype==0) & df.genes_TRB.isna() & ~df.num_clonotype.isin(c10.clonotype)].dropna(subset=['genes_TRA']).groupby(['gem','chain_a','chain_b','chains']).size().reset_index().rename(columns={'gem':'clonotype',0:'gem'})
query['pairs'] = query.apply(lambda row: get_pairs(row, var='chain_a'), axis=1)
dct = query.dropna().set_index('clonotype').pairs.astype(int).to_dict()
clonotype_nll.update(dct)
query

Unnamed: 0,clonotype,chain_a,chain_b,chains,gem,pairs
0,AAACGGGCAGGCGATA-1,TRAV14/DV4;TRAJ12;CAMREGMDSSYKLIF,;;,TRAV14/DV4;TRAJ12;CAMREGMDSSYKLIF|;;,1,3.0
1,AAACGGGGTCATGCCG-1,TRAV29/DV5;TRAJ57;CAAKTQGGSEKLVF,;;,TRAV29/DV5;TRAJ57;CAAKTQGGSEKLVF|;;,1,
2,AAAGATGCATCATCCC-1,TRAV12-2;TRAJ47;CAVNSISGYGNKLVF,;;,TRAV12-2;TRAJ47;CAVNSISGYGNKLVF|;;,1,
3,AAAGCAAAGCTGTCTA-1,TRAV19;TRAJ35;CALSASKGGFGNVLHC,;;,TRAV19;TRAJ35;CALSASKGGFGNVLHC|;;,1,
4,AACACGTCATCACGAT-1,TRAV24;TRAJ22;CARGAAGSARQLTF,;;,TRAV24;TRAJ22;CARGAAGSARQLTF|;;,1,
...,...,...,...,...,...,...
324,TTTATGCCAAGCTGGA-1,TRAV23/DV6;TRAJ48;CAASIGSFGNEKLTF,;;,TRAV23/DV6;TRAJ48;CAASIGSFGNEKLTF|;;,1,
325,TTTGCGCAGTGCTGCC-1,TRAV19;TRAJ33;CALSEAGSNYQLIW,;;,TRAV19;TRAJ33;CALSEAGSNYQLIW|;;,1,27.0
326,TTTGTCAAGGGTCTCC-1,TRAV12-1;TRAJ29;CYPDTPLVF,;;,TRAV12-1;TRAJ29;CYPDTPLVF|;;,1,
327,TTTGTCATCAAACCAC-1,TRAV21;TRAJ26;CAVKATNYGQNFVF,;;,TRAV21;TRAJ26;CAVKATNYGQNFVF|;;,1,19.0


### Impute c0 both chains from 10x

In [75]:
query = df[(df.num_clonotype==0) & ~df.num_clonotype.isin(c10.clonotype)].dropna(subset=['genes_TRA','genes_TRB']).groupby(['gem','chain_a','chain_b','chains']).size().reset_index().rename(columns={'gem':'clonotype',0:'gem'})
query['pairs'] = query.apply(lambda row: get_pairs(row, impute=True), axis=1)
dct = query.dropna().set_index('clonotype').pairs.astype(int).to_dict()
clonotype_nll.update(dct)
query

Unnamed: 0,clonotype,chain_a,chain_b,chains,gem,pairs
0,AACACGTAGTGGTAGC-1,TRAV38-2/DV8;TRAJ43;CAYQWGGDMRF,TRBV5-1;TRBJ2-1;CASSATNEQFF,TRAV38-2/DV8;TRAJ43;CAYQWGGDMRF|TRBV5-1;TRBJ2-...,1,
1,AACTCCCGTGATAAAC-1,TRAV9-2;TRAJ45;CALSESMYSGGGADGLTF,TRBV11-3;TRBJ2-1;CASSLGPYNEQFF,TRAV9-2;TRAJ45;CALSESMYSGGGADGLTF|TRBV11-3;TRB...,1,
2,AACTCTTTCCACGAAT-1,TRAV22;TRAJ16;CAVLTRFSDGQKLLF,TRBV11-3;TRBJ1-2;CASSLDPGGYGYTF,TRAV22;TRAJ16;CAVLTRFSDGQKLLF|TRBV11-3;TRBJ1-2...,1,244.0
3,AACTGGTGTAGCGTCC-1,TRAV38-2/DV8;TRAJ58;CAYSWETSGSRLTF,TRBV7-8;TRBJ2-1;CASTLSSGLAGGRGNEQFF,TRAV38-2/DV8;TRAJ58;CAYSWETSGSRLTF|TRBV7-8;TRB...,1,
4,AAGCCGCTCGCCTGAG-1,TRAV1-2;TRAJ33;CAVRDSNYQLIW,TRBV5-1;TRBJ2-7;CASSLEGQASSYEQYF,TRAV1-2;TRAJ33;CAVRDSNYQLIW|TRBV5-1;TRBJ2-7;CA...,1,
...,...,...,...,...,...,...
200,TTAGGCACAAGCCTAT-1,TRAV22;TRAJ56;CGRGKLTF,TRBV6-5;TRBJ2-7;CASSWGAVSYEQYF,TRAV22;TRAJ56;CGRGKLTF|TRBV6-5;TRBJ2-7;CASSWGA...,1,
201,TTCCCAGCATTTGCCC-1,TRAV22;TRAJ26;CAVDNYGQNFVF,TRBV27;TRBJ2-3;CASSPSPRGFTDTQYF,TRAV22;TRAJ26;CAVDNYGQNFVF|TRBV27;TRBJ2-3;CASS...,1,
202,TTCGAAGAGACCACGA-1,TRAV10;TRAJ50;CVVSGSYDKVIF,TRBV27;TRBJ2-1;CASSLFGLAETTNEQFF,TRAV10;TRAJ50;CVVSGSYDKVIF|TRBV27;TRBJ2-1;CASS...,1,
203,TTCTACAAGTTGTAGA-1,TRAV12-1;TRAJ43;CVVNPLYKSMRF,TRBV4-3;TRBJ1-1;CASSPSRNTEAFF,TRAV12-1;TRAJ43;CVVNPLYKSMRF|TRBV4-3;TRBJ1-1;C...,1,


# Remaining Null clonotypes

In [95]:
# Dictionary of all GEMs that contain TCR replicas
# Key GEM that matches with value GEM
# Later replace the key values with a novel clonotype and inverse the dict?
gem_nll = dict()

### Merge c0 both

In [96]:
c00 = df[(df.num_clonotype==0) & ~df.gem.isin(clonotype_nll.keys())].dropna(subset=['genes_TRA','genes_TRB']).groupby(['gem','chain_a','chain_b','chains']).size().reset_index().rename(columns={'gem':'clonotype',0:'gem'})
query = c00.copy()
query['pairs'] = query.apply(lambda row: get_pairs(row, ref=c00, impute=False), axis=1)
#c00 = query.copy()
dct = query.dropna().set_index('clonotype').pairs.to_dict()
gem_nll.update(dct)
query

Unnamed: 0,clonotype,chain_a,chain_b,chains,gem,pairs
0,AACACGTAGTGGTAGC-1,TRAV38-2/DV8;TRAJ43;CAYQWGGDMRF,TRBV5-1;TRBJ2-1;CASSATNEQFF,TRAV38-2/DV8;TRAJ43;CAYQWGGDMRF|TRBV5-1;TRBJ2-...,1,
1,AACTCCCGTGATAAAC-1,TRAV9-2;TRAJ45;CALSESMYSGGGADGLTF,TRBV11-3;TRBJ2-1;CASSLGPYNEQFF,TRAV9-2;TRAJ45;CALSESMYSGGGADGLTF|TRBV11-3;TRB...,1,
2,AACTGGTGTAGCGTCC-1,TRAV38-2/DV8;TRAJ58;CAYSWETSGSRLTF,TRBV7-8;TRBJ2-1;CASTLSSGLAGGRGNEQFF,TRAV38-2/DV8;TRAJ58;CAYSWETSGSRLTF|TRBV7-8;TRB...,1,
3,AAGCCGCTCGCCTGAG-1,TRAV1-2;TRAJ33;CAVRDSNYQLIW,TRBV5-1;TRBJ2-7;CASSLEGQASSYEQYF,TRAV1-2;TRAJ33;CAVRDSNYQLIW|TRBV5-1;TRBJ2-7;CA...,1,
4,AAGGTTCGTCAGCTAT-1,TRAV13-1;TRAJ23;CAANYNQGGKLIF,TRBV27;TRBJ2-5;CASSFTVEETQYF,TRAV13-1;TRAJ23;CAANYNQGGKLIF|TRBV27;TRBJ2-5;C...,1,
...,...,...,...,...,...,...
146,TTAGGACTCTAACGGT-1,TRAV17;TRAJ57;CATDAKNRGSEKLVF,TRBV4-1;TRBJ2-3;CASSQDRGADTQYF,TRAV17;TRAJ57;CATDAKNRGSEKLVF|TRBV4-1;TRBJ2-3;...,1,
147,TTAGGCACAAGCCTAT-1,TRAV22;TRAJ56;CGRGKLTF,TRBV6-5;TRBJ2-7;CASSWGAVSYEQYF,TRAV22;TRAJ56;CGRGKLTF|TRBV6-5;TRBJ2-7;CASSWGA...,1,"[CTCTACGCAGATAATG-1, TTAGGCACAAGCCTAT-1]"
148,TTCCCAGCATTTGCCC-1,TRAV22;TRAJ26;CAVDNYGQNFVF,TRBV27;TRBJ2-3;CASSPSPRGFTDTQYF,TRAV22;TRAJ26;CAVDNYGQNFVF|TRBV27;TRBJ2-3;CASS...,1,
149,TTCGAAGAGACCACGA-1,TRAV10;TRAJ50;CVVSGSYDKVIF,TRBV27;TRBJ2-1;CASSLFGLAETTNEQFF,TRAV10;TRAJ50;CVVSGSYDKVIF|TRBV27;TRBJ2-1;CASS...,1,


### Merge c0 both chains

In [97]:
# Same same but different..
#lst = query[query.pairs.isna() & query.chains.duplicated(keep=False)].sort_values(by='chains').groupby('chains').clonotype.apply(list).values
#dct = {x[0]:x for x in lst}
##gem_nll.update(dct)
#dct

### Impute c0 beta
Find complete c0 TCRs that match on the beta chain

In [98]:
query = df[(df.num_clonotype==0) & df.genes_TRA.isna() & ~df.gem.isin(clonotype_nll.keys())].dropna(subset=['genes_TRB']).groupby(['gem','chain_a','chain_b','chains']).size().reset_index().rename(columns={'gem':'clonotype',0:'gem'})
query['pairs'] = query.apply(lambda row: get_pairs(row, ref=c00, var='chain_b'), axis=1)
#dct = query.dropna().set_index('clonotype').pairs.to_dict()
lst = query.dropna().apply(lambda row: [row.clonotype, row.pairs], axis=1).values
dct = {x[0]:x for x in lst}
gem_nll.update(dct)
query.dropna()

Unnamed: 0,clonotype,chain_a,chain_b,chains,gem,pairs
5,AACACGTCACCTATCC-1,;;,TRBV28;TRBJ2-7;CASSLNIGSSYEQYF,;;|TRBV28;TRBJ2-7;CASSLNIGSSYEQYF,1,CTCGAGGTCAGCACAT-1
28,ACCAGTATCAGTTAGC-1,;;,TRBV29-1;TRBJ1-2;CSVEGYWPGAGYGYTF,;;|TRBV29-1;TRBJ1-2;CSVEGYWPGAGYGYTF,1,CCTAAAGGTCCGTTAA-1
29,ACCCACTGTCACTGGC-1,;;,TRBV27;TRBJ2-2;CASSPDRSRANTGELFF,;;|TRBV27;TRBJ2-2;CASSPDRSRANTGELFF,1,CCACGGAAGGCGTACA-1
51,ACTTACTGTATCTGCA-1,;;,TRBV27;TRBJ2-2;CASSPDRSRANTGELFF,;;|TRBV27;TRBJ2-2;CASSPDRSRANTGELFF,1,CCACGGAAGGCGTACA-1
69,AGGGATGAGTCATGCT-1,;;,TRBV7-9;TRBJ2-7;CASREGLQYEQYF,;;|TRBV7-9;TRBJ2-7;CASREGLQYEQYF,1,CTGTTTAAGAACTCGG-1
71,AGGTCATTCCCAAGAT-1,;;,TRBV9;TRBJ2-3;CASSGGGGTGTGDTDTQYF,;;|TRBV9;TRBJ2-3;CASSGGGGTGTGDTDTQYF,1,GGACGTCGTCTGGAGA-1
77,ATCCGAAAGTCGAGTG-1,;;,TRBV7-9;TRBJ2-7;CASREGLQYEQYF,;;|TRBV7-9;TRBJ2-7;CASREGLQYEQYF,1,CTGTTTAAGAACTCGG-1
101,CAGCTGGTCTGGTTCC-1,;;,TRBV27;TRBJ2-2;CASSLFGLKNTGELFF,;;|TRBV27;TRBJ2-2;CASSLFGLKNTGELFF,1,AGCGTATTCTCTGTCG-1
104,CATCAGAAGGCGACAT-1,;;,TRBV3-1;TRBJ2-1;CASSRPGGPGSYNEQFF,;;|TRBV3-1;TRBJ2-1;CASSRPGGPGSYNEQFF,1,AGCCTAATCTGCGTAA-1
121,CCGGGATTCCTATTCA-1,;;,TRBV3-1;TRBJ2-1;CASSRPGGPGSYNEQFF,;;|TRBV3-1;TRBJ2-1;CASSRPGGPGSYNEQFF,1,AGCCTAATCTGCGTAA-1


### Merge c0 beta

In [99]:
lst = query[query.pairs.isna() & query.chain_b.duplicated(keep=False)].sort_values(by='chain_b').groupby('chain_b').clonotype.apply(list).values
dct = {x[0]:x for x in lst}
gem_nll.update(dct)

### Impute c0 alpha

In [100]:
query = df[(df.num_clonotype==0) & df.genes_TRB.isna() & ~df.gem.isin(clonotype_nll.keys())].dropna(subset=['genes_TRA']).groupby(['gem','chain_a','chain_b','chains']).size().reset_index().rename(columns={'gem':'clonotype',0:'gem'})
query['pairs'] = query.apply(lambda row: get_pairs(row, ref=c00, var='chain_a'), axis=1)
#dct = query.dropna().set_index('clonotype').pairs.to_dict()
lst = query.dropna().apply(lambda row: [row.clonotype, row.pairs], axis=1).values
dct = {x[0]:x for x in lst}
gem_nll.update(dct)

### merge replicas of c0 alpha

In [101]:
lst = query[query.pairs.isna() & query.chain_a.duplicated(keep=False)].sort_values(by='chain_a').groupby('chain_a').clonotype.apply(list).values
dct = {x[0]:x for x in lst}
gem_nll.update(dct)

In [102]:
gem_nll

{'ACTTGTTAGGCATTGG-1': ['ACTTGTTAGGCATTGG-1', 'TTAACTCTCATGTCCC-1'],
 'AGGGTGACACTATCTT-1': ['AGGGTGACACTATCTT-1', 'GAGGTGAGTATATGAG-1'],
 'CCCAGTTTCATAGCAC-1': ['CCCAGTTTCATAGCAC-1', 'TGATTTCCAACTGGCC-1'],
 'CCGGGATTCACGCATA-1': ['CCGGGATTCACGCATA-1', 'CGTTGGGGTGATGATA-1'],
 'CGGACACGTGTGACGA-1': ['CGGACACGTGTGACGA-1', 'GATCGTACATACCATG-1'],
 'CGTTGGGGTGATGATA-1': ['CCGGGATTCACGCATA-1', 'CGTTGGGGTGATGATA-1'],
 'CTCGGGATCTCAACTT-1': ['CTCGGGATCTCAACTT-1', 'TCGTAGAAGGGAACGG-1'],
 'CTCTACGCAGATAATG-1': ['CTCTACGCAGATAATG-1', 'TTAGGCACAAGCCTAT-1'],
 'GAGGTGAGTATATGAG-1': ['AGGGTGACACTATCTT-1', 'GAGGTGAGTATATGAG-1'],
 'GATCGTACATACCATG-1': ['CGGACACGTGTGACGA-1', 'GATCGTACATACCATG-1'],
 'TCGTAGAAGGGAACGG-1': ['CTCGGGATCTCAACTT-1', 'TCGTAGAAGGGAACGG-1'],
 'TGATTTCCAACTGGCC-1': ['CCCAGTTTCATAGCAC-1', 'TGATTTCCAACTGGCC-1'],
 'TTAACTCTCATGTCCC-1': ['ACTTGTTAGGCATTGG-1', 'TTAACTCTCATGTCCC-1'],
 'TTAGGCACAAGCCTAT-1': ['CTCTACGCAGATAATG-1', 'TTAGGCACAAGCCTAT-1'],
 'AACACGTCACCTATCC-1': ['AACACGTCA

In [188]:
c10[c10.clonotype.isin([23])]

Unnamed: 0,clonotype,chain_a,chain_b,chains,gem,pairs,keep
24,23,TRAV16;TRAJ18;CALYRLGRLYF,TRBV14;TRBJ2-7;CASSQDYSSSYEQYF,TRAV16;TRAJ18;CALYRLGRLYF|TRBV14;TRBJ2-7;CASSQ...,35,,


In [51]:
#c10['pairs'] = c10.apply(lambda row: c10.loc[c10.chain_a.isin([row.chain_a]) &
#                                             (c10.clonotype != row.clonotype), 'clonotype'].to_list(), axis=1)

In [79]:
#c10['clones'] = c10.apply(lambda row: c10.clonotype.to_list() + ['c0'], axis=1)
c10['clones'] = c10.apply(lambda row: c10.clonotype.to_list(), axis=1)

In [101]:
lol = c10.explode(['pairs','clones'])
lol[lol.pairs > 0].groupby('clonotype').clones.unique()#.apply(lambda row: set(row.clones))#.update(set(row.clonotype))

clonotype
c1193                               [c1194]
c1194                               [c1193]
c1218                               [c1219]
c1219                               [c1218]
c136                          [c2120, c215]
c2032                               [c2033]
c2033                               [c2032]
c2118      [c2123, c2127, c2128, c364, c54]
c2120                          [c136, c215]
c2122                         [c2124, c365]
c2123      [c2118, c2127, c2128, c364, c54]
c2124                         [c2122, c365]
c2127      [c2118, c2123, c2128, c364, c54]
c2128      [c2118, c2123, c2127, c364, c54]
c215                          [c136, c2120]
c2707                                 [c71]
c2827                                [c501]
c2901                               [c2902]
c2902                               [c2901]
c2936                               [c5483]
c364      [c2118, c2123, c2127, c2128, c54]
c365                         [c2122, c2124]
c501                  

In [94]:
df[['ct','peptide_HLA','peptide_HLA_lst','umi_count_lst_mhc','VDJdb_check','VDJdb_pep','ct_pep','ct_hla','genes_lst_TRA','genes_lst_TRB']].dropna(subset=['VDJdb_check'])

Unnamed: 0,ct,peptide_HLA,peptide_HLA_lst,umi_count_lst_mhc,VDJdb_check,VDJdb_pep,ct_pep,ct_hla,genes_lst_TRA,genes_lst_TRB
626,2840.0,FLYALALLL A0201,[FLYALALLL A0201],[17.0],True,['FLYALALLL'],,,['TRAV17;TRAJ11;TRAC'],['TRBV6-5;;TRBJ1-2;TRBC1']
1172,162.0,FLYALALLL A0201,"[CLGGLLTMV A0201, FLYALALLL A0201]","[1.0, 38.0]",True,['FLYALALLL'],FLYALALLL A0201,,['TRAV17;TRAJ11;TRAC'],['TRBV6-5;;TRBJ1-2;TRBC1']
1958,162.0,FLYALALLL A0201,"[CLGGLLTMV A0201, RVRAYTYSK A0301, TPSVSSSISSL...","[1.0, 1.0, 1.0, 1.0, 22.0]",True,['FLYALALLL'],FLYALALLL A0201,,['TRAV17;TRAJ11;TRAC'],['TRBV6-5;;TRBJ1-2;TRBC1']
2155,5316.0,NLVPMVATV A0201,"[CLGGLLTMV A0201, RVRAYTYSK A0301, NLVPMVATV A...","[2.0, 21.0, 26.0]",True,['NLVPMVATV'],,,['TRAV26-2;TRAJ43;TRAC'],['TRBV7-6;;TRBJ1-4;TRBC1']
2183,162.0,FLYALALLL A0201,"[CLGGLLTMV A0201, RVRAYTYSK A0301, FLYALALLL A...","[2.0, 2.0, 46.0]",True,['FLYALALLL'],FLYALALLL A0201,,['TRAV17;TRAJ11;TRAC'],['TRBV6-5;;TRBJ1-2;TRBC1']
2208,883.0,GLCTLVAML A0201,"[RVRAYTYSK A0301, GLCTLVAML A0201]","[9.0, 23.0]",True,['GLCTLVAML'],,,['TRAV5;TRAJ31;TRAC'],['TRBV20-1;TRBD1;TRBJ1-2;TRBC1']
2695,3697.0,GLCTLVAML A0201,"[RVRAYTYSK A0301, GLCTLVAML A0201]","[6.0, 23.0]",True,['GLCTLVAML'],,,['TRAV9-2;TRAJ34;TRAC'],['TRBV3-1;;TRBJ2-7;TRBC2']
3435,162.0,FLYALALLL A0201,"[YVLDHLIVV A0201, TPRVTGGGAM B0702, FLYALALLL ...","[1.0, 1.0, 33.0]",True,['FLYALALLL'],FLYALALLL A0201,,['TRAV17;TRAJ11;TRAC'],['TRBV6-5;;TRBJ1-2;TRBC1']
3660,5317.0,NLVPMVATV A0201,[NLVPMVATV A0201],[24.0],True,['NLVPMVATV'],,,['TRAV26-2;TRAJ43;TRAC'],['TRBV7-6;;TRBJ1-4;TRBC1']
3909,1178.0,RVRAYTYSK A0301,"[GLCTLVAML A0201, RVRAYTYSK A0301]","[8.0, 11.0]",False,['GLCTLVAML'],,,['TRAV5;TRAJ37;TRAC'],['TRBV29-1;;TRBJ1-4;TRBC1']


In [53]:
tmp = pd.DataFrame(columns=['clonotype','pairs','clones'])

In [54]:
tmp['clonotype'] = ['c0']

In [55]:
# Count matches of clonotype 0 with the true clonotypes + itself
tmp['pairs'] =  [c10.chains.isin(c00.chains).astype(int).to_list() + [0]] #[c10.chain_b.isin(c00.chain_b).astype(int).to_list() + [0]]

In [56]:
# List the true clonotypes
tmp['clones'] = [c10.clonotype.to_list() + ['c0']]

In [57]:
tmp

Unnamed: 0,clonotype,pairs,clones
0,c0,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[c1, c10, c100, c1000, c1005, c1006, c101, c10..."


In [58]:
c = pd.concat([c10,tmp], ignore_index=True)

In [59]:
m = c.explode(['pairs','clones'])
m

Unnamed: 0,clonotype,chain_a,chain_b,chains,gem,pairs,clones
0,c1,TRAV30;TRAJ38;CGTEGAGNNRKLIW,TRBV3-1;TRBJ1-5;CASSQVFGTPTPQHF,TRAV30;TRAJ38;CGTEGAGNNRKLIW|TRBV3-1;TRBJ1-5;C...,255.0,0,c1
0,c1,TRAV30;TRAJ38;CGTEGAGNNRKLIW,TRBV3-1;TRBJ1-5;CASSQVFGTPTPQHF,TRAV30;TRAJ38;CGTEGAGNNRKLIW|TRBV3-1;TRBJ1-5;C...,255.0,0,c10
0,c1,TRAV30;TRAJ38;CGTEGAGNNRKLIW,TRBV3-1;TRBJ1-5;CASSQVFGTPTPQHF,TRAV30;TRAJ38;CGTEGAGNNRKLIW|TRBV3-1;TRBJ1-5;C...,255.0,0,c100
0,c1,TRAV30;TRAJ38;CGTEGAGNNRKLIW,TRBV3-1;TRBJ1-5;CASSQVFGTPTPQHF,TRAV30;TRAJ38;CGTEGAGNNRKLIW|TRBV3-1;TRBJ1-5;C...,255.0,0,c1000
0,c1,TRAV30;TRAJ38;CGTEGAGNNRKLIW,TRBV3-1;TRBJ1-5;CASSQVFGTPTPQHF,TRAV30;TRAJ38;CGTEGAGNNRKLIW|TRBV3-1;TRBJ1-5;C...,255.0,0,c1005
...,...,...,...,...,...,...,...
3025,c0,,,,,0,c993
3025,c0,,,,,0,c996
3025,c0,,,,,0,c997
3025,c0,,,,,0,c998


In [61]:
m[(m.pairs > 0) & (m.clonotype.isin(['c10','c2471','c3001']))]

Unnamed: 0,clonotype,chain_a,chain_b,chains,gem,pairs,clones
1,c10,TRAV24;TRAJ37;CACSSSNTGKLIF,TRBV4-1;TRBJ1-2;CASSQDRLTGGYTF,TRAV24;TRAJ37;CACSSSNTGKLIF|TRBV4-1;TRBJ1-2;CA...,37.0,1,c0
830,c2471,;;,TRBV4-1;TRBJ1-2;CASSQDRLTGGYTF,;;|TRBV4-1;TRBJ1-2;CASSQDRLTGGYTF,1.0,1,c0


In [43]:
df.loc[df.clonotype.isin(['clonotype10','clonotype2471','clonotype3001']), ['clonotype','ct','genes_lst_TRA','genes_lst_TRB','cdr3_TRA','cdr3_TRB']]

Unnamed: 0,clonotype,ct,genes_lst_TRA,genes_lst_TRB,cdr3_TRA,cdr3_TRB
363,clonotype10,10.0,['TRAV24;TRAJ37;TRAC'],['TRBV4-1;;TRBJ1-2;TRBC1'],CACSSSNTGKLIF,CASSQDRLTGGYTF
448,clonotype10,10.0,['TRAV24;TRAJ37;TRAC'],['TRBV4-1;;TRBJ1-2;TRBC1'],CACSSSNTGKLIF,CASSQDRLTGGYTF
762,clonotype10,10.0,['TRAV24;TRAJ37;TRAC'],['TRBV4-1;;TRBJ1-2;TRBC1'],CACSSSNTGKLIF,CASSQDRLTGGYTF
901,clonotype10,10.0,['TRAV24;TRAJ37;TRAC'],['TRBV4-1;;TRBJ1-2;TRBC1'],CACSSSNTGKLIF,CASSQDRLTGGYTF
917,clonotype10,10.0,['TRAV24;TRAJ37;TRAC'],['TRBV4-1;;TRBJ1-2;TRBC1'],CACSSSNTGKLIF,CASSQDRLTGGYTF
...,...,...,...,...,...,...
6978,clonotype10,10.0,,['TRBV4-1;;TRBJ1-2;TRBC1'],,CASSQDRLTGGYTF
7006,clonotype10,10.0,,['TRBV4-1;;TRBJ1-2;TRBC1'],,CASSQDRLTGGYTF
7030,clonotype10,10.0,,['TRBV4-1;;TRBJ1-2;TRBC1'],,CASSQDRLTGGYTF
7057,clonotype10,10.0,,['TRBV4-1;;TRBJ1-2;TRBC1'],,CASSQDRLTGGYTF


In [42]:
df.loc[df.clonotype.isin(['clonotype10','clonotype2471','clonotype3001']), ['clonotype','ct','genes_lst_TRA','genes_lst_TRB','cdr3_TRA','cdr3_TRB']]

Unnamed: 0,clonotype,ct,genes_lst_TRA,genes_lst_TRB,cdr3_TRA,cdr3_TRB
4251,clonotype3001,3001.0,['TRAV24;TRAJ37;TRAC'],['TRBV4-1;;TRBJ1-2;TRBC1'],CARSSSNTGKLIF,CASSQDRLTGGYTF
6443,clonotype2471,2471.0,,['TRBV4-1;;TRBJ1-2;TRBC1'],,CASSQDRLTGGYTF


In [29]:
out = c.explode(['pairs','clones']).pivot(index='clonotype',columns='clones', values='pairs').replace(0, np.nan).reset_index().rename(columns={'clonotype':'from'})

In [316]:
#out.to_csv('arc_data.b.csv', index=False)

In [34]:
out.dropna(how='all')

clones,from,c0,c1,c10,c100,c1000,c1005,c1006,c101,c1011,...,c984,c987,c988,c99,c991,c992,c993,c996,c997,c998
0,c0,,1.0,1.0,,,,,,,...,,,,,,,,,,
1,c1,1.0,,,,,,,,,...,,,,,,,,,,
2,c10,1.0,,,,,,,,,...,,,,,,,,,,
3,c100,,,,,,,,,,...,,,,,,,,,,
4,c1000,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3021,c992,,,,,,,,,,...,,,,,1.0,,,,,
3022,c993,,,,,,,,,,...,,,,,,,,,,
3023,c996,,,,,,,,,,...,,,,,,,,,,
3024,c997,,,,,,,,,,...,,,,,,,,,,
