In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from ast import literal_eval
import seaborn as sns

In [2]:
import sys  
sys.path.insert(0, '../scripts')

from D_plot_specificity_matrix_utils import (peptide_per_clonotype_by_gem_size,
                                             multiple_peptides_per_gem_w_filtering,
                                             calc_binding_concordance,
                                             epitope_sorter_index,
                                             peptides_per_gem)

In [2]:
sns.set_style('ticks', {'axes.edgecolor': '0',  
                        'xtick.color': '0',
                        'ytick.color': '0'})

In [3]:
def HLA_cd8_converter(x):
    #define format of datetime
    return x.replace("[","").replace("]","").replace(",", "").replace("'","").split(" ")

def cdr3_lst_converter(x):
    #define format of datetime
    return x.replace("[","").replace("]","").replace("'","").split(" ")

def epitope_converter(x):
    #define format of datetime
    return [y for y in x.replace("[","").replace("]","").replace("\n","").split("'") if (y != '') & (y != ' ')]

def peptide_hla_converter(x):
    return re.findall("\w+\s{1}\w{1}\d+", x.replace("[","").replace("]","").replace("\n","").replace("'",""))

def literal_converter(val):
    # replace NaN with '' and perform literal eval on the rest
    return [] if val == '' else literal_eval(val)

converters = {'peptide_HLA_lst': peptide_hla_converter,
              'umi_count_lst_mhc': literal_eval,
              'umi_count_lst_TRA': literal_converter,'umi_count_lst_TRB': literal_converter,
              'cdr3_lst_TRA': cdr3_lst_converter,
              'cdr3_lst_TRB': cdr3_lst_converter,
              'HLA_lst_mhc': cdr3_lst_converter,'HLA_cd8': HLA_cd8_converter} #

In [4]:
def notnan(x):
    return x == x

In [5]:
def get_multiplets(df):
    #tmp = df[idx1 & idx2]
    dct = df.groupby(['ct','peptide_HLA']).gem.count() > 1
    idx = df.set_index(['ct','peptide_HLA']).index.map(dct)
    return idx.fillna(False)

# Input

In [6]:
VALID = '../experiments/exp13/run1_archive/cat/eval_clonotypes/valid_ct.csv'
#OS2 = '../experiments/exp13/run2/cat/eval_clonotypes/valid_ct.csv'

# Load

In [7]:
df = pd.read_csv(VALID, converters=converters)

In [8]:
df.fillna({'umi_count_mhc':0, 'delta_umi_mhc':0, 'umi_count_mhc_rel':0,
           'umi_count_cd8':0, 'delta_umi_cd8':0,
           'umi_count_TRA':0, 'delta_umi_TRA':0,
           'umi_count_TRB':0, 'delta_umi_TRB':0,
           'cdr3_TRA':'','cdr3_TRB':''}, inplace=True)

# Only on most abundant chain

In [9]:
df.num_clonotype = 'c' + df.num_clonotype.astype(int).astype(str)

In [10]:
df['chain_a'] = df.v_gene_TRA.fillna('') + ";" + df.j_gene_TRA.fillna('') + ";" + df.cdr3_TRA.fillna('')
df['chain_b'] = df.v_gene_TRB.fillna('') + ";" + df.j_gene_TRB.fillna('') + ";" + df.cdr3_TRB.fillna('')
df['chains'] = df.chain_a + '|' + df.chain_b

In [44]:
c10 = df[df.num_clonotype!='c0'].groupby(['num_clonotype','chain_a','chain_b','chains']).gem.size().reset_index().drop_duplicates(subset='num_clonotype', keep='last').rename(columns={'num_clonotype':'clonotype'})
c10#.head(60)

Unnamed: 0,clonotype,chain_a,chain_b,chains,gem
1,c1,TRAV30;TRAJ38;CGTEGAGNNRKLIW,TRBV3-1;TRBJ1-5;CASSQVFGTPTPQHF,TRAV30;TRAJ38;CGTEGAGNNRKLIW|TRBV3-1;TRBJ1-5;C...,255
3,c10,TRAV24;TRAJ37;CACSSSNTGKLIF,TRBV4-1;TRBJ1-2;CASSQDRLTGGYTF,TRAV24;TRAJ37;CACSSSNTGKLIF|TRBV4-1;TRBJ1-2;CA...,37
4,c100,TRAV26-1;TRAJ27;CIVTTNTNAGKSTF,TRBV7-2;TRBJ1-2;CASSFLTGGNRDYGYTF,TRAV26-1;TRAJ27;CIVTTNTNAGKSTF|TRBV7-2;TRBJ1-2...,7
5,c1000,;;,TRBV29-1;TRBJ2-3;CSVVGQEHTDTQYF,;;|TRBV29-1;TRBJ2-3;CSVVGQEHTDTQYF,1
6,c1005,;;,TRBV20-1;TRBJ2-2;CSAPDRGRIGELFF,;;|TRBV20-1;TRBJ2-2;CSAPDRGRIGELFF,1
...,...,...,...,...,...
3073,c992,;;,TRBV30;TRBJ1-1;CAWSAYSESAEAFF,;;|TRBV30;TRBJ1-1;CAWSAYSESAEAFF,1
3074,c993,;;,TRBV30;TRBJ2-7;CAWSGARGPYEQYF,;;|TRBV30;TRBJ2-7;CAWSGARGPYEQYF,1
3075,c996,;;,TRBV29-1;TRBJ2-1;CSVRTSGDYNEQFF,;;|TRBV29-1;TRBJ2-1;CSVRTSGDYNEQFF,1
3076,c997,;;,TRBV29-1;TRBJ1-1;CSVDDRQGNTEAFF,;;|TRBV29-1;TRBJ1-1;CSVDDRQGNTEAFF,1


In [69]:
df[df.genes_TRB.isna()].dropna(subset=['genes_TRA']).groupby(['gem','chain_a','chain_b','chains']).size().reset_index().rename(columns={'gem':'clonotype',0:'gem'})

Unnamed: 0,clonotype,chain_a,chain_b,chains,gem
0,AAACGGGCAGGCGATA-1,TRAV14/DV4;TRAJ12;CAMREGMDSSYKLIF,;;,TRAV14/DV4;TRAJ12;CAMREGMDSSYKLIF|;;,1
1,AAACGGGGTCATGCCG-1,TRAV29/DV5;TRAJ57;CAAKTQGGSEKLVF,;;,TRAV29/DV5;TRAJ57;CAAKTQGGSEKLVF|;;,1
2,AAAGATGCATCATCCC-1,TRAV12-2;TRAJ47;CAVNSISGYGNKLVF,;;,TRAV12-2;TRAJ47;CAVNSISGYGNKLVF|;;,1
3,AAAGATGTCATGGTCA-1,TRAV3;TRAJ15;CAVRDITGQAGTALIF,;;,TRAV3;TRAJ15;CAVRDITGQAGTALIF|;;,1
4,AAAGCAAAGACAGACC-1,TRAV38-2/DV8;TRAJ58;CAYSWETSGSRLTF,;;,TRAV38-2/DV8;TRAJ58;CAYSWETSGSRLTF|;;,1
...,...,...,...,...,...
486,TTTGCGCAGTGCTGCC-1,TRAV19;TRAJ33;CALSEAGSNYQLIW,;;,TRAV19;TRAJ33;CALSEAGSNYQLIW|;;,1
487,TTTGTCAAGGGTCTCC-1,TRAV12-1;TRAJ29;CYPDTPLVF,;;,TRAV12-1;TRAJ29;CYPDTPLVF|;;,1
488,TTTGTCAGTTCCACAA-1,TRAV3;TRAJ15;CAVRDITGQAGTALIF,;;,TRAV3;TRAJ15;CAVRDITGQAGTALIF|;;,1
489,TTTGTCATCAAACCAC-1,TRAV21;TRAJ26;CAVKATNYGQNFVF,;;,TRAV21;TRAJ26;CAVKATNYGQNFVF|;;,1


In [70]:
df[df.genes_TRA.isna()].dropna(subset=['genes_TRB']).groupby(['gem','chain_a','chain_b','chains']).size().reset_index().rename(columns={'gem':'clonotype',0:'gem'})

Unnamed: 0,clonotype,chain_a,chain_b,chains,gem
0,AAACCTGAGACATAAC-1,;;,TRBV27;TRBJ2-7;CQSRAGIAGGIYEQYF,;;|TRBV27;TRBJ2-7;CQSRAGIAGGIYEQYF,1
1,AAACCTGAGATGTCGG-1,;;,TRBV7-2;TRBJ1-1;CASSLPGTGVNTEAFF,;;|TRBV7-2;TRBJ1-1;CASSLPGTGVNTEAFF,1
2,AAACCTGAGCTGTTCA-1,;;,TRBV11-2;TRBJ2-3;CASSLQTGRTDTQYF,;;|TRBV11-2;TRBJ2-3;CASSLQTGRTDTQYF,1
3,AAACCTGAGTGAACAT-1,;;,TRBV7-8;TRBJ2-1;CASSLGVGQGYNEQFF,;;|TRBV7-8;TRBJ2-1;CASSLGVGQGYNEQFF,1
4,AAACCTGGTCAATGTC-1,;;,TRBV4-1;TRBJ1-2;CASSQDRLTGGYTF,;;|TRBV4-1;TRBJ1-2;CASSQDRLTGGYTF,1
...,...,...,...,...,...
2274,TTTGGTTGTTGAGGTG-1,;;,TRBV28;TRBJ2-7;CASSYVGYEQYF,;;|TRBV28;TRBJ2-7;CASSYVGYEQYF,1
2275,TTTGTCAAGAATGTTG-1,;;,TRBV6-1;TRBJ2-7;CASSGAPGRNPFYEQYF,;;|TRBV6-1;TRBJ2-7;CASSGAPGRNPFYEQYF,1
2276,TTTGTCAAGCGTGAAC-1,;;,TRBV20-1;TRBJ2-1;CSASRQGGFGNEQFF,;;|TRBV20-1;TRBJ2-1;CSASRQGGFGNEQFF,1
2277,TTTGTCATCACCTCGT-1,;;,TRBV19;TRBJ1-5;CASSSTGGNQPQHF,;;|TRBV19;TRBJ1-5;CASSSTGGNQPQHF,1


In [72]:
c00 = (df
       .dropna(subset=['genes_TRA','genes_TRB'])
       .groupby(['gem','chain_a','chain_b','chains'])
       .size().reset_index().rename(columns={'gem':'clonotype',0:'gem'}))
c00

Unnamed: 0,clonotype,chain_a,chain_b,chains,gem
0,AAACCTGAGCCCAGCT-1,TRAV8-3;TRAJ10;CAVGVRGGGNKLTF,TRBV6-5;TRBJ1-2;CASSVTGPPREDGYTF,TRAV8-3;TRAJ10;CAVGVRGGGNKLTF|TRBV6-5;TRBJ1-2;...,1
1,AAACCTGAGTCAATAG-1,TRAV17;TRAJ38;CATFNAGNNRKLIW,TRBV19;TRBJ2-7;CASSLVAGGHEQYF,TRAV17;TRAJ38;CATFNAGNNRKLIW|TRBV19;TRBJ2-7;CA...,1
2,AAACCTGCAATCCGAT-1,TRAV1-1;TRAJ10;CAVRAITGGGNKLTF,TRBV5-1;TRBJ2-5;CASSTPSSGPQETQYF,TRAV1-1;TRAJ10;CAVRAITGGGNKLTF|TRBV5-1;TRBJ2-5...,1
3,AAACCTGCAGCCAGAA-1,TRAV21;TRAJ33;CAVLMDSNYQLIW,TRBV10-2;TRBJ1-1;CASSADGMNTEAFF,TRAV21;TRAJ33;CAVLMDSNYQLIW|TRBV10-2;TRBJ1-1;C...,1
4,AAACCTGCATGCCACG-1,TRAV17;TRAJ12;CATVVRMDSSYKLIF,TRBV7-9;TRBJ2-1;CASSLIGQGKKDEQFF,TRAV17;TRAJ12;CATVVRMDSSYKLIF|TRBV7-9;TRBJ2-1;...,1
...,...,...,...,...,...
4336,TTTGTCACAAGCGAGT-1,TRAV16;TRAJ18;CALYRLGRLYF,TRBV14;TRBJ2-7;CASSQDYSSSYEQYF,TRAV16;TRAJ18;CALYRLGRLYF|TRBV14;TRBJ2-7;CASSQ...,1
4337,TTTGTCACACCGATAT-1,TRAV5;TRAJ10;CAEILTGGGNKLTF,TRBV11-2;TRBJ2-3;CASSLQTGRTDTQYF,TRAV5;TRAJ10;CAEILTGGGNKLTF|TRBV11-2;TRBJ2-3;C...,1
4338,TTTGTCATCAGCCTAA-1,TRAV1-1;TRAJ10;CAVRAITGGGNKLTF,TRBV5-1;TRBJ2-5;CASSTPSSGPQETQYF,TRAV1-1;TRAJ10;CAVRAITGGGNKLTF|TRBV5-1;TRBJ2-5...,1
4339,TTTGTCATCCCAACGG-1,TRAV14/DV4;TRAJ12;CAMREGMDSSYKLIF,TRBV7-9;TRBJ1-1;CASSSHDRTGVRTEAFF,TRAV14/DV4;TRAJ12;CAMREGMDSSYKLIF|TRBV7-9;TRBJ...,1


In [45]:
#c00 = df[df.num_clonotype=='c0'].groupby(['num_clonotype','chain_a','chain_b','chains']).gem.size().reset_index().rename(columns={'num_clonotype':'clonotype'})
#c00

Unnamed: 0,clonotype,chain_a,chain_b,chains,gem
0,c0,;;,TRBV10-1;TRBJ1-1;CASSLTANTEAFF,;;|TRBV10-1;TRBJ1-1;CASSLTANTEAFF,1
1,c0,;;,TRBV10-1;TRBJ2-1;CASSDLANEQFF,;;|TRBV10-1;TRBJ2-1;CASSDLANEQFF,1
2,c0,;;,TRBV10-2;TRBJ1-1;CASSADGMNTEAFF,;;|TRBV10-2;TRBJ1-1;CASSADGMNTEAFF,1
3,c0,;;,TRBV10-2;TRBJ1-1;CASSDDGMNTEAFF,;;|TRBV10-2;TRBJ1-1;CASSDDGMNTEAFF,1
4,c0,;;,TRBV10-3;TRBJ2-5;CAISEGRETQYF,;;|TRBV10-3;TRBJ2-5;CAISEGRETQYF,2
...,...,...,...,...,...
793,c0,TRAV9-2;TRAJ20;CALTSRSNDYKLSF,TRBV29-1;TRBJ2-5;CSVPPRETQYF,TRAV9-2;TRAJ20;CALTSRSNDYKLSF|TRBV29-1;TRBJ2-5...,1
794,c0,TRAV9-2;TRAJ29;CALPHSGNTPLVF,TRBV27;TRBJ2-3;CASSLNPWAPSTDTQYF,TRAV9-2;TRAJ29;CALPHSGNTPLVF|TRBV27;TRBJ2-3;CA...,1
795,c0,TRAV9-2;TRAJ45;CALSESMYSGGGADGLTF,TRBV11-3;TRBJ2-1;CASSLGPYNEQFF,TRAV9-2;TRAJ45;CALSESMYSGGGADGLTF|TRBV11-3;TRB...,1
796,c0,TRAV9-2;TRAJ52;CARRNAGGTSYGKLTF,;;,TRAV9-2;TRAJ52;CARRNAGGTSYGKLTF|;;,1


In [46]:
#c10 = pd.concat([c10,c00], ignore_index=True)

In [47]:
def get_alpha_pairs(row):
    return ((c10.chain_a.isin([row.chain_a]) &
             (c10.clonotype != row.clonotype)).astype(int).to_list() +
            [int(any(c00.chain_a.isin([row.chain_a])))])

In [48]:
def get_beta_pairs(row):
    return ((c10.chain_b.isin([row.chain_b]) &
             (c10.clonotype != row.clonotype)).astype(int).to_list() +
            [int(any(c00.chain_b.isin([row.chain_b])))])

In [49]:
def get_pairs(row):
    return ((c10.chains.isin([row.chains]) &
             (c10.clonotype != row.clonotype)).astype(int).to_list() +
            [int(any(c00.chains.isin([row.chains])))])

In [50]:
c10['pairs'] = c10.apply(lambda row: get_pairs(row), axis=1)

In [51]:
#c10['pairs'] = c10.apply(lambda row: c10.loc[c10.chain_a.isin([row.chain_a]) &
#                                             (c10.clonotype != row.clonotype), 'clonotype'].to_list(), axis=1)

In [52]:
c10['clones'] = c10.apply(lambda row: c10.clonotype.to_list() + ['c0'], axis=1)

In [75]:
lol = c10.explode(['pairs','clones'])
lol[lol.pairs > 0]

Unnamed: 0,clonotype,chain_a,chain_b,chains,gem,pairs,clones
3,c10,TRAV24;TRAJ37;CACSSSNTGKLIF,TRBV4-1;TRBJ1-2;CASSQDRLTGGYTF,TRAV24;TRAJ37;CACSSSNTGKLIF|TRBV4-1;TRBJ1-2;CA...,37,1,c0
20,c1038,;;,TRBV20-1;TRBJ2-7;CSAISGTDLREQYF,;;|TRBV20-1;TRBJ2-7;CSAISGTDLREQYF,1,1,c1041
20,c1038,;;,TRBV20-1;TRBJ2-7;CSAISGTDLREQYF,;;|TRBV20-1;TRBJ2-7;CSAISGTDLREQYF,1,1,c1045
20,c1038,;;,TRBV20-1;TRBJ2-7;CSAISGTDLREQYF,;;|TRBV20-1;TRBJ2-7;CSAISGTDLREQYF,1,1,c0
23,c1041,;;,TRBV20-1;TRBJ2-7;CSAISGTDLREQYF,;;|TRBV20-1;TRBJ2-7;CSAISGTDLREQYF,1,1,c1038
...,...,...,...,...,...,...,...
3068,c984,;;,TRBV6-2;TRBJ2-1;CASSWDHNEQFF,;;|TRBV6-2;TRBJ2-1;CASSWDHNEQFF,1,1,c980
3068,c984,;;,TRBV6-2;TRBJ2-1;CASSWDHNEQFF,;;|TRBV6-2;TRBJ2-1;CASSWDHNEQFF,1,1,c982
3068,c984,;;,TRBV6-2;TRBJ2-1;CASSWDHNEQFF,;;|TRBV6-2;TRBJ2-1;CASSWDHNEQFF,1,1,c983
3072,c991,;;,TRBV30;TRBJ1-1;CAWSAYSESAEAFF,;;|TRBV30;TRBJ1-1;CAWSAYSESAEAFF,1,1,c992


In [53]:
tmp = pd.DataFrame(columns=['clonotype','pairs','clones'])

In [54]:
tmp['clonotype'] = ['c0']

In [55]:
# Count matches of clonotype 0 with the true clonotypes + itself
tmp['pairs'] =  [c10.chains.isin(c00.chains).astype(int).to_list() + [0]] #[c10.chain_b.isin(c00.chain_b).astype(int).to_list() + [0]]

In [56]:
# List the true clonotypes
tmp['clones'] = [c10.clonotype.to_list() + ['c0']]

In [57]:
tmp

Unnamed: 0,clonotype,pairs,clones
0,c0,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[c1, c10, c100, c1000, c1005, c1006, c101, c10..."


In [58]:
c = pd.concat([c10,tmp], ignore_index=True)

In [59]:
m = c.explode(['pairs','clones'])
m

Unnamed: 0,clonotype,chain_a,chain_b,chains,gem,pairs,clones
0,c1,TRAV30;TRAJ38;CGTEGAGNNRKLIW,TRBV3-1;TRBJ1-5;CASSQVFGTPTPQHF,TRAV30;TRAJ38;CGTEGAGNNRKLIW|TRBV3-1;TRBJ1-5;C...,255.0,0,c1
0,c1,TRAV30;TRAJ38;CGTEGAGNNRKLIW,TRBV3-1;TRBJ1-5;CASSQVFGTPTPQHF,TRAV30;TRAJ38;CGTEGAGNNRKLIW|TRBV3-1;TRBJ1-5;C...,255.0,0,c10
0,c1,TRAV30;TRAJ38;CGTEGAGNNRKLIW,TRBV3-1;TRBJ1-5;CASSQVFGTPTPQHF,TRAV30;TRAJ38;CGTEGAGNNRKLIW|TRBV3-1;TRBJ1-5;C...,255.0,0,c100
0,c1,TRAV30;TRAJ38;CGTEGAGNNRKLIW,TRBV3-1;TRBJ1-5;CASSQVFGTPTPQHF,TRAV30;TRAJ38;CGTEGAGNNRKLIW|TRBV3-1;TRBJ1-5;C...,255.0,0,c1000
0,c1,TRAV30;TRAJ38;CGTEGAGNNRKLIW,TRBV3-1;TRBJ1-5;CASSQVFGTPTPQHF,TRAV30;TRAJ38;CGTEGAGNNRKLIW|TRBV3-1;TRBJ1-5;C...,255.0,0,c1005
...,...,...,...,...,...,...,...
3025,c0,,,,,0,c993
3025,c0,,,,,0,c996
3025,c0,,,,,0,c997
3025,c0,,,,,0,c998


In [61]:
m[(m.pairs > 0) & (m.clonotype.isin(['c10','c2471','c3001']))]

Unnamed: 0,clonotype,chain_a,chain_b,chains,gem,pairs,clones
1,c10,TRAV24;TRAJ37;CACSSSNTGKLIF,TRBV4-1;TRBJ1-2;CASSQDRLTGGYTF,TRAV24;TRAJ37;CACSSSNTGKLIF|TRBV4-1;TRBJ1-2;CA...,37.0,1,c0
830,c2471,;;,TRBV4-1;TRBJ1-2;CASSQDRLTGGYTF,;;|TRBV4-1;TRBJ1-2;CASSQDRLTGGYTF,1.0,1,c0


In [43]:
df.loc[df.clonotype.isin(['clonotype10','clonotype2471','clonotype3001']), ['clonotype','ct','genes_lst_TRA','genes_lst_TRB','cdr3_TRA','cdr3_TRB']]

Unnamed: 0,clonotype,ct,genes_lst_TRA,genes_lst_TRB,cdr3_TRA,cdr3_TRB
363,clonotype10,10.0,['TRAV24;TRAJ37;TRAC'],['TRBV4-1;;TRBJ1-2;TRBC1'],CACSSSNTGKLIF,CASSQDRLTGGYTF
448,clonotype10,10.0,['TRAV24;TRAJ37;TRAC'],['TRBV4-1;;TRBJ1-2;TRBC1'],CACSSSNTGKLIF,CASSQDRLTGGYTF
762,clonotype10,10.0,['TRAV24;TRAJ37;TRAC'],['TRBV4-1;;TRBJ1-2;TRBC1'],CACSSSNTGKLIF,CASSQDRLTGGYTF
901,clonotype10,10.0,['TRAV24;TRAJ37;TRAC'],['TRBV4-1;;TRBJ1-2;TRBC1'],CACSSSNTGKLIF,CASSQDRLTGGYTF
917,clonotype10,10.0,['TRAV24;TRAJ37;TRAC'],['TRBV4-1;;TRBJ1-2;TRBC1'],CACSSSNTGKLIF,CASSQDRLTGGYTF
...,...,...,...,...,...,...
6978,clonotype10,10.0,,['TRBV4-1;;TRBJ1-2;TRBC1'],,CASSQDRLTGGYTF
7006,clonotype10,10.0,,['TRBV4-1;;TRBJ1-2;TRBC1'],,CASSQDRLTGGYTF
7030,clonotype10,10.0,,['TRBV4-1;;TRBJ1-2;TRBC1'],,CASSQDRLTGGYTF
7057,clonotype10,10.0,,['TRBV4-1;;TRBJ1-2;TRBC1'],,CASSQDRLTGGYTF


In [42]:
df.loc[df.clonotype.isin(['clonotype10','clonotype2471','clonotype3001']), ['clonotype','ct','genes_lst_TRA','genes_lst_TRB','cdr3_TRA','cdr3_TRB']]

Unnamed: 0,clonotype,ct,genes_lst_TRA,genes_lst_TRB,cdr3_TRA,cdr3_TRB
4251,clonotype3001,3001.0,['TRAV24;TRAJ37;TRAC'],['TRBV4-1;;TRBJ1-2;TRBC1'],CARSSSNTGKLIF,CASSQDRLTGGYTF
6443,clonotype2471,2471.0,,['TRBV4-1;;TRBJ1-2;TRBC1'],,CASSQDRLTGGYTF


In [29]:
out = c.explode(['pairs','clones']).pivot(index='clonotype',columns='clones', values='pairs').replace(0, np.nan).reset_index().rename(columns={'clonotype':'from'})

In [316]:
#out.to_csv('arc_data.b.csv', index=False)

In [34]:
out.dropna(how='all')

clones,from,c0,c1,c10,c100,c1000,c1005,c1006,c101,c1011,...,c984,c987,c988,c99,c991,c992,c993,c996,c997,c998
0,c0,,1.0,1.0,,,,,,,...,,,,,,,,,,
1,c1,1.0,,,,,,,,,...,,,,,,,,,,
2,c10,1.0,,,,,,,,,...,,,,,,,,,,
3,c100,,,,,,,,,,...,,,,,,,,,,
4,c1000,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3021,c992,,,,,,,,,,...,,,,,1.0,,,,,
3022,c993,,,,,,,,,,...,,,,,,,,,,
3023,c996,,,,,,,,,,...,,,,,,,,,,
3024,c997,,,,,,,,,,...,,,,,,,,,,
