In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from ast import literal_eval
import seaborn as sns

In [2]:
import sys  
sys.path.insert(0, '../scripts')

from D_plot_specificity_matrix_utils import (peptide_per_clonotype_by_gem_size,
                                             multiple_peptides_per_gem_w_filtering,
                                             calc_binding_concordance,
                                             epitope_sorter_index,
                                             peptides_per_gem)

In [3]:
sns.set_style('ticks', {'axes.edgecolor': '0',  
                        'xtick.color': '0',
                        'ytick.color': '0'})

In [4]:
def HLA_cd8_converter(x):
    #define format of datetime
    return x.replace("[","").replace("]","").replace(",", "").replace("'","").split(" ")

def cdr3_lst_converter(x):
    #define format of datetime
    return x.replace("[","").replace("]","").replace("'","").split(" ")

def epitope_converter(x):
    #define format of datetime
    return [y for y in x.replace("[","").replace("]","").replace("\n","").split("'") if (y != '') & (y != ' ')]

def peptide_hla_converter(x):
    return re.findall("\w+\s{1}\w{1}\d+", x.replace("[","").replace("]","").replace("\n","").replace("'",""))

def literal_converter(val):
    # replace NaN with '' and perform literal eval on the rest
    return [] if val == '' else literal_eval(val)

converters = {'peptide_HLA_lst': peptide_hla_converter,
              'umi_count_lst_mhc': literal_eval,
              'umi_count_lst_TRA': literal_converter,'umi_count_lst_TRB': literal_converter,
              'cdr3_lst_TRA': cdr3_lst_converter,
              'cdr3_lst_TRB': cdr3_lst_converter,
              'HLA_lst_mhc': cdr3_lst_converter,'HLA_cd8': HLA_cd8_converter} #

In [5]:
def notnan(x):
    return x == x

In [6]:
def get_multiplets(df):
    #tmp = df[idx1 & idx2]
    dct = df.groupby(['ct','peptide_HLA']).gem.count() > 1
    idx = df.set_index(['ct','peptide_HLA']).index.map(dct)
    return idx.fillna(False)

# Input

In [7]:
VALID = '../experiments/exp13/run3/cat/eval_clonotypes/valid_ct.csv'
#OS2 = '../experiments/exp13/run2/cat/eval_clonotypes/valid_ct.csv'

# Load

In [92]:
df = pd.read_csv(VALID, converters=converters)

In [93]:
df.fillna({'umi_count_mhc':0, 'delta_umi_mhc':0, 'umi_count_mhc_rel':0,
           'umi_count_cd8':0, 'delta_umi_cd8':0,
           'umi_count_TRA':0, 'delta_umi_TRA':0,
           'umi_count_TRB':0, 'delta_umi_TRB':0,
           'cdr3_TRA':'','cdr3_TRB':''}, inplace=True)

# Only on most abundant chain

In [94]:
df.num_clonotype = 'c' + df.num_clonotype.astype(int).astype(str)

In [95]:
df['chain_a'] = df.v_gene_TRA.fillna('') + ";" + df.j_gene_TRA.fillna('') + ";" + df.cdr3_TRA.fillna('')
df['chain_b'] = df.v_gene_TRB.fillna('') + ";" + df.j_gene_TRB.fillna('') + ";" + df.cdr3_TRB.fillna('')
df['chains'] = df.chain_a + '|' + df.chain_b

In [96]:
c10 = df[df.num_clonotype!='c0'].copy()
c10 = c10.groupby(['num_clonotype','chain_a','chain_b','chains']).gem.size().reset_index()
c10.drop_duplicates(subset='num_clonotype', keep='last', inplace=True)
c10.rename(columns={'num_clonotype':'clonotype'}, inplace=True)
c10.head()

Unnamed: 0,clonotype,chain_a,chain_b,chains,gem
2,c1,TRAV30;TRAJ38;CGTEGAGNNRKLIW,TRBV3-1;TRBJ1-5;CASSQVFGTPTPQHF,TRAV30;TRAJ38;CGTEGAGNNRKLIW|TRBV3-1;TRBJ1-5;C...,252
3,c10,TRAV38-2/DV8;TRAJ54;CAYNQGAQKLVF,TRBV9;TRBJ1-3;CASSYPTGGTSGNTIYF,TRAV38-2/DV8;TRAJ54;CAYNQGAQKLVF|TRBV9;TRBJ1-3...,57
4,c100,TRAV17;TRAJ44;CATVASKLTF,TRBV7-9;TRBJ2-1;CASTPTGLGVDEQFF,TRAV17;TRAJ44;CATVASKLTF|TRBV7-9;TRBJ2-1;CASTP...,4
5,c1000,;;,TRBV27;TRBJ2-3;CASSFGPLTDTQYF,;;|TRBV27;TRBJ2-3;CASSFGPLTDTQYF,1
6,c1001,;;,TRBV5-6;TRBJ1-2;CASSSLVMGYGYTF,;;|TRBV5-6;TRBJ1-2;CASSSLVMGYGYTF,1


In [97]:
c00 = (df
       .dropna(subset=['genes_TRA','genes_TRB'])
       .groupby(['gem','chain_a','chain_b','chains'])
       .size().reset_index().rename(columns={'gem':'clonotype',0:'gem'}))
c00.head()

Unnamed: 0,clonotype,chain_a,chain_b,chains,gem
0,AAACCTGAGCCCAGCT-1,TRAV8-3;TRAJ10;CAVGVRGGGNKLTF,TRBV6-5;TRBJ1-2;CASSVTGPPREDGYTF,TRAV8-3;TRAJ10;CAVGVRGGGNKLTF|TRBV6-5;TRBJ1-2;...,1
1,AAACCTGAGTCAATAG-1,TRAV17;TRAJ38;CATFNAGNNRKLIW,TRBV19;TRBJ2-7;CASSLVAGGHEQYF,TRAV17;TRAJ38;CATFNAGNNRKLIW|TRBV19;TRBJ2-7;CA...,1
2,AAACCTGCAATCCGAT-1,TRAV1-1;TRAJ10;CAVRAITGGGNKLTF,TRBV5-1;TRBJ2-5;CASSTPSSGPQETQYF,TRAV1-1;TRAJ10;CAVRAITGGGNKLTF|TRBV5-1;TRBJ2-5...,1
3,AAACCTGCAGCCAGAA-1,TRAV21;TRAJ33;CAVLMDSNYQLIW,TRBV10-2;TRBJ1-1;CASSADGMNTEAFF,TRAV21;TRAJ33;CAVLMDSNYQLIW|TRBV10-2;TRBJ1-1;C...,1
4,AAACCTGCATGCCACG-1,TRAV17;TRAJ12;CATVVRMDSSYKLIF,TRBV7-9;TRBJ2-1;CASSLIGQGKKDEQFF,TRAV17;TRAJ12;CATVVRMDSSYKLIF|TRBV7-9;TRBJ2-1;...,1


In [98]:
#c00 = (df[df.num_clonotype=='c0']
#       .groupby(['num_clonotype','chain_a','chain_b','chains']).gem.size()
#       .reset_index()
#       .rename(columns={'num_clonotype':'clonotype'}))
#c00

In [99]:
#c10 = pd.concat([c10,c00], ignore_index=True)

In [100]:
def get_alpha_pairs(row):
    return ((c10.chain_a.isin([row.chain_a]) &
             (c10.clonotype != row.clonotype)).astype(int).to_list() +
            [int(any(c00.chain_a.isin([row.chain_a])))])

In [101]:
def get_beta_pairs(row):
    return ((c10.chain_b.isin([row.chain_b]) &
             (c10.clonotype != row.clonotype)).astype(int).to_list() +
            [int(any(c00.chain_b.isin([row.chain_b])))])

In [102]:
def get_pairs(row):
    return ((c10.chains.isin([row.chains]) &
             (c10.clonotype != row.clonotype)).astype(int).to_list() +
            [int(any(c00.chains.isin([row.chains])))])

In [103]:
#c10['pairs'] = c10.apply(lambda row: get_pairs(row), axis=1)
#c10['pairs'] = c10.apply(lambda row: get_alpha_pairs(row), axis=1)
c10['pairs'] = c10.apply(lambda row: get_beta_pairs(row), axis=1)

In [104]:
c10['clones'] = c10.apply(lambda row: c10.clonotype.to_list() + ['c0'], axis=1)

# Something new

In [105]:
tmp = pd.DataFrame(columns=['clonotype','pairs','clones'])

In [106]:
tmp['clonotype'] = ['c0']

In [107]:
# Count matches of clonotype 0 with the true clonotypes + itself
#tmp['pairs'] =  [c10.chains.isin(c00.chains).astype(int).to_list() + [0]]
#tmp['pairs'] = [c10.chain_a.isin(c00.chain_a).astype(int).to_list() + [0]]
tmp['pairs'] = [c10.chain_b.isin(c00.chain_b).astype(int).to_list() + [0]]

In [108]:
# List the true clonotypes
tmp['clones'] = [c10.clonotype.to_list() + ['c0']]

In [109]:
tmp

Unnamed: 0,clonotype,pairs,clones
0,c0,"[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...","[c1, c10, c100, c1000, c1001, c1002, c1003, c1..."


In [110]:
c = pd.concat([c10,tmp], ignore_index=True)

In [111]:
out = (c
       .explode(['pairs','clones'])
       .pivot(index='clonotype',columns='clones', values='pairs')
       .replace(0, np.nan)
       .reset_index()
       .rename(columns={'clonotype':'from'}))

In [112]:
#out.dropna(how='all').to_csv('arc_data_publication/arc_data.c.csv', index=False)
#out.dropna(how='all').to_csv('arc_data_publication/arc_data.a.csv', index=False)
out.dropna(how='all').to_csv('arc_data_publication/arc_data.b.csv', index=False)

In [113]:
out.dropna(how='all')

clones,from,c0,c1,c10,c100,c1000,c1001,c1002,c1003,c1004,...,c990,c991,c992,c993,c994,c995,c996,c997,c998,c999
0,c0,,1.0,1.0,1.0,,,,,,...,,1.0,,,,,,,,
1,c1,1.0,,,,,,,,,...,,,,,,,,,,
2,c10,1.0,,,,,,,,,...,,,,,,,,,,
3,c100,1.0,,,,,,,,,...,,,,,,,,,,
4,c1000,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2194,c995,,,,,,,,,,...,,,,,,,,,,
2195,c996,,,,,,,,,,...,,,,,,,,,,
2196,c997,,,,,,,,,,...,,,,,,,,,,
2197,c998,,,,,,,,,,...,,,,,,,,,,
