In [1]:
import numpy as np
import pandas as pd
from ast import literal_eval
import re

In [2]:
def cdr3_lst_converter(x):
    """Convert the text form of a space-separated list into a list of strings.

    Every '[', ']' and single-quote character is removed from ``x`` and the
    remainder is split on single space characters.  No other character is
    stripped (commas, for instance, stay attached to their token), and an
    empty string yields ``['']``.
    """
    # Delete the three bracket/quote characters in one pass.
    without_brackets = x.translate(str.maketrans("", "", "[]'"))
    return without_brackets.split(" ")

def peptide_hla_converter(x):
    """Extract '<token> <token>' pairs from the text form of a list.

    Every '[', ']' and single-quote character is removed from ``x``; the
    remainder is scanned left to right for non-overlapping matches of
    "non-whitespace run, exactly one whitespace character, non-whitespace
    run".  Each match is returned as one string.

    Returns an empty list when ``x`` contains no such pair (e.g. ``""``).
    """
    cleaned = x.replace("[", "").replace("]", "").replace("'", "")
    # Raw string: "\S" and "\s" are regex classes, not string escapes, and a
    # non-raw literal triggers an invalid-escape-sequence warning.
    return re.findall(r"\S+\s{1}\S+", cleaned)

def literal_converter(val):
    """Parse ``val`` as a Python literal, falling back to ``[0]``.

    ``ast.literal_eval`` does the parsing.  If it raises ``SyntaxError`` or
    ``ValueError`` (for example on an empty string or a bare word), a new
    list ``[0]`` is returned instead.  Any other exception propagates.
    """
    try:
        parsed = literal_eval(val)
    except (SyntaxError, ValueError):
        # Unparseable cell: substitute a single zero.
        return [0]
    return parsed

# Column name -> parsing function; passed as the `converters` argument of
# pd.read_csv so these columns are parsed while the file is read.
converters = dict(
    umi_count_lst_mhc=literal_converter,
    template_lst_mhc=cdr3_lst_converter,
    peptide_HLA_lst=peptide_hla_converter,
)

# Args

# Inputs

In [3]:
# Absolute path of the CSV that is read into `tcr_df`.
# NOTE(review): machine-specific mount point; adjust when running elsewhere.
TCR_FILE = (
    "/Volumes/tuba/herpov/tcr-pmhc-sc-project/data/"
    "exp3_TCR/processed/cellranger_out/TCR_VDJ/outs/"
    "all_contig_annotations.csv"
)

In [4]:
# Absolute path of the gzip-suffixed table that is read into `bc_df`.
# NOTE(review): machine-specific mount point; adjust when running elsewhere.
BC_FILE = (
    "/Volumes/tuba/herpov/tcr-pmhc-sc-project/data/"
    "exp3_MHC_IONTORRENT/mapping/KMA-1t1/output/"
    "mapping.clean.AKB.augmented.gz"
)

# Load

In [5]:
# Read the TCR table and rename its 'barcode' column to 'gem', the column
# name used by `bc_df` and by the pivots further down.
tcr_df = pd.read_csv(TCR_FILE).rename(columns={'barcode': 'gem'})

In [6]:
# Read the barcode table; the list-like columns named in `converters` are
# parsed into Python lists while the file is read.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-dtype columns.
# NOTE(review): the saved output of this cell looks like the tail of a pandas
# mixed-dtype warning -- confirm it is gone after re-running.
bc_df = pd.read_csv(BC_FILE, converters=converters, low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [7]:
# Display the loaded table as the cell output for a visual sanity check.
bc_df

Unnamed: 0,gem,template_id_mhc,template_lst_mhc,single_barcode_mhc,umi_count_mhc,umi_count_lst_mhc,read_count_mhc,read_count_lst_mhc,multiplets_mhc,delta_umi_mhc,...,v38,v39,v4,v40,v41,v5,v6,v7,v8,v9
0,AAACCTGAGGAACTGC-1,A1067B297_sample,[A1067B297_sample],True,4.0,[4],78.0,[78],1.0,100.0,...,,,,,,,,,,
1,AAACCTGAGGTCATCT-1,A1064B288_sample,"[A1065B288_sample, A1064B288_sample]",False,1.0,"[1, 1]",1.0,"[48, 1]",2.0,0.0,...,,,,,,,,,,1.0
2,AAACCTGAGTCATCCA-1,A1071B288_sample,[A1071B288_sample],True,3.0,[3],33.0,[33],1.0,100.0,...,,,,,,,,,,
3,AAACCTGAGTTCGATC-1,A1064B288_sample,[A1064B288_sample],True,2.0,[2],15.0,[15],1.0,100.0,...,,,,,,,,,,2.0
4,AAACCTGCAAGCCATT-1,A1071B292_sample,[A1071B292_sample],True,4.0,[4],56.0,[56],1.0,100.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7586,TTGACTTTCATGCTCC-1,,[],,,[0],,,,,...,,,,,,,,,,
7587,TTGCGTCTCTCTAAGG-1,,[],,,[0],,,,,...,,,,,,,,,,
7588,TTTATGCAGCCTCGTG-1,,[],,,[0],,,,,...,,,,,,,,,,
7589,TTTGGTTTCTATCCTA-1,,[],,,[0],,,,,...,,,,,,,,,,


In [8]:
# Spot-check the three converter-parsed columns for a single gem.
bc_df.loc[
    bc_df['gem'] == 'TTTGTCAGTCTAGAGG-1',
    ['template_lst_mhc', 'umi_count_lst_mhc', 'peptide_HLA_lst'],
]

Unnamed: 0,template_lst_mhc,umi_count_lst_mhc,peptide_HLA_lst
6897,"[A1070B302_sample, A1068B293_sample]","[1, 5]","[p1.a1 p*B0801, RLVVAVEEA A0201]"


# Main

In [9]:
def flatten_column(df):
    '''
    Expand the list-valued columns of df into one row per list element.

    For every row of df, one output row is produced per element of that
    row's 'peptide_HLA_lst' value, paired with the element at the same
    position of the row's 'umi_count_lst_mhc' value.  Rows whose
    'peptide_HLA_lst' is empty contribute no output rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'peptide_HLA_lst' and 'umi_count_lst_mhc',
        each holding an indexable sequence per row.

    Returns
    -------
    pandas.DataFrame
        Columns 'peptide_HLA_lst' and 'umi_count_lst_mhc'.  The index is
        named 'original_index' and repeats the index label of the df row
        each element came from.

    Raises
    ------
    IndexError
        If a row's 'umi_count_lst_mhc' is shorter than its 'peptide_HLA_lst'.
    '''
    # Read from the `df` argument (not a notebook global) and iterate the two
    # columns side by side; this also avoids Series.iteritems(), which was
    # removed in pandas 2.0.
    records = [
        [row_label, peptides[pos], umis[pos]]
        for row_label, peptides, umis in zip(df.index,
                                             df['peptide_HLA_lst'],
                                             df['umi_count_lst_mhc'])
        for pos in range(len(peptides))
    ]
    column_flat = pd.DataFrame(
        records,
        columns=['original_index', 'peptide_HLA_lst', 'umi_count_lst_mhc'])

    return column_flat.set_index('original_index')

In [10]:
# One row per (original bc_df row, list element); the index carries the
# originating bc_df row label so the result can be joined back onto bc_df.
flat_df = flatten_column(bc_df)
# Show the flattened table as the cell output.
flat_df

Unnamed: 0_level_0,peptide_HLA_lst,umi_count_lst_mhc
original_index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,KTWGQYWQV A0201,4
1,VTEHDTLLY A0101,1
1,YSEHPTFTSQY A0101,1
2,NLVPMVATV A0201,3
3,YSEHPTFTSQY A0101,2
...,...,...
6896,YSEHPTFTSQY A0101,1
6897,p1.a1 p*B0801,1
6897,RLVVAVEEA A0201,5
6898,NLVPMVATV A0201,1


In [11]:
# Attach each flattened row to the gem of the bc_df row it came from
# (index-on-index inner join; bc_df rows with no flattened rows drop out).
bc_df_sub = bc_df['gem'].to_frame().merge(flat_df, left_index=True, right_index=True)

In [12]:
# Reshape to wide form: one row per gem, one column per distinct
# 'peptide_HLA_lst' value, cells holding the matching 'umi_count_lst_mhc'.
# DataFrame.pivot raises ValueError if a (gem, peptide_HLA_lst) pair occurs
# more than once; combinations that never occur become NaN.
peptide_hla = bc_df_sub.pivot(index='gem', columns='peptide_HLA_lst', values='umi_count_lst_mhc')

In [13]:
def _gene_umi_matrix(contigs, gene_column):
    """Return a gem x gene table of mean 'umis' for one gene column.

    Parameters
    ----------
    contigs : pandas.DataFrame
        Must contain the columns 'gem', 'umis' and ``gene_column``.
    gene_column : str
        Name of the column whose distinct values become the output columns.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'gem'; each cell is the mean of 'umis' over the rows with
        that gem and gene value, or 0 where no such row exists.  A column
        labelled 'None' is removed when present.
    """
    return (contigs
            # 'mean' (the string) selects the same aggregation as np.mean
            # without the callable-deprecation warning of recent pandas.
            .pivot_table(index='gem', columns=gene_column, values='umis',
                         aggfunc='mean', fill_value=0)
            # errors='ignore': do not fail when no 'None' column exists.
            .drop(columns='None', errors='ignore'))


v_gene = _gene_umi_matrix(tcr_df, 'v_gene')
d_gene = _gene_umi_matrix(tcr_df, 'd_gene')
j_gene = _gene_umi_matrix(tcr_df, 'j_gene')
c_gene = _gene_umi_matrix(tcr_df, 'c_gene')

In [14]:
# Outer-join the peptide-HLA table with the four gene tables on 'gem', one
# table at a time, so every gem present in any of them gets a row.
count_matrix = peptide_hla.merge(v_gene, on='gem', how='outer')
count_matrix = count_matrix.merge(d_gene, on='gem', how='outer')
count_matrix = count_matrix.merge(j_gene, on='gem', how='outer')
count_matrix = count_matrix.merge(c_gene, on='gem', how='outer')

# Cells with no observation in the contributing table become 0.
count_matrix = count_matrix.fillna(0)

In [15]:
# Display the combined matrix as the cell output.
count_matrix

Unnamed: 0_level_0,A0101 GPISGHVLK,A0101 RLLASLQDL,A0201 NLVPMVATV,A0201 QIDVSQFGSY,A0201 RAKFKQLL,A0201 RLRAEAQVK,A0201 SLAAYIPRL,A0201 VLEETSVML,A0201 VTEHDTLLY,A0201 YSEHPTFTSQY,...,TRBJ2-2P,TRBJ2-3,TRBJ2-4,TRBJ2-5,TRBJ2-6,TRBJ2-7,TRGJP1,TRAC,TRBC1,TRBC2
gem,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACCTGAGGAACTGC-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAACCTGAGGTCATCT-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
AAACCTGAGTCATCCA-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAACCTGAGTTCGATC-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,5.0,0.0,4.0,0.0,5.0
AAACCTGCAAGCCATT-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCATCGGAGGTA-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0
TTTGTCATCTACTATC-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
TTTGTCATCTACTCAT-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
TTTGTCATCTGCGGCA-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,6.0,0.0


In [16]:
# Write the matrix to 'count_matrix.csv.gz' (relative path, so it lands in the
# current working directory); index=True writes the 'gem' index as a column.
# NOTE(review): pandas infers gzip compression from the '.gz' suffix by
# default -- confirm the output is actually compressed.
count_matrix.to_csv('count_matrix.csv.gz', index=True)

In [17]:
# Sanity check: evaluates to True if any row still contains a NaN.
# Expected to be False because the matrix was filled with 0 beforehand.
count_matrix.isna().any(axis=1).any()

False