In [1]:
import pandas as pd
import numpy as np
from scipy.io import mmread
import itertools
from functools import reduce

In [27]:
# ## Functions
def annotate_count(df, var):
    """Return, for every row, the length of the entry stored in column ``var``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to measure.
    var : str
        Name of a column whose entries support ``len`` (the original hint
        names list columns such as ``umi_lst`` / ``read_lst``).

    Returns
    -------
    pandas.Series
        ``len(entry)`` per row, aligned with ``df``'s index.
    """
    entries = df[var]
    return entries.apply(len)

def annotate_lst(df, var): # template_id, epitope, peptide, peptide_HLA, HLA
    """Collect all values of ``var`` per GEM and broadcast the list to each row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``gem`` column and the column ``var``.
    var : str
        Column whose values are gathered per ``gem`` (row order is kept).

    Returns
    -------
    pandas.Series
        For every row of ``df``, the list of ``var`` values found in that
        row's GEM. When every entry of ``var`` is itself a ``list``, runs of
        consecutive identical entries are collapsed to a single entry.
    """
    per_gem = df.groupby('gem')[var].apply(list)

    # Inspect only the requested column. Type-mapping the whole frame with
    # DataFrame.applymap touches every cell and is deprecated in recent pandas.
    if (df[var].map(type) == list).all():
        # itertools.groupby merges *consecutive* equal items only.
        per_gem = per_gem.apply(lambda values: [k for k, _ in itertools.groupby(values)])

    return df.gem.map(per_gem)

def annotate_pool(df):
    """Return, per row, the sorted unique HLA alleles seen in that row's GEM.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``gem`` and ``HLA`` columns. An ``HLA`` entry may be a
        list of alleles or a single allele string.

    Returns
    -------
    pandas.Series
        One ``numpy.ndarray`` (output of ``np.unique``) per row, holding the
        pooled alleles of all rows sharing the same ``gem``.
    """
    def pool(entries):
        alleles = []
        for entry in entries:
            if isinstance(entry, str):
                # A bare string is one allele; iterating it would split it
                # into single characters.
                alleles.append(entry)
            else:
                alleles.extend(entry)
        return np.unique(alleles)

    dct = df.groupby('gem').HLA.apply(pool)
    return df.gem.map(dct)

def annotate_delta_umi(df):
    """Compute a fold-change style score from each row's ``umi_count_lst``.

    The score is the last element of the list divided by the second-to-last
    element plus a pseudocount of 0.25. A one-element list is divided by the
    pseudocount alone, and an empty list scores 0. If the list is sorted in
    ascending order, this compares the largest count with the runner-up.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``umi_count_lst`` column of sequences of numbers.

    Returns
    -------
    pandas.Series
        One score per row.
    """
    pseudocount = 0.25  # keeps the denominator non-zero

    def ratio(counts):
        n_counts = len(counts)
        if n_counts == 0:
            return 0
        if n_counts == 1:
            return counts[-1] / pseudocount
        return (counts[-1]) / (counts[-2] + pseudocount)

    return df.umi_count_lst.apply(ratio)

def annotate_detected_response(df, responses=None):
    """Mark rows whose (peptide, barcode_cd8) pair is listed as a response.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``peptide`` and ``barcode_cd8`` columns.
    responses : pandas.DataFrame, optional
        Table with ``peptide`` and ``barcode_cd8`` columns listing detected
        responses. Defaults to the notebook-level ``response_df``.

    Returns
    -------
    pandas.Index
        Positionally aligned with ``df``: ``True`` where the pair occurs in
        ``responses``, NaN otherwise.
    """
    if responses is None:
        responses = response_df

    keys = ['peptide', 'barcode_cd8']
    # Build the lookup from the pairs themselves. The previous
    # groupby(...).apply(any) tested the truthiness of column labels rather
    # than data, and only worked while apply() still saw the grouping columns.
    # Rows with a missing key are skipped, as groupby did.
    listed = responses.dropna(subset=keys)
    dct = dict.fromkeys(zip(listed['peptide'], listed['barcode_cd8']), True)

    # NOTE(review): matching is by exact value. The barcode columns are cast
    # to str before merging elsewhere in this notebook, so confirm that
    # responses.barcode_cd8 has the same dtype, otherwise nothing will match.
    return df.set_index(keys).index.map(dct)

def annotate_peptide_assayed(df):
    """Flag rows whose peptide occurs in the notebook-level ``response_df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``peptide`` column.

    Returns
    -------
    numpy.ndarray
        Boolean array, one entry per row of ``df``: ``True`` when the row's
        peptide is among ``response_df.peptide``.
    """
    is_assayed = df.peptide.isin(response_df.peptide)
    return np.asarray(is_assayed, dtype=bool)

In [3]:
def get_likely_targets(row):
    """Return the entries of ``row.HLA_lst_mhc`` that also occur in ``row.HLA_cd8``.

    Parameters
    ----------
    row : object with ``HLA_cd8`` and ``HLA_lst_mhc`` attributes
        Typically one row of the merged table.

    Returns
    -------
    list or float
        The matching entries in their original order, duplicates kept.
        ``np.nan`` when either attribute is not a ``list`` or when nothing
        matches.
    """
    cd8_hlas = row.HLA_cd8
    mhc_hlas = row.HLA_lst_mhc

    # Missing annotations show up as non-list values (e.g. NaN).
    if type(cd8_hlas) is not list or type(mhc_hlas) is not list:
        return np.nan

    shared = [hla for hla in mhc_hlas if hla in cd8_hlas]
    if shared == []:
        return np.nan
    return shared

# Input

In [4]:
# The three files of the raw feature-barcode matrix share one directory.
MATRIX_DIR = '/Users/herpov/Documents/mnt/tuba_home/tcr-pmhc-sc-project/experiments/exp13/run2/tcr/cellranger_tot/outs/multi/count/raw_feature_bc_matrix'
BARCODES = MATRIX_DIR + '/barcodes.tsv.gz'
FEATURES = MATRIX_DIR + '/features.tsv.gz'
MATRIX = MATRIX_DIR + '/matrix.mtx.gz'

In [5]:
# Excel workbook with the barcode annotations; it is read one sheet at a time
# ('MHC', 'HSH', 'MRK') when the labels are loaded.
BARCODE_LABELS = (
    '/Users/herpov/Documents/mnt/tuba_home/tcr-pmhc-sc-project/experiments/exp13/run2/lib/'
    'barcode_specificity_annotations.xlsx'
)

In [6]:
# Excel sheet listing detected responses; its 'barcode_cd8' and 'peptide'
# columns are loaded into response_df.
response_annotations = (
    '/Users/herpov/Documents/mnt/tuba_home/tcr-pmhc-sc-project/experiments/exp13/run2/lib/'
    'detected_responses_annotation.xlsx'
)

# Load

In [7]:
# Only the two key columns are needed; response_df is read as a global by the
# annotate_detected_response / annotate_peptide_assayed helpers.
response_df = pd.read_excel(
    response_annotations,
    usecols=['barcode_cd8', 'peptide'],
)

In [8]:
# Load one annotation sheet per barcode type and derive the helper columns
# used further down.
labels = {}
for label, sheet in (('mhc', 'MHC'), ('hsh', 'HSH'), ('mrk', 'MRK')):
    sheet_df = pd.read_excel(BARCODE_LABELS, sheet_name=sheet)

    if label == 'mhc':
        # Keep only the part of the peptide entry before the first underscore.
        sheet_df['peptide'] = sheet_df.peptide.str.strip().str.split("_", expand=True)[0]
        sheet_df['peptide_HLA'] = sheet_df.peptide + ' ' + sheet_df.HLA
    elif label == 'hsh':
        # Missing alleles become empty strings so the three columns can be
        # concatenated; the empty pieces are dropped again after splitting.
        sheet_df.fillna('', inplace=True)
        joined_hla = sheet_df['HLA_A'] + ', ' + sheet_df['HLA_B'] + ', ' + sheet_df['HLA_C']
        sheet_df['HLA'] = joined_hla.str.split(r',\s?').apply(
            lambda alleles: [allele for allele in alleles if allele != '']
        )
    # The 'mrk' sheet is used as read.

    labels[label] = sheet_df

In [11]:
# g: barcode strings (up to 36 characters each), later used as the GEM labels.
g = np.loadtxt(
    BARCODES,
    dtype='U36',
)
# b: feature table without a header row; columns are named explicitly.
b = pd.read_csv(
    FEATURES,
    sep='\t',
    header=None,
    names=['barcode', 'name', 'feature'],
)
# m: the count matrix in Matrix Market format.
m = mmread(MATRIX)

In [13]:
# Dense table with one row per entry of g and one column per feature barcode.
# The matrix is stored features x g, so it is transposed before labelling.
wide_df = pd.DataFrame(
    m.toarray().T,
    index=g,
    columns=b.barcode,
)

# Prep

In [14]:
# Zero counts become NaN so they can be dropped once the table is in long form.
wide_df = wide_df.replace(0, np.nan)

In [16]:
# Wide -> long: one row per (row label, column) pair. ignore_index=False keeps
# the original row labels as the index.
long_df = pd.melt(
    wide_df,
    value_vars=wide_df.columns,
    ignore_index=False,
)

In [17]:
# Move the row labels into a 'gem' column, name the melted values 'umi_count'
# and drop the rows that have no value.
long_df = (
    long_df
    .reset_index()
    .rename(columns={'index':'gem','value':'umi_count'})
    .dropna()
)

Unnamed: 0,gem,barcode,umi_count
55,AAACCTGAGAGCAATT-1,15bp_frac_1,1.0
225,AAACCTGAGGCATGGT-1,15bp_frac_1,1.0
272,AAACCTGAGGTGCTAG-1,15bp_frac_1,1.0
591,AAACCTGCAGCCAGAA-1,15bp_frac_1,19.0
668,AAACCTGCATAGACTC-1,15bp_frac_1,1.0
...,...,...,...
16643208,TTTGTCATCTTGTTTG-1,260,1.0
16643210,TTTGTCATCTTTACGT-1,260,20.0
16643211,TTTGTCATCTTTAGGG-1,260,2.0
16643212,TTTGTCATCTTTAGTC-1,260,16.0


In [32]:
# Build one annotated, one-row-per-GEM table per barcode type, then join them.
dfs = {}

# Columns that receive a per-GEM "<column>_lst" companion, per barcode type.
# The order here is the order in which the new columns are appended.
lst_columns = {
    'mhc': ('epitope', 'peptide', 'peptide_HLA', 'HLA'),
    'hsh': ('sample_id', 'HLA'),
    'mrk': ('marker',),
}

long_df['barcode'] = long_df['barcode'].astype(str)
for key, frame in labels.items():
    # Merge keys must have the same dtype on both sides.
    frame['barcode'] = frame['barcode'].astype(str)

    # Ascending sort: within a GEM the highest UMI count ends up last.
    df = long_df.merge(frame, on='barcode').sort_values(by=['gem', 'umi_count'])

    df['umi_count_lst'] = annotate_lst(df, 'umi_count')
    df['delta_umi'] = annotate_delta_umi(df)
    df['brc_count'] = annotate_count(df, 'umi_count_lst')
    df['single_barcode'] = df.brc_count == 1
    df['multiplets'] = df.brc_count > 1
    df['template_lst'] = annotate_lst(df, 'barcode')  # barcode == template_id

    for column in lst_columns.get(key, ()):
        df[column + '_lst'] = annotate_lst(df, column)

    if key in ('mhc', 'hsh'):
        df['HLA_pool'] = annotate_pool(df)  # not really necessary for mhc

    if key == 'mrk':
        # Hack: suffix everything except the join key and the marker columns,
        # because the merge suffixes below only apply to clashing names.
        df.columns = [
            name if name in ['gem', 'marker', 'marker_lst'] else name + '_mrk'
            for name in df.columns
        ]

    # Keep one row per GEM: the last one, i.e. the highest UMI count.
    dfs[key] = df.drop_duplicates(subset=['gem'], keep='last')

mhc_and_cd8 = dfs['mhc'].merge(dfs['hsh'], on='gem', how='outer', suffixes=('_mhc', '_cd8'))
df = mhc_and_cd8.merge(dfs['mrk'], on='gem', how='outer', suffixes=('', '_mrk'))
    

In [33]:
# ## Check that annotated peptide HLA matches CDX HLA annotation
def hla_matches_sample(row):
    """Return whether ``row.HLA_mhc`` is among ``row.HLA_cd8``, or NaN if unknown."""
    mhc_hla = row.HLA_mhc
    cd8_hlas = row.HLA_cd8
    # NaN is the only value that differs from itself, so the self-comparison
    # is False exactly when the pMHC HLA is missing.
    if (mhc_hla == mhc_hla) & (type(cd8_hlas) == list):
        return mhc_hla in cd8_hlas
    return np.nan

df['HLA_match'] = df.apply(hla_matches_sample, axis=1)
df['likely_HLA_mhc'] = df.apply(get_likely_targets, axis=1)

In [34]:
# detected_response: looked up from response_df by the (peptide, barcode_cd8) pair.
df['detected_response'] = annotate_detected_response(df)
# peptide_assayed: True when the row's peptide occurs in response_df.peptide.
df['peptide_assayed'] = annotate_peptide_assayed(df)

In [35]:
# Display the merged per-GEM table for a visual check.
df

Unnamed: 0,gem,barcode_mhc,umi_count_mhc,peptide,HLA_mhc,epitope,rank,comment_mhc,peptide_HLA,umi_count_lst_mhc,...,delta_umi_mrk,brc_count_mrk,single_barcode_mrk,multiplets_mrk,template_lst_mrk,marker_lst,HLA_match,likely_HLA_mhc,detected_response,peptide_assayed
0,AAACCTGAGAATTCCC-1,15bp_frac_5,1.0,RVRAYTYSK,A0301,v25,5.0,,RVRAYTYSK A0301,[1.0],...,,,,,,,True,[A0301],,False
1,AAACCTGAGACACTAA-1,15bp_frac_5,3.0,RVRAYTYSK,A0301,v25,5.0,,RVRAYTYSK A0301,[3.0],...,,,,,,,True,[A0301],,False
2,AAACCTGAGACAGGCT-1,15bp_frac_2,1.0,FLYALALLL,A0201,v10,4.0,,FLYALALLL A0201,[1.0],...,,,,,,,True,[A0201],,True
3,AAACCTGAGACATAAC-1,15bp_frac_5,2.0,RVRAYTYSK,A0301,v25,5.0,,RVRAYTYSK A0301,"[1.0, 1.0, 2.0]",...,,,,,,,True,"[A0201, A0301]",,False
4,AAACCTGAGAGCAATT-1,15bp_frac_1,1.0,CLGGLLTMV,A0201,v5,4.0,,CLGGLLTMV A0201,[1.0],...,,,,,,,True,[A0201],,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
722134,TTTGTCATCTTTACAC-1,,,,,,,,,,...,,,,,,,,,,False
722135,TTTGTCATCTTTACGT-1,,,,,,,,,,...,,,,,,,,,,False
722136,TTTGTCATCTTTAGGG-1,,,,,,,,,,...,,,,,,,,,,False
722137,TTTGTCATCTTTAGTC-1,,,,,,,,,,...,,,,,,,,,,False


In [37]:
# ## Write data
# Column selection and order for the exported table. The template_* and
# read_count_* columns are intentionally left out.
new_column_order = [
    'gem',
    # pMHC barcode statistics
    'umi_count_mhc', 'umi_count_lst_mhc', 'delta_umi_mhc',
    'single_barcode_mhc', 'multiplets_mhc',
    # cd8 (sample hashing) barcode statistics
    'umi_count_cd8', 'umi_count_lst_cd8', 'delta_umi_cd8',
    'single_barcode_cd8', 'multiplets_cd8',
    # response annotations
    'detected_response', 'peptide_assayed',
    # sample and HLA annotations
    'sample_id', 'sample_id_lst',
    'HLA_pool_cd8', 'HLA_lst_cd8', 'HLA_cd8',
    'HLA_match',
    'HLA_mhc', 'HLA_lst_mhc', 'likely_HLA_mhc',
    # peptide / epitope annotations
    'peptide', 'peptide_lst',
    'peptide_HLA', 'peptide_HLA_lst',
    'epitope', 'epitope_lst',
    'rank',
    # marker barcodes
    'marker', 'marker_lst',
    'umi_count_lst_mrk', 'delta_umi_mrk',
    'single_barcode_mrk', 'multiplets_mrk',
]  # + specificity_matrix.columns.to_list()

# Export is currently disabled; the selection is only displayed.
df[new_column_order]  # .to_csv(output, index=False)

Unnamed: 0,gem,umi_count_mhc,umi_count_lst_mhc,delta_umi_mhc,single_barcode_mhc,multiplets_mhc,umi_count_cd8,umi_count_lst_cd8,delta_umi_cd8,single_barcode_cd8,...,peptide_HLA_lst,epitope,epitope_lst,rank,marker,marker_lst,umi_count_lst_mrk,delta_umi_mrk,single_barcode_mrk,multiplets_mrk
0,AAACCTGAGAATTCCC-1,1.0,[1.0],4.0,True,False,2.0,[2.0],8.000000,True,...,[RVRAYTYSK A0301],v25,[v25],5.0,,,,,,
1,AAACCTGAGACACTAA-1,3.0,[3.0],12.0,True,False,24.0,"[1.0, 1.0, 3.0, 3.0, 4.0, 4.0, 5.0, 11.0, 15.0...",1.573770,False,...,[RVRAYTYSK A0301],v25,[v25],5.0,,,,,,
2,AAACCTGAGACAGGCT-1,1.0,[1.0],4.0,True,False,20.0,"[2.0, 2.0, 3.0, 4.0, 5.0, 5.0, 6.0, 9.0, 20.0]",2.162162,False,...,[FLYALALLL A0201],v10,[v10],4.0,,,,,,
3,AAACCTGAGACATAAC-1,2.0,"[1.0, 1.0, 2.0]",1.6,False,True,925.0,"[1.0, 2.0, 2.0, 4.0, 5.0, 6.0, 10.0, 925.0]",90.243902,False,...,"[YVLDHLIVV A0201, TPRVTGGGAM B0702, RVRAYTYSK ...",v25,"[v16, v14, v25]",5.0,,,,,,
4,AAACCTGAGAGCAATT-1,1.0,[1.0],4.0,True,False,7.0,"[1.0, 1.0, 1.0, 3.0, 3.0, 7.0]",2.153846,False,...,[CLGGLLTMV A0201],v5,[v5],4.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
722134,TTTGTCATCTTTACAC-1,,,,,,5.0,"[1.0, 2.0, 2.0, 2.0, 2.0, 5.0]",2.222222,False,...,,,,,,,,,,
722135,TTTGTCATCTTTACGT-1,,,,,,35.0,"[1.0, 2.0, 2.0, 3.0, 4.0, 4.0, 5.0, 20.0, 25.0...",1.386139,False,...,,,,,,,,,,
722136,TTTGTCATCTTTAGGG-1,,,,,,3.0,"[1.0, 2.0, 2.0, 3.0]",1.333333,False,...,,,,,,,,,,
722137,TTTGTCATCTTTAGTC-1,,,,,,16.0,"[2.0, 2.0, 3.0, 6.0, 7.0, 11.0, 11.0, 15.0, 16.0]",1.049180,False,...,,,,,,,,,,
