# NB! This notebook is now a Python script

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Use matplotlib's 'ggplot' style sheet for any figures drawn in this notebook.
plt.style.use('ggplot')

# Args

In [3]:
# Experiment identifier and sequencing platform; both are spliced into the data paths below.
EXP = "exp3"
PLATFORM = "IONTORRENT"

In [4]:
# Aligner whose output is loaded, and barcode system whose reads are kept.
MAPPING = 'KMA' # alternative: 'BLAST'
BARCODE_SYSTEM = 'AKB' # alternative: '10x'

In [5]:
# Pick the regexes that belong to the selected barcode system.
#   BARCODE_SYSTEM_REGEX - applied to the `barcode` column to keep reads of this system.
#   ANTIBODY_REGEX       - applied to `template_id` to split reads into cd8_df and mhc_df.
# Raw strings keep `\d` a regex digit class instead of an invalid string escape.
if BARCODE_SYSTEM == '10x':
    # Negative lookahead: accept any barcode that does NOT contain an A<digits>B<digits> pattern.
    BARCODE_SYSTEM_REGEX = r"^(?!.*A\d+B\d+).*$"
    ANTIBODY_REGEX = "HASH"
elif BARCODE_SYSTEM == 'AKB':
    BARCODE_SYSTEM_REGEX = r"^A\d+B\d+"
    ANTIBODY_REGEX = "A4000"
else:
    # Fail here rather than with a NameError in a later cell.
    raise ValueError("Unknown BARCODE_SYSTEM: %r (expected '10x' or 'AKB')" % BARCODE_SYSTEM)

## Input data

In [6]:
# Resolve the mapping table produced by the selected aligner.
if MAPPING == 'KMA':
    map_file = "/Volumes/tuba/herpov/tcr-pmhc-sc-project/data/" + EXP + "_MHC_" + PLATFORM + "/mapping/KMA-1t1/output/mapping.clean.gz"
elif MAPPING == 'BLAST':
    map_file = "/Volumes/tuba/herpov/tcr-pmhc-sc-project/data/" + EXP + "_MHC_" + PLATFORM + "/mapping/blast/blast.annotated.clean.tsv"
else:
    # Without this branch an unsupported value would leave map_file undefined.
    raise ValueError("Unknown MAPPING: %r (expected 'KMA' or 'BLAST')" % MAPPING)

NB! We won't always have response data?!

In [7]:
# Annotation inputs. The barcode-library paths are built from EXP and PLATFORM like the
# mapping paths, instead of hard-coding "exp3" / "IONTORRENT".
# NOTE(review): confirm that every experiment ships its own barcode_library directory.
specificity_annotations = "/Volumes/tuba/herpov/tcr-pmhc-sc-project/data/" + EXP + "_MHC_" + PLATFORM + "/barcode_library/barcode_specificity_annotations.tab"
response_annotations = "/Volumes/tuba/herpov/tcr-pmhc-sc-project/data/" + EXP + "_MHC_" + PLATFORM + "/barcode_library/detected_responses_annotation.xlsx"
# UMI table; this path does not depend on EXP or PLATFORM.
umi_annotations = "/Volumes/tuba/kamilla/10x-barcoding/results/kma_parser_parallel.tsv"

## Output data

In [8]:
# Destination of the augmented table, placed next to the mapping input of the selected aligner.
if MAPPING == 'KMA':
    output = "/Volumes/tuba/herpov/tcr-pmhc-sc-project/data/" + EXP + "_MHC_" + PLATFORM + "/mapping/KMA-1t1/output/mapping.clean." + BARCODE_SYSTEM + ".augmented.gz"
elif MAPPING == 'BLAST':
    output = "/Volumes/tuba/herpov/tcr-pmhc-sc-project/data/" + EXP + "_MHC_" + PLATFORM + "/mapping/blast/blast.annotated.clean." + BARCODE_SYSTEM + ".augmented.tsv"
else:
    # Without this branch an unsupported value would leave output undefined.
    raise ValueError("Unknown MAPPING: %r (expected 'KMA' or 'BLAST')" % MAPPING)

In [8]:
# Echo the active experiment identifier.
EXP

'exp3'

# Import input

In [9]:
# Load the mapping table chosen above using pandas' default parsing options.
# NOTE(review): no `sep` is passed although the BLAST input is a .tsv - confirm its delimiter.
map_df = pd.read_csv(map_file)

In [10]:
# Load the barcode -> specificity table. The file's first line is skipped (presumably its own
# header - TODO confirm) and the four columns are named explicitly.
specificity_df = pd.read_csv(specificity_annotations, sep='\t', skiprows=1, names=['barcode', 'peptide', 'HLA', 'epitope'])

In [11]:
# Load the detected responses; only the CD8 barcode and peptide columns are read.
response_df = pd.read_excel(response_annotations, index_col=None, usecols=['barcode_cd8', 'peptide'])

In [12]:
# Load the per-read UMI halves and key them by `query_id` so they can be joined to map_df.
# Missing halves become empty strings, which then fail the length filter below.
umi_df = (
    pd.read_csv(umi_annotations, sep='\t', usecols=['read', 'A_N6', 'B_N6'])
    .rename(columns={'read': 'query_id'})
    .fillna('')
)

# Keep only reads where both UMI halves are exactly 6 characters long.
has_full_umi = (umi_df.A_N6.astype(str).str.len() == 6) & (umi_df.B_N6.astype(str).str.len() == 6)
# .copy() gives an independent frame, so adding the column below does not write to a slice
# of the unfiltered table (avoids pandas' chained-assignment warning).
umi_df = umi_df[has_full_umi].copy()

# The full UMI is the concatenation of the two halves.
umi_df['umi'] = umi_df.A_N6 + umi_df.B_N6

In [13]:
# Display the UMI table (only reads with two 6-character UMI halves remain).
umi_df

Unnamed: 0,query_id,A_N6,B_N6,umi
1,HBBAF:03366:00950,AGTGGA,TCGTTG,AGTGGATCGTTG
6,HBBAF:02685:00249,AGTGGA,TCGTTG,AGTGGATCGTTG
15,HBBAF:01635:00466,AGTGGA,TCGTTG,AGTGGATCGTTG
23,HBBAF:00443:01340,AGTGGA,TCGTTG,AGTGGATCGTTG
24,HBBAF:00382:01540,AGTGGA,TCGTTG,AGTGGATCGTTG
...,...,...,...,...
247875,HBBAF:00621:02112,CTAGGC,ATATAA,CTAGGCATATAA
247876,HBBAF:00373:00554,GAGTGT,ATATAA,GAGTGTATATAA
247878,HBBAF:02655:02122,GTCCTT,ACTATA,GTCCTTACTATA
247879,HBBAF:01027:02674,GTCCTT,ACTATA,GTCCTTACTATA


## Process mapping table

In [14]:
# Size of the mapping table as loaded: number of reads (rows) and distinct GEMs.
print("Reads: %i" % len(map_df))
print("GEMs: %i" % map_df['gem'].nunique(dropna=False))

Reads: 260419
GEMs: 8762


In [15]:
# Keep only credibly aligned reads whose barcode belongs to the selected barcode system.
# na=False makes reads with a missing barcode fail the filter explicitly, the same
# convention the template_id filters in this notebook use.
is_credible = map_df.credible_alignment == True  # elementwise comparison, not an identity test
in_barcode_system = map_df.barcode.str.contains(BARCODE_SYSTEM_REGEX, na=False)
map_df = map_df[is_credible & in_barcode_system]

In [16]:
# Reads and distinct GEMs left after the credibility / barcode-system filter.
print("Reads: %i" % len(map_df))
print("GEMs: %i" % map_df['gem'].nunique(dropna=False))

Reads: 245015
GEMs: 7876


In [17]:
# Attach the UMI of each read. The inner join drops reads that have no row in umi_df,
# i.e. reads without two 6-character UMI halves.
# NOTE(review): assumes query_id is unique in umi_df; duplicates would multiply reads here.
map_df = pd.merge(map_df, umi_df[['query_id', 'umi']], on='query_id', how='inner')

In [18]:
# Reads and distinct GEMs left after joining the UMI table.
print("Reads: %i" % len(map_df))
print("GEMs: %i" % map_df['gem'].nunique(dropna=False))

Reads: 227211
GEMs: 7591


In [19]:
# Number of reads with a missing barcode.
map_df.barcode.isna().sum()

0

In [20]:
# Number of reads with a missing UMI.
map_df.umi.isna().sum()

0

At this point I have a table with multiple lines per GEM: each line corresponds to the best annotated read. The annotations of the reads may agree on the same barcode or may disagree. Later I will count the number of reads/UMIs for each barcode and only present the barcode with the most reads/UMIs.

Reads are filtered so that only reads with a full-length UMI (12 bp) are represented. Thus approximately 18,000 reads were removed, of which 17,000 were removed due to the lack of a complete UMI.

## Partition into MHC and sample identifier barcodes

# Split the reads by template: templates matching ANTIBODY_REGEX go to cd8_df, all others to mhc_df.
# With na=False a missing template_id counts as "no match", so such reads land in mhc_df.
cd8_df = map_df[map_df.template_id.str.contains(ANTIBODY_REGEX, na = False)] 
mhc_df = map_df[~map_df.template_id.str.contains(ANTIBODY_REGEX, na = False)]

## Annotate specificities

# Attach the epitope of each barcode; how='left' keeps reads whose barcode has no annotation.
mhc_df = pd.merge(mhc_df, specificity_df[['barcode','epitope']], how='left', on='barcode')

## Annotate UMI counts

In [21]:
# Keep only the part of the peptide name that precedes the first underscore.
specificity_df['peptide'] = specificity_df.peptide.str.split("_", expand=True)[0]
# Combined "<peptide> <HLA>" label.
specificity_df['peptide_HLA'] = specificity_df.peptide + ' ' + specificity_df.HLA

In [22]:
# Display the specificity table with the cleaned peptide and the peptide_HLA label.
specificity_df

Unnamed: 0,barcode,peptide,HLA,epitope,peptide_HLA
0,A1064B288,YSEHPTFTSQY,A0101,v9,YSEHPTFTSQY A0101
1,A1065B288,VTEHDTLLY,A0101,v15,VTEHDTLLY A0101
2,A1066B288,VSDGGPNLY,A0101,v19,VSDGGPNLY A0101
3,A1067B288,GILGFVFTL,A0201,v3,GILGFVFTL A0201
4,A1068B288,CLGGLLTMV,A0201,v5,CLGGLLTMV A0201
...,...,...,...,...,...
115,A1067B302,p1.a1,p*A1101,0,p1.a1 p*A1101
116,A1068B302,p1.a1,p*B0702,0,p1.a1 p*B0702
117,A1069B302,p1.a1,p*A2402,0,p1.a1 p*A2402
118,A1070B302,p1.a1,p*B0801,0,p1.a1 p*B0801


In [24]:
def annotate_lst(var):
    """Give every read the unique `var` values of its (gem, template_id) group.

    Reads the module-level `map_df`. `var` is a column name of `map_df`
    (the call sites use 'umi' and 'query_id').

    Returns one entry per row of `map_df`, obtained by mapping the rows'
    (gem, template_id) keys through a {key: unique values} dict.

    Note: a function with this name but a (df, var) signature is defined
    later in this file.
    """
    group_keys = ['gem', 'template_id']
    unique_per_group = map_df.groupby(group_keys)[var].unique()
    row_keys = map_df.set_index(group_keys).index
    return row_keys.map(unique_per_group.to_dict())

def annotate_count(var):
    """Return the length of each entry in column `var` of the module-level `map_df`.

    `var` names a column holding sized values (the call sites use
    'umi_lst' and 'read_lst').

    Note: a function with this name but a (df, var) signature is defined
    later in this file.
    """
    return map_df[var].apply(len)

def annotate_template_lst(df):
    """Give every row the unique `template_id` values seen in its GEM.

    `df` needs the columns 'gem' and 'template_id'. Returns a Series aligned
    with `df` whose entries are the per-GEM arrays of unique template ids.
    """
    templates_per_gem = df.groupby(['gem'])['template_id'].unique()
    return df['gem'].map(templates_per_gem.to_dict())

def annotate_count_lst(df, var):
    """Give every row the list of `var` values of all templates in its GEM.

    One value is taken per (gem, template_id) pair - the first row of each
    pair in `df`'s current order - and the values are listed per GEM.
    `var` is a column name (the call sites use 'umi_count' and 'read_count').
    Returns a Series aligned with `df`.
    """
    one_row_per_template = df.drop_duplicates(subset=['gem', 'template_id'])
    counts_per_gem = one_row_per_template.groupby(['gem'])[var].apply(list)
    return df['gem'].map(counts_per_gem.to_dict())

def annotate_single_barcode(df):
    """Flag rows whose `umi_count_lst` entry holds exactly one count.

    Returns a boolean Series aligned with `df`.
    """
    def _has_one_entry(counts):
        return len(counts) == 1

    return df['umi_count_lst'].apply(_has_one_entry)

def annotate_specificities():
    """Left-join the module-level `mhc_df` with `specificity_df` on 'barcode'.

    Every row of `mhc_df` is kept; rows whose barcode is absent from
    `specificity_df` get missing values in the added columns.
    """
    return mhc_df.merge(specificity_df, how='left', on='barcode')

def annotate_epitope_lst(df):
    """Give every row the unique `epitope` values seen in its GEM.

    `df` needs the columns 'gem' and 'epitope'. Returns a Series aligned
    with `df`.
    """
    epitopes_per_gem = df.groupby(['gem'])['epitope'].unique()
    return df['gem'].map(epitopes_per_gem.to_dict())

def annotate_specificity_lst(df):
    """Give every row the unique `peptide_HLA` labels seen in its GEM.

    `df` needs the columns 'gem' and 'peptide_HLA'. Returns a Series aligned
    with `df`.
    """
    labels_per_gem = df.groupby(['gem'])['peptide_HLA'].unique()
    return df['gem'].map(labels_per_gem.to_dict())

In [23]:
# ## Functions
def annotate_lst_per_template(var):
    """Give every read the unique `var` values of its (gem, template_id) group.

    Reads the module-level `map_df`. `var` is a column name of `map_df`
    (e.g. 'umi' or 'query_id'). Returns one entry per row of `map_df`,
    obtained by mapping the rows' (gem, template_id) keys through a
    {key: unique values} dict.
    """
    group_keys = ['gem', 'template_id']
    unique_per_group = map_df.groupby(group_keys)[var].unique()
    row_keys = map_df.set_index(group_keys).index
    return row_keys.map(unique_per_group.to_dict())

def annotate_count(df, var):
    """Return the length of each entry in column `var` of `df`.

    `var` names a column holding sized values (e.g. 'umi_lst', 'read_lst').
    The result is a Series aligned with `df`.
    """
    entries = df[var]
    return entries.apply(len)

#def annotate_count(var): #umi_lst, read_lst
#    return map_df[var].apply(lambda x: len(x))

def annotate_lst(df, var):
    """Give every row the unique `var` values seen in its GEM.

    `df` needs a 'gem' column plus the column named by `var`
    (e.g. 'template_id', 'epitope', 'peptide', 'peptide_HLA').
    Returns a Series aligned with `df`.
    """
    unique_per_gem = df.groupby('gem')[var].unique()
    return df['gem'].map(unique_per_gem.to_dict())

#def annotate_template_lst(df):
#    dct = df.groupby(['gem']).template_id.unique().to_dict()
#    return df.gem.map(dct)

def annotate_count_lst(df, var):
    """Give every row the list of `var` values of all templates in its GEM.

    One value is taken per (gem, template_id) pair - the first row of each
    pair in `df`'s current order - and the values are listed per GEM.
    `var` is a column name (e.g. 'umi_count', 'read_count').
    Returns a Series aligned with `df`.
    """
    first_row_per_template = df.drop_duplicates(subset=['gem', 'template_id'])
    values_per_gem = first_row_per_template.groupby(['gem'])[var].apply(list)
    return df['gem'].map(values_per_gem.to_dict())

def annotate_single_barcode(df):
    """Flag rows whose `umi_count_lst` entry holds exactly one count.

    Returns a boolean Series aligned with `df`.
    """
    return df['umi_count_lst'].apply(lambda counts: len(counts) == 1)

def annotate_specificities():
    """Left-join the module-level `mhc_df` with `specificity_df` on 'barcode'.

    Every row of `mhc_df` is kept; unmatched barcodes get missing values in
    the columns that come from `specificity_df`.
    """
    annotated = mhc_df.merge(specificity_df, how='left', on='barcode')
    return annotated

In [25]:
# ## Annotate barcode UMI counts
# Per read: the unique UMIs of its (gem, template_id) group and how many there are.
map_df['umi_lst'] = annotate_lst_per_template('umi')
map_df['umi_count'] = annotate_count(map_df, 'umi_lst')

# Split by template: matches of ANTIBODY_REGEX are CD8 reads, everything else MHC reads.
# .copy() makes both frames independent of map_df so columns can be added safely.
cd8_df = map_df[map_df.template_id.str.contains(ANTIBODY_REGEX, na = False)].copy()
mhc_df = map_df[~map_df.template_id.str.contains(ANTIBODY_REGEX, na = False)].copy()

# Ascending sort: within each GEM the best-supported template ends up last.
mhc_df.sort_values(by=['gem','umi_count','score','alignment_length'], inplace=True)
cd8_df.sort_values(by=['gem','umi_count','score','alignment_length'], inplace=True)

# Per GEM: all templates observed.
cd8_df['template_lst'] = annotate_lst(cd8_df, 'template_id') #annotate_template_lst(cd8_df)
mhc_df['template_lst'] = annotate_lst(mhc_df, 'template_id') #annotate_template_lst(mhc_df)

# Per GEM: one UMI count per template.
cd8_df['umi_count_lst'] = annotate_count_lst(cd8_df, 'umi_count')
mhc_df['umi_count_lst'] = annotate_count_lst(mhc_df, 'umi_count')

# Join the specificity annotations onto the MHC reads, then list them per GEM.
mhc_df = annotate_specificities()
mhc_df['epitope_lst'] = annotate_lst(mhc_df, 'epitope') #annotate_epitope_lst(mhc_df)
mhc_df['peptide_lst'] = annotate_lst(mhc_df, 'peptide') #annotate_peptide_lst(mhc_df)
mhc_df['peptide_HLA_lst'] = annotate_lst(mhc_df, 'peptide_HLA') #annotate_specificity_lst(mhc_df)

# Spot check: show all MHC reads of one GEM.
mhc_df[mhc_df.gem == 'TTTGTCAGTCTAGAGG-1']

Unnamed: 0,uncertainty,score,t_alignment_start,t_alignment_end,template_id,read_header,query_id,gem,credible_alignment,alignment_length,...,umi_count,template_lst,umi_count_lst,peptide,HLA,epitope,peptide_HLA,epitope_lst,peptide_lst,peptide_HLA_lst
199231,1,58,40,135,A1070B302_sample,HBBAF:00842:01387 BX:Z:TTTGTCAGTCTAGAGG-1,HBBAF:00842:01387,TTTGTCAGTCTAGAGG-1,True,95,...,1,"[A1070B302_sample, A1068B293_sample]","[1, 5]",p1.a1,p*B0801,0,p1.a1 p*B0801,"[0, 40 (C16)neo37]","[p1.a1, RLVVAVEEA]","[p1.a1 p*B0801, RLVVAVEEA A0201]"
199232,1,63,40,137,A1070B302_sample,HBBAF:03621:01357 BX:Z:TTTGTCAGTCTAGAGG-1,HBBAF:03621:01357,TTTGTCAGTCTAGAGG-1,True,97,...,1,"[A1070B302_sample, A1068B293_sample]","[1, 5]",p1.a1,p*B0801,0,p1.a1 p*B0801,"[0, 40 (C16)neo37]","[p1.a1, RLVVAVEEA]","[p1.a1 p*B0801, RLVVAVEEA A0201]"
199233,1,63,40,137,A1070B302_sample,HBBAF:03023:01681 BX:Z:TTTGTCAGTCTAGAGG-1,HBBAF:03023:01681,TTTGTCAGTCTAGAGG-1,True,97,...,1,"[A1070B302_sample, A1068B293_sample]","[1, 5]",p1.a1,p*B0801,0,p1.a1 p*B0801,"[0, 40 (C16)neo37]","[p1.a1, RLVVAVEEA]","[p1.a1 p*B0801, RLVVAVEEA A0201]"
199234,1,63,40,137,A1070B302_sample,HBBAF:02916:02426 BX:Z:TTTGTCAGTCTAGAGG-1,HBBAF:02916:02426,TTTGTCAGTCTAGAGG-1,True,97,...,1,"[A1070B302_sample, A1068B293_sample]","[1, 5]",p1.a1,p*B0801,0,p1.a1 p*B0801,"[0, 40 (C16)neo37]","[p1.a1, RLVVAVEEA]","[p1.a1 p*B0801, RLVVAVEEA A0201]"
199235,1,63,40,137,A1070B302_sample,HBBAF:02447:00478 BX:Z:TTTGTCAGTCTAGAGG-1,HBBAF:02447:00478,TTTGTCAGTCTAGAGG-1,True,97,...,1,"[A1070B302_sample, A1068B293_sample]","[1, 5]",p1.a1,p*B0801,0,p1.a1 p*B0801,"[0, 40 (C16)neo37]","[p1.a1, RLVVAVEEA]","[p1.a1 p*B0801, RLVVAVEEA A0201]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199298,1,97,0,137,A1068B293_sample,HBBAF:00388:02054 BX:Z:TTTGTCAGTCTAGAGG-1,HBBAF:00388:02054,TTTGTCAGTCTAGAGG-1,True,137,...,5,"[A1070B302_sample, A1068B293_sample]","[1, 5]",RLVVAVEEA,A0201,40 (C16)neo37,RLVVAVEEA A0201,"[0, 40 (C16)neo37]","[p1.a1, RLVVAVEEA]","[p1.a1 p*B0801, RLVVAVEEA A0201]"
199299,1,97,0,137,A1068B293_sample,HBBAF:00374:00875 BX:Z:TTTGTCAGTCTAGAGG-1,HBBAF:00374:00875,TTTGTCAGTCTAGAGG-1,True,137,...,5,"[A1070B302_sample, A1068B293_sample]","[1, 5]",RLVVAVEEA,A0201,40 (C16)neo37,RLVVAVEEA A0201,"[0, 40 (C16)neo37]","[p1.a1, RLVVAVEEA]","[p1.a1 p*B0801, RLVVAVEEA A0201]"
199300,1,97,0,137,A1068B293_sample,HBBAF:00147:00727 BX:Z:TTTGTCAGTCTAGAGG-1,HBBAF:00147:00727,TTTGTCAGTCTAGAGG-1,True,137,...,5,"[A1070B302_sample, A1068B293_sample]","[1, 5]",RLVVAVEEA,A0201,40 (C16)neo37,RLVVAVEEA A0201,"[0, 40 (C16)neo37]","[p1.a1, RLVVAVEEA]","[p1.a1 p*B0801, RLVVAVEEA A0201]"
199301,1,100,0,137,A1068B293_sample,HBBAF:02391:02677 BX:Z:TTTGTCAGTCTAGAGG-1,HBBAF:02391:02677,TTTGTCAGTCTAGAGG-1,True,137,...,5,"[A1070B302_sample, A1068B293_sample]","[1, 5]",RLVVAVEEA,A0201,40 (C16)neo37,RLVVAVEEA A0201,"[0, 40 (C16)neo37]","[p1.a1, RLVVAVEEA]","[p1.a1 p*B0801, RLVVAVEEA A0201]"


In [52]:
# Annotate UMI and read support per (gem, template_id), split into CD8 / MHC reads and
# add the per-GEM summaries used by the merge cells below.
# The calls use the (df, var) helpers and annotate_lst_per_template, i.e. the definitions
# that are in effect when the notebook is run top to bottom; the one-argument
# annotate_lst / annotate_count variants are overridden by then and would raise TypeError.
map_df['umi_lst'] = annotate_lst_per_template('umi')
map_df['umi_count'] = annotate_count(map_df, 'umi_lst')

map_df['read_lst'] = annotate_lst_per_template('query_id')
map_df['read_count'] = annotate_count(map_df, 'read_lst')

# Templates matching ANTIBODY_REGEX are CD8 reads, everything else MHC reads.
# .copy() makes both frames independent of map_df so columns can be added safely.
cd8_df = map_df[map_df.template_id.str.contains(ANTIBODY_REGEX, na = False)].copy()
mhc_df = map_df[~map_df.template_id.str.contains(ANTIBODY_REGEX, na = False)].copy()

# Ascending sort: within each GEM the best-supported template ends up last, which is the
# row that drop_duplicates(..., keep='last') picks when collapsing to one row per GEM.
mhc_df.sort_values(by=['gem','umi_count','score','alignment_length'], inplace=True)
cd8_df.sort_values(by=['gem','umi_count','score','alignment_length'], inplace=True)

# Per GEM: all templates observed.
cd8_df['template_lst'] = annotate_lst(cd8_df, 'template_id')
mhc_df['template_lst'] = annotate_lst(mhc_df, 'template_id')

# Per GEM: one UMI count and one read count per template.
cd8_df['umi_count_lst'] = annotate_count_lst(cd8_df, 'umi_count')
mhc_df['umi_count_lst'] = annotate_count_lst(mhc_df, 'umi_count')

cd8_df['read_count_lst'] = annotate_count_lst(cd8_df, 'read_count')
mhc_df['read_count_lst'] = annotate_count_lst(mhc_df, 'read_count')

# True when only a single template was observed in the GEM.
cd8_df['single_barcode'] = annotate_single_barcode(cd8_df)
mhc_df['single_barcode'] = annotate_single_barcode(mhc_df)

# Join the specificity annotations onto the MHC reads (reads the global mhc_df),
# then list the epitopes and peptide-HLA labels per GEM.
mhc_df = annotate_specificities()
mhc_df['epitope_lst'] = annotate_lst(mhc_df, 'epitope')
mhc_df['peptide_HLA_lst'] = annotate_lst(mhc_df, 'peptide_HLA')

In [53]:
# Reads and distinct GEMs: all reads, then the MHC subset, then the CD8 subset.
print("Reads: %i" % len(map_df))
print("GEMs: %i" % map_df['gem'].nunique(dropna=False))

print("Reads: %i" % len(mhc_df))
print("GEMs: %i" % mhc_df['gem'].nunique(dropna=False))

print("Reads: %i" % len(cd8_df))
print("GEMs: %i" % cd8_df['gem'].nunique(dropna=False))

Reads: 227211
GEMs: 7591
Reads: 199334
GEMs: 6900
Reads: 27877
GEMs: 3434


In [54]:
# Sanity check with the counts printed above: MHC reads + CD8 reads add up to all reads.
227211 == (199334+27877)

True

In [55]:
# CD8 GEMs minus the GEMs that have no MHC reads (all GEMs - MHC GEMs), using the counts
# printed above; this is the number of GEMs that have both MHC and CD8 reads.
3434-(7591-6900)

2743

## Collapse into 1 GEM per line

In [54]:
# Collapse to one row per GEM. keep='last' takes the last row of each GEM in the current
# row order, which after the ascending sort on umi_count, score and alignment_length is
# the best-supported template.
unique_cd8_df = cd8_df.drop_duplicates(subset=['gem'], keep='last')
unique_mhc_df = mhc_df.drop_duplicates(subset=['gem'], keep='last')

In [55]:
# After collapsing, the row count must equal the number of distinct GEMs in each frame.
print("Reads: %i" % len(unique_mhc_df))
print("GEMs: %i" % unique_mhc_df['gem'].nunique(dropna=False))

print("Reads: %i" % len(unique_cd8_df))
print("GEMs: %i" % unique_cd8_df['gem'].nunique(dropna=False))

Reads: 6900
GEMs: 6900
Reads: 3434
GEMs: 3434


## Merge

In [56]:
# Combine the per-GEM MHC and CD8 annotations. The outer join keeps GEMs that have only
# one of the two; shared column names get the suffixes _mhc / _cd8 (epitope_lst exists
# only on the MHC side and keeps its name).
barcode_df = pd.merge(unique_mhc_df[['gem', 'template_id', 'template_lst', 'barcode', 'sample', 'umi_count', 'umi_count_lst', 'read_count', 'read_count_lst', 'single_barcode', 'epitope_lst']],
                      unique_cd8_df[['gem', 'template_id', 'template_lst', 'barcode', 'sample', 'umi_count', 'umi_count_lst', 'read_count', 'read_count_lst', 'single_barcode']],
                      how='outer', on='gem', suffixes=('_mhc', '_cd8'))

In [57]:
# Rows and distinct GEMs in the merged barcode table (expected to be equal).
print("Reads: %i" % len(barcode_df))
print("GEMs: %i" % barcode_df['gem'].nunique(dropna=False))

Reads: 7591
GEMs: 7591


## Specificity matrix

In [58]:
# GEM x epitope matrix holding the UMI count of each GEM's selected MHC barcode.
# unique_mhc_df has one row per GEM, so each matrix row has at most one filled cell.
specificity_matrix = unique_mhc_df.pivot(index='gem', columns='epitope', values='umi_count')

## Response df

In [59]:
# Deduplicate the detected responses and flag every remaining (barcode_cd8, peptide) pair,
# so that a left join leaves the flag missing for pairs without a detected response.
response_df.sort_values(by=['peptide','barcode_cd8'], inplace=True)
response_df.drop_duplicates(inplace=True)
response_df['detected_response'] = True

## Merge

In [60]:
# Build the final per-GEM table with three left joins:
#   1. specificity_df on the MHC barcode   -> peptide, HLA, epitope, peptide_HLA
#   2. response_df on (barcode_cd8, peptide) -> detected_response flag
#   3. specificity_matrix on gem           -> one UMI-count column per epitope
barcode_specificity_df = pd.merge(barcode_df[['gem',
                                              'template_id_mhc', 'barcode_mhc', 'sample_mhc', 'umi_count_mhc', 'single_barcode_mhc', 'umi_count_lst_mhc', 'read_count_lst_mhc', 'template_lst_mhc', 'epitope_lst',
                                              'template_id_cd8', 'barcode_cd8', 'sample_cd8', 'umi_count_cd8', 'single_barcode_cd8', 'umi_count_lst_cd8', 'read_count_lst_cd8', 'template_lst_cd8']],
                                  specificity_df,
                                  how='left',
                                  left_on='barcode_mhc',
                                  right_on='barcode').merge(response_df,
                                                            how='left',
                                                            on=['barcode_cd8', 'peptide']).merge(specificity_matrix, how='left', on='gem') # 'match_mhc','match_cd8', 


In [61]:
# True where the GEM's peptide occurs anywhere in the response table.
peptide_in_responses = barcode_specificity_df.peptide.isin(response_df.peptide)
barcode_specificity_df['peptide_assayed'] = peptide_in_responses.values
del peptide_in_responses

In [62]:
# Rows and distinct GEMs in the final table; the joins above should not have added rows.
print("Reads: %i" % len(barcode_specificity_df))
print("GEMs: %i" % barcode_specificity_df['gem'].nunique(dropna=False))

Reads: 7591
GEMs: 7591


In [69]:
# Keep only the part of the peptide name before the first underscore.
# NOTE(review): the same statement already appears earlier in this notebook - confirm
# whether this repeat is still needed.
specificity_df['peptide'] = specificity_df.peptide.str.split("_", expand=True)[0]

# Write data

In [None]:
# Column order of the exported table: MHC annotation, CD8 annotation, response flags,
# specificity annotation, then one UMI-count column per epitope.
# The CD8 list columns are 'umi_count_lst_cd8' and 'read_count_lst_cd8', matching the
# names produced by the _mhc/_cd8 suffixed merge that builds barcode_df.
new_column_order = ['gem',
 'template_id_mhc',
 'umi_count_mhc',
 'single_barcode_mhc',
 'umi_count_lst_mhc',
 'read_count_lst_mhc',
 'template_lst_mhc',
 'template_id_cd8',
 'umi_count_cd8',
 'single_barcode_cd8',
 'umi_count_lst_cd8',
 'read_count_lst_cd8',
 'template_lst_cd8',
 'detected_response',
 'peptide_assayed',
 'peptide',
 'HLA',
 'epitope',
 'epitope_lst'] + specificity_matrix.columns.to_list()

In [None]:
# Preview the table in its export column order; the actual write is left commented out.
barcode_specificity_df[new_column_order] #.to_csv(output, index=False, sep='\t')

## Annotate UMI counts (archived)

# (Archived) One row per (gem, template_id) holding the array of unique UMIs of that pair.
mhc_umi_df = mhc_df.groupby(['gem', 'template_id'])['umi'].unique().to_frame().reset_index().fillna('')
cd8_umi_df = cd8_df.groupby(['gem', 'template_id'])['umi'].unique().to_frame().reset_index().fillna('')

# Number of unique UMIs supporting each (gem, template_id) pair.
mhc_umi_df['umi_count'] = mhc_umi_df.umi.apply(lambda x: len(x))
cd8_umi_df['umi_count'] = cd8_umi_df.umi.apply(lambda x: len(x))

def avg_umi_len(x):
    """Return the mean length of the string entries in `x`.

    Parameters
    ----------
    x : iterable
        Candidate UMI sequences. Entries that are not strings (e.g. NaN or
        None) are ignored.

    Returns
    -------
    float or int
        The mean length of the string entries, or 0 when `x` contains no
        strings (including when `x` is empty).
    """
    # isinstance also accepts str subclasses such as numpy.str_, which an exact
    # type comparison would silently skip.
    lengths = [len(umi_seq) for umi_seq in x if isinstance(umi_seq, str)]

    if not lengths:
        return 0
    return sum(lengths) / len(lengths)


# (Archived) Mean UMI length per (gem, template_id) pair.
# The right-hand side of the first line now reads mhc_umi_df; it was misspelled
# `mch_umi_df`, a name that is never defined.
mhc_umi_df['avg_umi_len'] = mhc_umi_df.umi.apply(avg_umi_len)
cd8_umi_df['avg_umi_len'] = cd8_umi_df.umi.apply(avg_umi_len)

# (Archived) Ascending sort so the template with the most UMIs is last within each GEM.
mhc_umi_df.sort_values(by=['gem','umi_count'], inplace=True) #,'avg_umi_len'
cd8_umi_df.sort_values(by=['gem','umi_count'], inplace=True) #,'avg_umi_len'

# One row per GEM holding the array of per-template UMI counts.
mhc_umi_count_lst_df = mhc_umi_df.groupby(['gem']).umi_count.apply(np.array).to_frame().reset_index().rename(columns={'umi_count':'umi_count_lst'})
cd8_umi_count_lst_df = cd8_umi_df.groupby(['gem']).umi_count.apply(np.array).to_frame().reset_index().rename(columns={'umi_count':'umi_count_lst'})

# True when the GEM has exactly one template.
mhc_umi_count_lst_df['single_barcode'] = mhc_umi_count_lst_df.umi_count_lst.apply(lambda x: True if len(x)==1 else False)
cd8_umi_count_lst_df['single_barcode'] = cd8_umi_count_lst_df.umi_count_lst.apply(lambda x: True if len(x)==1 else False)

# Array of templates per GEM. The assignment lines rows up by position: both sides come
# from a groupby on 'gem' followed by reset_index().
mhc_umi_count_lst_df['template_lst'] = mhc_umi_df.groupby(['gem']).template_id.apply(np.array).to_frame().reset_index().template_id
cd8_umi_count_lst_df['template_lst'] = cd8_umi_df.groupby(['gem']).template_id.apply(np.array).to_frame().reset_index().template_id

Merge. Sort on gem, umi, avg_umi. Remove duplicates on gem.

## Annotate specificities

# (Archived) Split template_id at its last underscore into barcode and sample.
# expand=True returns the two parts as columns; unpacking the `.str` accessor itself
# relies on iterating it, which newer pandas versions no longer support.
barcode_and_sample = mhc_umi_df.template_id.str.rsplit("_", n=1, expand=True)
mhc_umi_df['barcode'] = barcode_and_sample[0]
mhc_umi_df['sample'] = barcode_and_sample[1]

# Attach the epitope of each barcode; how='left' keeps rows without an annotation.
mhc_umi_df = pd.merge(mhc_umi_df, specificity_df[['barcode','epitope']], how='left', on='barcode')

## Annotate specificities

# (Archived) Split template_id at its last underscore into barcode and sample.
# expand=True returns the two parts as columns; unpacking the `.str` accessor itself
# relies on iterating it, which newer pandas versions no longer support.
# NOTE(review): mhc_read_counts_df is not created anywhere in the visible notebook.
barcode_and_sample = mhc_read_counts_df.template_id.str.rsplit("_", n=1, expand=True)
mhc_read_counts_df['barcode'] = barcode_and_sample[0]
mhc_read_counts_df['sample'] = barcode_and_sample[1]

# Attach the epitope of each barcode; how='left' keeps rows without an annotation.
mhc_read_counts_df = pd.merge(mhc_read_counts_df, specificity_df[['barcode','epitope']], how='left', on='barcode')

## Collapse multiple annotations per GEM into one

# (Archived) Ascending sort so the template with the most reads is last within each GEM.
mhc_read_counts_df.sort_values(by=['gem', 'read_counts'], inplace=True)
cd8_read_counts_df.sort_values(by=['gem', 'read_counts'], inplace=True)
# One row per GEM holding the array of per-template read counts.
mhc_read_count_diffs_df = mhc_read_counts_df.groupby(['gem']).read_counts.apply(np.array).to_frame().reset_index()
cd8_read_count_diffs_df = cd8_read_counts_df.groupby(['gem']).read_counts.apply(np.array).to_frame().reset_index()
mhc_read_count_diffs_df.rename(columns={'read_counts': 'read_counts_lst'}, inplace=True)
cd8_read_count_diffs_df.rename(columns={'read_counts': 'read_counts_lst'}, inplace=True)
# Relative gap between the last two counts, (last - second last) / last, rounded to
# 3 decimals; 1.0 when the GEM has a single template.
mhc_read_count_diffs_df['read_count_diff'] = mhc_read_count_diffs_df.read_counts_lst.apply(lambda x: round((x[-1]-x[-2])/x[-1], 3) if len(x)>1 else 1.000)
cd8_read_count_diffs_df['read_count_diff'] = cd8_read_count_diffs_df.read_counts_lst.apply(lambda x: round((x[-1]-x[-2])/x[-1], 3) if len(x)>1 else 1.000)
# True when the GEM has exactly one template.
mhc_read_count_diffs_df['single_barcode'] = mhc_read_count_diffs_df.read_counts_lst.apply(lambda x: True if len(x)==1 else False)
cd8_read_count_diffs_df['single_barcode'] = cd8_read_count_diffs_df.read_counts_lst.apply(lambda x: True if len(x)==1 else False)
# Arrays of templates (and, for MHC, epitopes) per GEM, assigned by row position.
mhc_read_count_diffs_df['template_lst'] = mhc_read_counts_df.groupby(['gem']).template_id.apply(np.array).to_frame().reset_index().template_id
cd8_read_count_diffs_df['template_lst'] = cd8_read_counts_df.groupby(['gem']).template_id.apply(np.array).to_frame().reset_index().template_id

mhc_read_count_diffs_df['epitope_lst'] = mhc_read_counts_df.groupby(['gem']).epitope.apply(np.array).to_frame().reset_index().epitope

# (Archived) NOTE(review): cd8_unique_df and mhc_unique_df are not created anywhere in the
# visible notebook, and the result of this first merge is not assigned to anything.
pd.merge(cd8_unique_df[['gem', 'template_id']], cd8_df, on=['gem','template_id'], how='left')

# For the chosen (gem, template_id) pairs: pull back the reads, then the read count of the
# pair, then the per-GEM summary columns.
cd8_mode_df = pd.merge(cd8_unique_df[['gem', 'template_id']], cd8_df, on=['gem','template_id'], how='left').merge(cd8_read_counts_df[['gem', 'template_id', 'read_counts']],
                                                                                                                  how='left', on=['gem','template_id']).merge(cd8_read_count_diffs_df, how='left', on='gem')
mhc_mode_df = pd.merge(mhc_unique_df[['gem', 'template_id']], mhc_df, on=['gem','template_id'], how='left').merge(mhc_read_counts_df[['gem', 'template_id', 'read_counts', 'epitope']],
                                                                                                                  how='left', on=['gem','template_id']).merge(mhc_read_count_diffs_df, how='left', on='gem')

# Keep one row per GEM: the last one after an ascending sort on score and alignment_length.
cd8_mode_df.sort_values(by=['gem', 'score', 'alignment_length'], inplace=True) #'credible_alignment', 'match', 
mhc_mode_df.sort_values(by=['gem', 'score', 'alignment_length'], inplace=True) #'credible_alignment', 'match', 
cd8_mode_df.drop_duplicates(subset=['gem'], keep='last', inplace=True)
mhc_mode_df.drop_duplicates(subset=['gem'], keep='last', inplace=True)

# Rows and distinct GEMs of the collapsed MHC table.
print("MHC entries: %i" % len(mhc_mode_df))
print("Unique GEMs: %i" % mhc_mode_df['gem'].nunique(dropna=False))
#np.savetxt(GEM_LISTS + "mhc_mode_df.lst", mhc_mode_df.gem.unique(), fmt='%s')

# Rows and distinct GEMs of the collapsed CD8 table.
print("CD8 entries: %i" % len(cd8_mode_df))
print("Unique GEMs: %i" % cd8_mode_df['gem'].nunique(dropna=False))
#np.savetxt(GEM_LISTS + "cd8_mode_df.lst", cd8_mode_df.gem.unique(), fmt='%s')

#### OBS!
At this point we have a table with one line per GEM. The barcode annotation is chosen by majority vote over the read counts. However, if two barcodes are annotated with an equal number of reads, they are distinguished based on credible_alignment, match, bit_score, and alignment_length...
OBS! We are not using UMIs - so we don't know whether the read count is inflated. And we don't take the alignment score or the alignment length into account! What if the majority of reads map poorly?

## Produce barcode table

In [None]:
# Read-count based variant of the barcode table: outer join of the per-GEM MHC and CD8
# annotations, with shared column names suffixed _mhc / _cd8.
# NOTE(review): this reassigns barcode_df, replacing the UMI-based table built earlier.
barcode_df = pd.merge(mhc_mode_df[['gem', 'template_id', 'template_lst', 'barcode', 'sample', 'read_counts', 'read_counts_lst', 'read_count_diff', 'single_barcode', 'credible_alignment', 'epitope_lst']],
                      cd8_mode_df[['gem', 'template_id', 'template_lst', 'barcode', 'sample', 'read_counts', 'read_counts_lst', 'read_count_diff', 'single_barcode', 'credible_alignment']],
                      how='outer', on='gem', suffixes=('_mhc', '_cd8')) #'match',

In [None]:
# Every GEM must occupy exactly one row of the merged table.
assert len(barcode_df) == barcode_df['gem'].nunique(dropna=False), "Barcode dataframe was not reduced satisfyingly"

In [None]:
# Rows and distinct GEMs in the read-count based barcode table.
print("Entries: %i" % len(barcode_df))
print("Unique GEMs: %i" % barcode_df['gem'].nunique(dropna=False))
#np.savetxt(GEM_LISTS + "barcode_df.lst", barcode_df.gem.unique(), fmt='%s')

## Specificity matrix

In [None]:
# GEM x epitope matrix holding the read count of each GEM's selected MHC barcode.
# NOTE(review): this reassigns specificity_matrix, replacing the UMI-count matrix built earlier.
specificity_matrix = mhc_mode_df.pivot(index='gem', columns='epitope', values='read_counts')

## Response df

In [None]:
# Deduplicate the detected responses and flag every remaining (barcode_cd8, peptide) pair.
# The same three statements also appear earlier in this notebook.
response_df.sort_values(by=['peptide','barcode_cd8'], inplace=True)
response_df.drop_duplicates(inplace=True)
response_df['detected_response'] = True

## Merge barcode and specificity

In [None]:
# Read-count based final table, built with three left joins:
#   1. specificity_df on the MHC barcode     -> peptide, HLA, epitope
#   2. response_df on (barcode_cd8, peptide) -> detected_response flag
#   3. specificity_matrix on gem             -> one read-count column per epitope
barcode_specificity_df = pd.merge(barcode_df[['gem',
                                              'credible_alignment_mhc',
                                              'credible_alignment_cd8',
                                              'template_id_mhc', 'barcode_mhc', 'sample_mhc', 'read_counts_mhc', 'read_count_diff_mhc', 'single_barcode_mhc', 'read_counts_lst_mhc', 'template_lst_mhc', 'epitope_lst',
                                              'template_id_cd8', 'barcode_cd8', 'sample_cd8', 'read_counts_cd8', 'read_count_diff_cd8', 'single_barcode_cd8', 'read_counts_lst_cd8', 'template_lst_cd8']],
                                  specificity_df,
                                  how='left',
                                  left_on='barcode_mhc',
                                  right_on='barcode').merge(response_df,
                                                            how='left',
                                                            on=['barcode_cd8', 'peptide']).merge(specificity_matrix, how='left', on='gem') # 'match_mhc','match_cd8', 

In [None]:
# True where the GEM's peptide occurs anywhere in the response table.
barcode_specificity_df['peptide_assayed'] = barcode_specificity_df['peptide'].isin(response_df['peptide']).values

In [None]:
# Rows and distinct GEMs in the read-count based final table.
print("Entries: %i" % len(barcode_specificity_df))
print("Unique GEMs: %i" % barcode_specificity_df['gem'].nunique(dropna=False))

# Write data

In [None]:
# Export column order of the read-count based table: alignment credibility, MHC annotation,
# CD8 annotation, response flags, specificity annotation, then one column per epitope.
new_column_order = ['gem',
 'credible_alignment_mhc',
 'credible_alignment_cd8',
 'template_id_mhc',
 'read_counts_mhc',
 'read_count_diff_mhc',
 'single_barcode_mhc',
 'read_counts_lst_mhc',
 'template_lst_mhc',
 'template_id_cd8',
 'read_counts_cd8',
 'read_count_diff_cd8',
 'single_barcode_cd8',
 'read_counts_lst_cd8',
 'template_lst_cd8',
 'detected_response',
 'peptide_assayed',
 'peptide',
 'HLA',
 'epitope',
 'epitope_lst'] + specificity_matrix.columns.to_list() #'match_mhc','match_cd8',

In [None]:
# Write the table to `output` as tab-separated text without the index.
# NOTE(review): at this point barcode_specificity_df is the read-count based table built
# in the cells directly above, not the UMI-based one from earlier - confirm which is intended.
barcode_specificity_df[new_column_order].to_csv(output, index=False, sep='\t')