In [6]:
import numpy as np
import pandas as pd
from Bio import SeqIO

# Input

In [2]:
# Excel workbook whose 'MHC' and 'CDX' sheets are loaded below.
SPECIFICITY = (
    '/Volumes/tuba/herpov/tcr-pmhc-sc-project/data/exp9_MHC_IONTORRENT'
    '/barcode_library/barcode_specificity_annotations.xlsx'
)
# FASTA file with one template record per barcode.
LIBRARY = (
    '/Volumes/tuba/herpov/tcr-pmhc-sc-project/data/exp9_MHC_IONTORRENT'
    '/barcode_library/barcode_templates.fa'
)

# Output

In [44]:
# Destination CSV for the combined MHC + CDX table built in this notebook.
OUTPUT = (
    '/Volumes/tuba/herpov/tcr-pmhc-sc-project/data/exp9.2_TCR/processed'
    '/cellranger_out/feature_barcode_ref.exp9.csv'
)

# Load

In [7]:
# Parse the FASTA template library, then index the records in a dict so that
# get_seq can look a template up by its barcode id.
template_records = SeqIO.parse(LIBRARY, "fasta")
fa = SeqIO.to_dict(template_records)

In [22]:
# Load the two annotation sheets from the same workbook, in order: MHC, CDX.
mhc_df, cdx_df = (
    pd.read_excel(SPECIFICITY, sheet_name=sheet)
    for sheet in ('MHC', 'CDX')
)

In [25]:
# Harmonise the column names of both tables to 'id' / 'name'.
# NOTE(review): 'Sequence' and 'sample_id' both map to 'name'; a sheet that
# contained both columns would end up with two 'name' columns.
column_renames = {'Barcode':'id','Sequence':'name','sample_id':'name'}
for frame in (mhc_df, cdx_df):
    frame.rename(columns=column_renames, inplace=True)

In [26]:
def get_seq(barcode, templates=None, delimiter='NNNNNN'):
    """Return the oligo B + anneal + oligo A part of a barcode template.

    The template sequence is split on ``delimiter`` and the segment that
    follows the first occurrence of the delimiter (up to the next occurrence,
    or the end of the template) is returned.

    Parameters
    ----------
    barcode : hashable
        Key of the template record to look up.
    templates : mapping, optional
        Mapping from barcode to a record exposing a ``.seq`` attribute.
        Defaults to the notebook-level ``fa`` dictionary.
    delimiter : str, optional
        Run of characters that flanks the wanted segment (default 'NNNNNN').

    Returns
    -------
    str
        The template segment located after the first delimiter.

    Raises
    ------
    KeyError
        If ``barcode`` is not present in the template mapping.
    IndexError
        If the template does not contain ``delimiter`` at all.
    """
    if templates is None:
        # Fall back to the library loaded earlier in the notebook.
        templates = fa
    try:
        record = templates[barcode]
    except KeyError:
        raise KeyError(
            'Barcode {!r} not found in the template library'.format(barcode)
        ) from None
    segments = str(record.seq).split(delimiter)
    if len(segments) < 2:
        # Same exception type as the bare [1] lookup, but with a usable message.
        raise IndexError(
            'Template for barcode {!r} does not contain the delimiter {!r}'.format(
                barcode, delimiter
            )
        )
    return segments[1]

In [27]:
# Look up the template segment for every barcode id. Mapping over the 'id'
# column avoids building a row object per record just to read one field, and
# yields an (empty) Series rather than a DataFrame when a table has no rows.
mhc_df['sequence'] = mhc_df['id'].map(get_seq)
cdx_df['sequence'] = cdx_df['id'].map(get_seq)

In [40]:
# Columns that hold the same value on every row of both tables. The dict order
# is the order in which the columns are appended: read, pattern, feature_type.
constant_columns = {
    'read': 'R1',
    # '5P', then 64 'N' characters, then the '(BC)' placeholder.
    'pattern': '5P' + 'N'*64 + '(BC)',
    'feature_type': 'Antibody Capture',
}
for frame in (mhc_df, cdx_df):
    for column, value in constant_columns.items():
        frame[column] = value

In [41]:
# Inspect the CDX table after the sequence/read/pattern/feature_type columns were added.
cdx_df

Unnamed: 0,id,name,HLA_A,HLA_B,HLA_C,comment,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,sequence,read,pattern,feature_type
0,A4000B303,BC-317_AB,"A0101, A3201","B0702, B1401","C0702, C0802",PBMC,,,,,,,,,TAGGTATTCGGGCATTTCACTGTCACTACGCCTTTTGGGGAAGGTC...,R1,5PNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...,Antibody Capture
1,A4000B304,BC-328_AB,"A0301, A2402","B0702, B0801","C0701, C0702",PBMC,,,,,,,,,TTTACTGCGTATGGAGCGCTTCGACCTACGCCTTTTGGGGAAGGTC...,R1,5PNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...,Antibody Capture
2,A4000B305,BC-329_AB,"A0201, A2501","B0702, B3501","C0401, C0702",PBMC,,,,,,,,,TCTAGCCCCTCAGGCTTAGGGATATCTACGCCTTTTGGGGAAGGTC...,R1,5PNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...,Antibody Capture
3,A4000B306,BC-337_AB,"A0201, A0301","B1402, B4001","C0304, C0802",PBMC,,,,,,,,,GCATAAGATAGAGGCGCGGTACGCACTACGCCTTTTGGGGAAGGTC...,R1,5PNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...,Antibody Capture
4,A1072B309,BC-317,"A0101, A3201","B0702, B1401","C0702, C0802",PBMC,,,,,,,,,CGATACAAAGCATTCCTGCCGGCAGCTACGCCTTTTGGGGAAGGTC...,R1,5PNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...,Antibody Capture
5,A1072B310,BC-328,"A0301, A2402","B0702, B0801","C0701, C0702",PBMC,,,,,,,,,TGAGGTCATACTGCATGCGAATACCCTACGCCTTTTGGGGAAGGTC...,R1,5PNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...,Antibody Capture
6,A1072B313,BC-329,"A0201, A2501","B0702, B3501","C0401, C0702",PBMC,,,,,,,,,TTCCTAGCCGATTTAGGGATCGCCTCTACGCCTTTTGGGGAAGGTC...,R1,5PNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...,Antibody Capture
7,A1072B314,BC-337,"A0201, A0301","B1402, B4001","C0304, C0802",PBMC,,,,,,,,,GGGTGGGTCCATACAGGAAAGTCTCCTACGCCTTTTGGGGAAGGTC...,R1,5PNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...,Antibody Capture


In [42]:
# Columns kept for every barcode, in output order.
output_columns = ['id','name','read','pattern','sequence','feature_type']
# Stack the MHC rows on top of the CDX rows; each table keeps its own row
# labels (no ignore_index), so labels repeat across the two parts.
df = pd.concat([frame[output_columns] for frame in (mhc_df, cdx_df)])

In [43]:
# Inspect the combined MHC + CDX table that is written to OUTPUT below.
df

Unnamed: 0,id,name,read,pattern,sequence,feature_type
0,A1072B303,VTEHDTLLY,R1,5PNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...,TAGGTATTCGGGCATTTCACTGTCACTACGCCTTTTGGGGAAGGTC...,Antibody Capture
1,A1073B303,ATTADVDAGSL,R1,5PNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...,TAGGTATTCGGGCATTTCACTGTCACTACGCCTTTTGGGGAAGGTC...,Antibody Capture
2,A1074B303,SVDIHFDPG,R1,5PNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...,TAGGTATTCGGGCATTTCACTGTCACTACGCCTTTTGGGGAAGGTC...,Antibody Capture
3,A1075B303,QAYLTNQY,R1,5PNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...,TAGGTATTCGGGCATTTCACTGTCACTACGCCTTTTGGGGAAGGTC...,Antibody Capture
4,A1076B303,GMAEGATTA,R1,5PNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...,TAGGTATTCGGGCATTTCACTGTCACTACGCCTTTTGGGGAAGGTC...,Antibody Capture
...,...,...,...,...,...,...
3,A4000B306,BC-337_AB,R1,5PNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...,GCATAAGATAGAGGCGCGGTACGCACTACGCCTTTTGGGGAAGGTC...,Antibody Capture
4,A1072B309,BC-317,R1,5PNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...,CGATACAAAGCATTCCTGCCGGCAGCTACGCCTTTTGGGGAAGGTC...,Antibody Capture
5,A1072B310,BC-328,R1,5PNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...,TGAGGTCATACTGCATGCGAATACCCTACGCCTTTTGGGGAAGGTC...,Antibody Capture
6,A1072B313,BC-329,R1,5PNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...,TTCCTAGCCGATTTAGGGATCGCCTCTACGCCTTTTGGGGAAGGTC...,Antibody Capture


In [45]:
# Write the combined table to OUTPUT as CSV, leaving out the row labels.
df.to_csv(path_or_buf=OUTPUT, index=False)