In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
from ast import literal_eval
import re

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

In [2]:
# Select matplotlib's 'ggplot' style sheet for any plots drawn afterwards.
plt.style.use('ggplot')

# Args

In [3]:
# Experiment identifier; it is spliced into the input file paths defined below.
EXP = 'exp3'

In [4]:
def epitope_converter(x):
    """Turn the textual form of a list of epitope names into a Python list.

    Square brackets and single quotes are stripped from ``x`` and the
    remainder is split on single spaces, e.g. ``"['v9' 'v13']"`` becomes
    ``['v9', 'v13']``. Used as a ``pd.read_csv`` converter.
    """
    cleaned = x
    for unwanted in ("[", "]", "'"):
        cleaned = cleaned.replace(unwanted, "")
    return cleaned.split(" ")

def peptide_hla_converter(x):
    """Extract every "<peptide> <HLA>" pair from the textual form of a list.

    Square brackets and single quotes are removed from ``x``; each match of
    "word characters, one whitespace, one word character, digits" is then
    returned, e.g. ``"['NLVPMVATV A0201']"`` gives ``['NLVPMVATV A0201']``.
    Used as a ``pd.read_csv`` converter.

    Returns:
        list[str]: the matched pairs in order of appearance (empty if none).
    """
    # Raw string: "\w", "\s" and "\d" are not valid Python string escapes, so a
    # plain literal triggers an invalid-escape-sequence warning.
    pair_pattern = r"\w+\s{1}\w{1}\d+"
    cleaned = x.replace("[", "").replace("]", "").replace("'", "")
    return re.findall(pair_pattern, cleaned)

In [5]:
# Per-column parsers handed to pd.read_csv so the list-valued columns come
# back as Python objects instead of their string representation.
converters = {
    'peptide_HLA_lst': peptide_hla_converter,
    'epitope_lst': epitope_converter,
    'umi_count_lst_mhc': literal_eval,
}

# Input

In [6]:
# Cleaned TCR/barcode table of the selected experiment (read into credible_df below).
# NOTE(review): absolute, machine-specific path — consider a configurable data root.
credible_df_file = "/Volumes/tuba/herpov/tcr-pmhc-sc-project/data/" + EXP + "_CAT_IONTORRENT_KMA_AKB/tables/tcr_barcode.cleaned.csv"

In [7]:
# Per-contig annotation CSV from the cellranger VDJ output of the selected experiment.
# NOTE(review): absolute, machine-specific path — consider a configurable data root.
raw_tcr_df_file = "/Volumes/tuba/herpov/tcr-pmhc-sc-project/data/" + EXP + "_TCR/processed/cellranger_out/TCR_VDJ/outs/all_contig_annotations.csv"

In [21]:
# FASTA with all contig sequences from the same cellranger VDJ output directory.
# NOTE(review): absolute, machine-specific path — consider a configurable data root.
all_contigs_file = "/Volumes/tuba/herpov/tcr-pmhc-sc-project/data/" + EXP + "_TCR/processed/cellranger_out/TCR_VDJ/outs/all_contig.fasta"

# Load

In [8]:
# Read the cleaned table, running the column converters on the *_lst columns.
credible_df = pd.read_csv(credible_df_file, converters=converters)

In [9]:
# Read the raw per-contig annotation table; it is renamed and filtered in the cells below.
df = pd.read_csv(raw_tcr_df_file)

# Main

## Rename

In [10]:
# Map the column names of the contig annotation table onto the names used in
# the rest of this notebook, in a single pass.
df = df.rename(
    columns={
        "barcode": "gem",
        "raw_clonotype_id": "clonotype",
        "umis": "umi_count",
        "reads": "read_count",
    }
)

## Filter data

In [11]:
# Switches for the contig filters applied in the next cell; True enables a filter.
keep_only_is_cells = False  # keep rows where is_cell is True
keep_only_high_confidence = False  # keep rows where high_confidence is True
keep_only_full_length = True  # keep rows where full_length is True
keep_only_productive = True  # keep rows whose productive value is 'True'
# NOTE(review): "unamibiguous" is a typo of "unambiguous"; the name is also read by the filter cell.
keep_only_unamibiguous_gems = False  # keep (gem, chain) groups that contain exactly one row

In [12]:
# Apply every enabled filter in turn; each step only removes rows from df.
if keep_only_is_cells:
    df = df[df.is_cell == True]
if keep_only_high_confidence:
    df = df[df.high_confidence == True]
if keep_only_full_length:
    df = df[df.full_length == True]
if keep_only_productive:
    # This column was matched against the string 'True' (presumably because it
    # can hold non-boolean values and is read as text — TODO confirm). Compare
    # on the normalised string form so the filter also works when pandas
    # parses the column as bool, instead of silently dropping every row.
    df = df[df.productive.astype(str).str.lower() == 'true']
if keep_only_unamibiguous_gems:
    # Keep only (gem, chain) groups that consist of a single contig row.
    df = df.groupby(['gem', 'chain']).filter(lambda x: len(x) == 1)

## Augment by chain

In [13]:
def annotate_lst(df, var):
    """Give every row the array of all ``var`` values found in its GEM.

    Rows are grouped on the ``gem`` column, the ``var`` values of each group
    are packed into a NumPy array (in row order), and that array is mapped
    back onto each row of the group. Returns a Series on ``df``'s index.
    """
    values_by_gem = df.groupby(['gem'])[var].apply(np.array)
    return df['gem'].map(values_by_gem.to_dict())

def annotate_single(df):
    """Flag rows whose ``umi_count_lst`` entry holds exactly one element.

    Returns a boolean Series on ``df``'s index: True where the list/array in
    ``umi_count_lst`` has length 1, False otherwise.
    """
    return df['umi_count_lst'].apply(lambda counts: len(counts) == 1)

In [14]:
# Split the contigs by chain; copies keep the column assignments below from
# touching df.
tra_df = df.loc[df.chain == "TRA"].copy()
trb_df = df.loc[df.chain == "TRB"].copy()

for chain_df in (tra_df, trb_df):
    # Sort ascending by UMI count within each GEM, so the last row of a GEM is
    # the one with the highest count.
    chain_df.sort_values(by=['gem', 'umi_count'], inplace=True)

    # Per-GEM arrays of all UMI counts / CDR3 sequences seen for this chain.
    chain_df['umi_count_lst'] = annotate_lst(chain_df, 'umi_count')
    chain_df['cdr3_lst'] = annotate_lst(chain_df, 'cdr3')

    # True when the GEM carries exactly one contig of this chain.
    chain_df['single'] = annotate_single(chain_df)

### Keep chain with highest UMI count and merge

In [15]:
# Within each (gem, clonotype, single) combination keep only the last row,
# then pair the TRA and TRB rows; the outer join retains GEMs that have only
# one of the two chains.
tra_df = tra_df.drop_duplicates(subset=['gem', 'clonotype', 'single'], keep='last')
trb_df = trb_df.drop_duplicates(subset=['gem', 'clonotype', 'single'], keep='last')
tcr_df = tra_df.merge(
    trb_df,
    how='outer',
    on=['gem', 'clonotype'],
    suffixes=('_TRA', '_TRB'),
)

In [16]:
# Show the merged table of TRA and TRB annotations.
tcr_df

Unnamed: 0,gem,is_cell_TRA,contig_id_TRA,high_confidence_TRA,length_TRA,chain_TRA,v_gene_TRA,d_gene_TRA,j_gene_TRA,c_gene_TRA,...,full_length_TRB,productive_TRB,cdr3_TRB,cdr3_nt_TRB,read_count_TRB,umi_count_TRB,raw_consensus_id_TRB,umi_count_lst_TRB,cdr3_lst_TRB,single_TRB
0,AAACCTGAGAATTGTG-1,True,AAACCTGAGAATTGTG-1_contig_1,True,548.0,TRA,TRAV29DV5,,TRAJ45,TRAC,...,True,True,CASSAWTSNRDEQFF,TGTGCCAGCAGCGCCTGGACTAGTAATCGGGATGAGCAGTTCTTC,1414.0,19.0,clonotype2_consensus_1,[19],[CASSAWTSNRDEQFF],True
1,AAACCTGAGACTAGGC-1,True,AAACCTGAGACTAGGC-1_contig_3,True,525.0,TRA,TRAV27,,TRAJ28,TRAC,...,True,True,CASSPLSLNTEAFF,TGTGCCAGCAGCCCGCTTAGCTTGAACACTGAAGCTTTCTTT,697.0,6.0,clonotype3_consensus_2,[6],[CASSPLSLNTEAFF],True
2,AAACCTGAGAGTCTGG-1,True,AAACCTGAGAGTCTGG-1_contig_2,True,692.0,TRA,TRAV8-6,,TRAJ41,TRAC,...,True,True,CASSFDRDEQYF,TGTGCCAGCAGTTTCGACAGGGACGAGCAGTACTTC,6127.0,84.0,clonotype29_consensus_2,[84],[CASSFDRDEQYF],True
3,AAACCTGAGGCCGAAT-1,True,AAACCTGAGGCCGAAT-1_contig_4,True,584.0,TRA,TRAV12-2,,TRAJ23,TRAC,...,True,True,CASSFQGAETQYF,TGTGCCAGCAGCTTCCAGGGGGCGGAGACCCAGTACTTC,2047.0,22.0,clonotype519_consensus_1,"[1, 22]","[CASSAWTSNRDEQFF, CASSFQGAETQYF]",False
4,AAACCTGAGGTCATCT-1,False,AAACCTGAGGTCATCT-1_contig_1,True,518.0,TRA,TRAV20,,TRAJ28,TRAC,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13413,TTTGTCAGTACAGCAG-1,,,,,,,,,,...,True,True,CSARDLLQRGGPYNEQFF,TGCAGTGCTAGAGATCTTTTACAGCGGGGGGGCCCCTACAATGAGC...,58.0,1.0,,[1],[CSARDLLQRGGPYNEQFF],True
13414,TTTGTCAGTTTGTTTC-1,,,,,,,,,,...,True,True,CASSFLTGGNRDYGYTF,TGTGCCAGCAGCTTTCTCACAGGGGGCAACCGGGACTATGGCTACA...,1086.0,12.0,clonotype265_consensus_1,[12],[CASSFLTGGNRDYGYTF],True
13415,TTTGTCATCAAGCCTA-1,,,,,,,,,,...,True,True,CASSVTRDRTNTGELFF,TGTGCCAGCAGCGTAACCCGGGACAGGACCAACACCGGGGAGCTGT...,145.0,2.0,,[2],[CASSVTRDRTNTGELFF],True
13416,TTTGTCATCGGAGGTA-1,,,,,,,,,,...,True,True,CASSSKDRHMNTEAFF,TGTGCCAGCAGTTCAAAGGACAGACACATGAACACTGAAGCTTTCTTT,106.0,1.0,,[1],[CASSSKDRHMNTEAFF],True


## Select GEMs for netTCR

In [17]:
# Keep rows with umi_count_mhc of at least 2 and delta_umi_mhc above 0.9.
umi_ok = credible_df.umi_count_mhc >= 2
delta_ok = credible_df.delta_umi_mhc > 0.9
flt_df = credible_df[umi_ok & delta_ok]

In [18]:
# Distinct GEM barcodes that passed the selection above.
flt_gems = flt_df['gem'].unique()

In [54]:
# Restrict the TCR table to the selected GEMs; copy so that columns can be
# added later without touching tcr_df.
in_selected_gems = tcr_df['gem'].isin(flt_gems)
flt_tcr_df = tcr_df.loc[in_selected_gems].copy()

In [66]:
# Columns to preview for the selected GEMs.
preview_cols = [
    'gem',
    'clonotype',
    'v_gene_TRA',
    'd_gene_TRA',
    'j_gene_TRA',
    'c_gene_TRA',
    'cdr3_TRA',
    'cdr3_nt_TRA',
    'full_seq_TRA',
    'v_gene_TRB',
    'd_gene_TRB',
    'j_gene_TRB',
    'c_gene_TRB',
    'cdr3_TRB',
    'cdr3_nt_TRB',
    'full_seq_TRB',
]
# full_seq_TRA / full_seq_TRB are only created by the FASTA look-up cells
# further down, so on a top-to-bottom run they do not exist yet. Show the
# columns that are present instead of raising a KeyError.
flt_tcr_df[[col for col in preview_cols if col in flt_tcr_df.columns]]

Unnamed: 0,gem,clonotype,v_gene_TRA,d_gene_TRA,j_gene_TRA,c_gene_TRA,cdr3_TRA,cdr3_nt_TRA,full_seq_TRA,v_gene_TRB,d_gene_TRB,j_gene_TRB,c_gene_TRB,cdr3_TRB,cdr3_nt_TRB,full_seq_TRB
5,AAACCTGAGTTCGATC-1,clonotype9,TRAV14DV4,,TRAJ9,TRAC,CALNTGGFKTIF,TGTGCGCTGAATACTGGAGGCTTCAAAACTATCTTT,GGCCCAGTAATTCTTCTCTCACCATGCCAGGTTCACCTCACAGTAC...,TRBV12-4,TRBD2,TRBJ2-7,TRBC2,CASSPPFLAGSGSSYEQYF,TGTGCCAGCAGCCCCCCCTTTTTAGCTGGTAGCGGGAGCTCCTACG...,GGCTCATGTTCACAGAGGGCCTGGTCTGGAATATTCCACATCTGCT...
53,AAAGCAAGTATTAGCC-1,clonotype2,TRAV29DV5,,TRAJ45,TRAC,CAAKSDSGGGADGLTF,TGTGCAGCAAAATCGGATTCAGGAGGAGGTGCTGACGGACTCACCTTT,GCAGCTTTCTAGGCAGGAGACAAGACAATCTGCATCTTCACAGGAG...,TRBV9,TRBD2,TRBJ2-1,TRBC2,CASSAWTSNRDEQFF,TGTGCCAGCAGCGCCTGGACTAGTAATCGGGATGAGCAGTTCTTC,GAGAATGCTTACTACAGAGACACCAGCCCCAAGCTAGGAGATCCTG...
80,AAATGCCTCGTCCAGG-1,clonotype548,TRAV1-2,,TRAJ20,TRAC,CAVGDYKLSF,TGTGCTGTAGGCGACTACAAGCTCAGCTTT,CAGTGGCGCGATCTCTGCTCACTGCAAACTCCGCCTCCCGGGTTCC...,TRBV6-3,TRBD1,TRBJ2-4,TRBC2,CASSARDRKNIQYF,TGTGCCAGCAGTGCTCGGGACAGAAAAAACATTCAGTACTTC,GGTCTCAGAATGACGCCCTTGAAAGACGTGTTCCCTTTTCACCAAT...
81,AAATGCCTCGTTTGCC-1,clonotype231,TRAV38-2DV8,,TRAJ33,TRAC,CAYRYMDSNYQLIW,TGTGCTTATAGGTACATGGATAGCAACTATCAGTTAATCTGG,AGGTTCAGATCAGAAGAGGAGGCTTCTCACCCTGCAGCAGGGACCT...,TRBV2,TRBD1,TRBJ2-3,TRBC2,CASGGQGLFDTQYF,TGTGCCAGCGGGGGACAGGGCCTTTTCGATACGCAGTATTTT,GAGACCTTGCCTGTGGGGCCATGGGAGCTCAAAATGCCCCTCCTTT...
94,AACACGTGTATATGAG-1,clonotype553,TRAV3,,TRAJ31,TRAC,CAVRDISARLMF,TGTGCTGTGAGAGACATAAGTGCCAGACTCATGTTT,GAGTCTTGCTCCTCACAGAGCTTTGAGGAGCTGGATCAAAATTGTG...,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13117,TTAACTCAGTGTACCT-1,clonotype11,,,,,,,,TRBV9,TRBD2,TRBJ2-1,TRBC2,CASSAWTSNRDEQFF,TGTGCCAGCAGCGCCTGGACTAGTAATCGGGATGAGCAGTTCTTC,GAGAATGCTTACTACAGAGACACCAGCCCCAAGCTAGGAGATCCTG...
13122,TTAACTCCATTAGCCA-1,clonotype14,,,,,,,,TRBV6-5,TRBD2,TRBJ2-1,TRBC2,CASRLLAGVINEQFF,TGTGCCAGCAGACTACTAGCGGGGGTTATCAATGAGCAGTTCTTC,GAATGTCTCAGAATGACTTCCTTGAGAGTCCTGCTCCCCTTTCATC...
13182,TTATGCTTCACAAACC-1,,,,,,,,,TRBV9,TRBD2,TRBJ2-1,TRBC2,CASSAWTSNRDEQFF,TGTGCCAGCAGCGCCTGGACTAGTAATCGGGATGAGCAGTTCTTC,AGATCCTGCCATGGGCTTCAGGCTCCTCTGCTGTGTGGCCTTTTGT...
13292,TTGACTTGTTAAGATG-1,,,,,,,,,TRBV4-3,TRBD2,TRBJ2-1,TRBC2,CASSPGLLFNNEQFF,TGCGCCAGCAGCCCGGGACTCCTTTTCAACAATGAGCAGTTCTTC,ATCTCAGACCCGAGGCTAGCATGGGCTGCAGGCTGCTCTGCTGTGC...


In [56]:
# Index the contig FASTA so that records can be looked up by contig id below.
fasta_dict = SeqIO.index(all_contigs_file, "fasta")

In [57]:
# Look up the nucleotide sequence of every TRA contig present in the table and
# attach it as full_seq_TRA; rows without a TRA contig get NaN.
dct = {
    contig_id: str(fasta_dict[contig_id].seq)
    for contig_id in flt_tcr_df.contig_id_TRA.dropna()
}
flt_tcr_df['full_seq_TRA'] = flt_tcr_df.contig_id_TRA.map(dct)

In [60]:
# Look up the nucleotide sequence of every TRB contig present in the table and
# attach it as full_seq_TRB; rows without a TRB contig get NaN.
dct = {
    contig_id: str(fasta_dict[contig_id].seq)
    for contig_id in flt_tcr_df.contig_id_TRB.dropna()
}
flt_tcr_df['full_seq_TRB'] = flt_tcr_df.contig_id_TRB.map(dct)

In [69]:
# Attach peptide, HLA and epitope from the selected rows to the TCR table;
# the left join keeps every TCR row.
flt_tcr_bc_df = flt_tcr_df.merge(
    flt_df[['gem', 'clonotype', 'peptide', 'HLA', 'epitope']],
    how='left',
    on=['gem', 'clonotype'],
)

In [70]:
# Columns written to the netTCR input file, in output order.
export_cols = [
    'gem', 'clonotype',
    'v_gene_TRA', 'd_gene_TRA', 'j_gene_TRA', 'c_gene_TRA',
    'cdr3_TRA', 'cdr3_nt_TRA', 'full_seq_TRA',
    'v_gene_TRB', 'd_gene_TRB', 'j_gene_TRB', 'c_gene_TRB',
    'cdr3_TRB', 'cdr3_nt_TRB', 'full_seq_TRB',
    'peptide', 'HLA', 'epitope',
]
# Write the selection to the working directory without the row index.
flt_tcr_bc_df.loc[:, export_cols].to_csv("netTCR_data.v01.csv.gz", index=False)

In [71]:
# Show the same column selection that was written to the CSV file.
shown_cols = [
    'gem', 'clonotype',
    'v_gene_TRA', 'd_gene_TRA', 'j_gene_TRA', 'c_gene_TRA',
    'cdr3_TRA', 'cdr3_nt_TRA', 'full_seq_TRA',
    'v_gene_TRB', 'd_gene_TRB', 'j_gene_TRB', 'c_gene_TRB',
    'cdr3_TRB', 'cdr3_nt_TRB', 'full_seq_TRB',
    'peptide', 'HLA', 'epitope',
]
flt_tcr_bc_df.loc[:, shown_cols]

Unnamed: 0,gem,clonotype,v_gene_TRA,d_gene_TRA,j_gene_TRA,c_gene_TRA,cdr3_TRA,cdr3_nt_TRA,full_seq_TRA,v_gene_TRB,d_gene_TRB,j_gene_TRB,c_gene_TRB,cdr3_TRB,cdr3_nt_TRB,full_seq_TRB,peptide,HLA,epitope
0,AAACCTGAGTTCGATC-1,clonotype9,TRAV14DV4,,TRAJ9,TRAC,CALNTGGFKTIF,TGTGCGCTGAATACTGGAGGCTTCAAAACTATCTTT,GGCCCAGTAATTCTTCTCTCACCATGCCAGGTTCACCTCACAGTAC...,TRBV12-4,TRBD2,TRBJ2-7,TRBC2,CASSPPFLAGSGSSYEQYF,TGTGCCAGCAGCCCCCCCTTTTTAGCTGGTAGCGGGAGCTCCTACG...,GGCTCATGTTCACAGAGGGCCTGGTCTGGAATATTCCACATCTGCT...,YSEHPTFTSQY,A0101,v9
1,AAAGCAAGTATTAGCC-1,clonotype2,TRAV29DV5,,TRAJ45,TRAC,CAAKSDSGGGADGLTF,TGTGCAGCAAAATCGGATTCAGGAGGAGGTGCTGACGGACTCACCTTT,GCAGCTTTCTAGGCAGGAGACAAGACAATCTGCATCTTCACAGGAG...,TRBV9,TRBD2,TRBJ2-1,TRBC2,CASSAWTSNRDEQFF,TGTGCCAGCAGCGCCTGGACTAGTAATCGGGATGAGCAGTTCTTC,GAGAATGCTTACTACAGAGACACCAGCCCCAAGCTAGGAGATCCTG...,TVYPPSSTAK,A0301,v23
2,AAATGCCTCGTCCAGG-1,clonotype548,TRAV1-2,,TRAJ20,TRAC,CAVGDYKLSF,TGTGCTGTAGGCGACTACAAGCTCAGCTTT,CAGTGGCGCGATCTCTGCTCACTGCAAACTCCGCCTCCCGGGTTCC...,TRBV6-3,TRBD1,TRBJ2-4,TRBC2,CASSARDRKNIQYF,TGTGCCAGCAGTGCTCGGGACAGAAAAAACATTCAGTACTTC,GGTCTCAGAATGACGCCCTTGAAAGACGTGTTCCCTTTTCACCAAT...,SLADTNSLAV,A0201,MELPEP-043
3,AAATGCCTCGTTTGCC-1,clonotype231,TRAV38-2DV8,,TRAJ33,TRAC,CAYRYMDSNYQLIW,TGTGCTTATAGGTACATGGATAGCAACTATCAGTTAATCTGG,AGGTTCAGATCAGAAGAGGAGGCTTCTCACCCTGCAGCAGGGACCT...,TRBV2,TRBD1,TRBJ2-3,TRBC2,CASGGQGLFDTQYF,TGTGCCAGCGGGGGACAGGGCCTTTTCGATACGCAGTATTTT,GAGACCTTGCCTGTGGGGCCATGGGAGCTCAAAATGCCCCTCCTTT...,GPISGHVLK,A1101,v27
4,AACACGTGTATATGAG-1,clonotype553,TRAV3,,TRAJ31,TRAC,CAVRDISARLMF,TGTGCTGTGAGAGACATAAGTGCCAGACTCATGTTT,GAGTCTTGCTCCTCACAGAGCTTTGAGGAGCTGGATCAAAATTGTG...,,,,,,,,NLVPMVATV,A0201,v13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
766,TTAACTCAGTGTACCT-1,clonotype11,,,,,,,,TRBV9,TRBD2,TRBJ2-1,TRBC2,CASSAWTSNRDEQFF,TGTGCCAGCAGCGCCTGGACTAGTAATCGGGATGAGCAGTTCTTC,GAGAATGCTTACTACAGAGACACCAGCCCCAAGCTAGGAGATCCTG...,SLAAYIPRL,A0201,CLYBL
767,TTAACTCCATTAGCCA-1,clonotype14,,,,,,,,TRBV6-5,TRBD2,TRBJ2-1,TRBC2,CASRLLAGVINEQFF,TGTGCCAGCAGACTACTAGCGGGGGTTATCAATGAGCAGTTCTTC,GAATGTCTCAGAATGACTTCCTTGAGAGTCCTGCTCCCCTTTCATC...,NLVPMVATV,A0201,v13
768,TTATGCTTCACAAACC-1,,,,,,,,,TRBV9,TRBD2,TRBJ2-1,TRBC2,CASSAWTSNRDEQFF,TGTGCCAGCAGCGCCTGGACTAGTAATCGGGATGAGCAGTTCTTC,AGATCCTGCCATGGGCTTCAGGCTCCTCTGCTGTGTGGCCTTTTGT...,SLAAYIPRL,A0201,CLYBL
769,TTGACTTGTTAAGATG-1,,,,,,,,,TRBV4-3,TRBD2,TRBJ2-1,TRBC2,CASSPGLLFNNEQFF,TGCGCCAGCAGCCCGGGACTCCTTTTCAACAATGAGCAGTTCTTC,ATCTCAGACCCGAGGCTAGCATGGGCTGCAGGCTGCTCTGCTGTGC...,YSEHPTFTSQY,A0101,v9
