In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Apply matplotlib's 'ggplot' style sheet to every figure drawn in this notebook.
plt.style.use('ggplot')

In [11]:
from ipynb.fs.full.D_plot_specificity_matrix_utils import epitope_sorter_index

# Args

In [3]:
# Experiment and platform labels; both are interpolated into the input and
# output directory names built further down.
EXP = "exp3"
PLATFORM = "IONTORRENT"

In [4]:
# Mapping and barcode-system labels; together with EXP and PLATFORM they form
# the name of the data directory that is read from and written to.
MAPPING = 'BLAST' # label used in the directory name
BARCODE_SYSTEM = 'AKB' # label used in the directory name

## Input

In [5]:
# Cleaned TCR/barcode table of the run selected by EXP, PLATFORM, MAPPING and
# BARCODE_SYSTEM.
IN_FILE = "/Volumes/tuba/herpov/tcr-pmhc-sc-project/data/{}_CAT_{}_{}_{}/tables/tcr_barcode.clean.csv".format(
    EXP, PLATFORM, MAPPING, BARCODE_SYSTEM)

## Output

In [6]:
# Directory (with trailing slash) that receives every output file of this
# notebook; it sits next to the input tables of the same run.
OUT_DIR = "/Volumes/tuba/herpov/tcr-pmhc-sc-project/data/{}_CAT_{}_{}_{}/similarity_assessment/".format(
    EXP, PLATFORM, MAPPING, BARCODE_SYSTEM)

In [6]:
# Output file paths inside OUT_DIR, one for the alpha chain and one for the beta chain.
all_on_all_alpha = "{}m1.all.a.txt".format(OUT_DIR)
all_on_all_beta = "{}m1.all.b.txt".format(OUT_DIR)

In [7]:
# Output file path inside OUT_DIR.
tcr_sim_per_peptide = "{}m2.txt".format(OUT_DIR)

In [8]:
# Output file path inside OUT_DIR.
peptide_multiple = "{}m3.multiple.txt".format(OUT_DIR)

In [9]:
# Output file path inside OUT_DIR.
peptide_singlet = "{}m3.singlet.txt".format(OUT_DIR)

# Import input data

In [16]:
# Load only the columns this notebook works with: the GEM barcode, its
# clonotype, both CDR3 sequences and the epitope/peptide annotation.
input_columns = ['gem', 'clonotype', 'cdr3_TRA', 'cdr3_TRB', 'epitope', 'peptide']
df = pd.read_csv(IN_FILE, usecols=input_columns)

# Main

Remove GEMs with unpaired TCR chains

In [18]:
# Treat an empty CDR3 string as a missing value in both chain columns ...
df.replace(to_replace={'cdr3_TRA': "", 'cdr3_TRB': ""}, value=np.nan, inplace=True)
# ... and keep only the rows where both the alpha and the beta CDR3 are present.
df.dropna(axis=0, how='any', subset=['cdr3_TRA', 'cdr3_TRB'], inplace=True)

Sort the rows by epitope rank (the order used in the specificity plots), then by clonotype

In [19]:
# Attach the rank returned by epitope_sorter_index, then order the rows by
# that rank and, within a rank, by clonotype. sort_values places rows with a
# missing (NaN) rank last. The index is renumbered 0..n-1 so that row
# positions follow the sorted order.
df['epitope_rank'] = epitope_sorter_index(df)
df = df.sort_values(by=['epitope_rank', 'clonotype']).reset_index(drop=True)

Write temporary files

In [56]:
# One independent table per chain ('A' / 'B'), seeded with the GEM barcode and
# that chain's CDR3 sequence; .copy() detaches each table from df.
matrix = {chain: df[['gem', 'cdr3_TR' + chain]].copy() for chain in ('A', 'B')}

In [None]:
# Write every CDR3 sequence of each chain to its temporary file, one sequence
# per line. The column is passed as a 1-D array on purpose: np.savetxt writes
# each element of a 1-D array on its own line, whereas wrapping the column in a
# list yields a (1, n) array that is written as one single space-separated
# line. This matches how the single-sequence tmp_*1 files are written.
np.savetxt('tmp_A2', df.cdr3_TRA.values, fmt='%s')
np.savetxt('tmp_B2', df.cdr3_TRB.values, fmt='%s')

In [38]:
# Build, per chain, a table with one row per GEM and one similarity column per
# processed GEM: each pass of the outer loop takes one GEM's CDR3 sequence,
# obtains its similarity to all CDR3 sequences, and stores the scores as a new
# column named after that GEM. The alpha and beta tables are then summed and
# written to OUT_DIR + 'MC.csv'.
#
# NOTE(review): `run_cmd` and `cmd` are not defined in the visible notebook.
# Presumably `cmd` is the similarity command built from F1 (the query sequence)
# and F2 (all sequences) -- confirm, and build it inside the chain loop so that
# it refers to the current chain's files.
for index, row in df.iterrows():

    tcr = row.clonotype

    # assert that each clonotype only contains a single combination of TRA and TRB
    assert df[df.clonotype==tcr].groupby(['cdr3_TRA', 'cdr3_TRB']).size().values.shape == (1,), "clonotype was annotated with multiple cdr3 chains"

    for chain in ['A', 'B']:
        F1 = 'tmp_%s1' %chain
        F2 = 'tmp_%s2' %chain
        cdr3_name = 'cdr3_TR%s' %chain

        # Single-element 1-D array -> a file with one line: the query CDR3.
        np.savetxt(F1, np.array([row[cdr3_name]]), fmt='%s')

        sim_df = run_cmd(cmd)

        # Index the dict with the loop variable `chain`; the literal key
        # 'chain' does not exist in `matrix` and raises KeyError.
        assert sim_df.shape[0] == matrix[chain].shape[0], "The sizes do not match"
        assert sim_df.seq1.unique()[0] == row[cdr3_name], "The tested sequence does not match the CDR3 sequence"

        # Add the scores as a new COLUMN named after the current GEM. The
        # values are assigned by position (the row count was asserted above),
        # so the result does not depend on sim_df's index labels.
        # NOTE(review): assumes sim_df rows come back in the order of the
        # sequences in F2 -- confirm against run_cmd.
        matrix[chain][row.gem] = sim_df['similarity'].values
        assert df.shape[0] == matrix[chain].shape[0], "The sizes do not match"

    # NOTE(review): this stops after the first GEM, so only one similarity
    # column is produced. It is kept because the clonotype assertion above
    # fails on clonotypes annotated with more than one CDR3 pair; remove it
    # once that is resolved to compute the full matrix.
    break

matrix['A'].set_index(['gem'], inplace=True)
matrix['B'].set_index(['gem'], inplace=True)

# Sum only the similarity columns. The sequence columns are dropped first:
# each exists in just one of the two tables, so DataFrame.add would otherwise
# carry them into the result as all-NaN columns.
matrix_C = matrix['A'].drop('cdr3_TRA', axis=1).add(matrix['B'].drop('cdr3_TRB', axis=1))

matrix_C.to_csv(OUT_DIR + 'MC.csv', index=True) # GEMs are indexes, thus DO write index to file

['CAVRSAYSGAGSYQLTF']


NB: Remove all temporary files (tmp_A1, tmp_A2, tmp_B1, tmp_B2) when the script is done!

In [64]:
# Toy example of DataFrame.add: two frames with identical index and columns
# are combined element-wise (strings concatenate, integers sum).
df1 = pd.DataFrame({'a': ['A', 'B', 'C'], 'b': [2, 4, 6]})
df2 = pd.DataFrame({'a': ['D', 'D', 'D'], 'b': [20, 40, 60]})
df_add = df1.add(df2, fill_value=0)

In [62]:
# Display the element-wise sum of the two toy frames.
df_add

Unnamed: 0,a,b
0,AD,22
1,BD,44
2,CD,66


In [14]:
# Count the rows for every (epitope, alpha-chain CDR3) combination.
df.groupby(['epitope', 'cdr3_TRA']).size()

epitope  cdr3_TRA          
CLYBL    CAAKSDSGGGADGLTF       5
         CAVRDISARLMF           1
v13      CAAKSDSGGGADGLTF      61
         CAAPRMEYGNKLVF         1
         CAASENNDMRF            1
         CAASIRNTGGFKTIF        1
         CAESPPTGANNLFF         1
         CAFEAGNQFYF            4
         CAFRARVDYGGSQGNLIF     1
         CAGARGETSGSRLTF        1
         CAGEEWDDYKLSF          1
         CAGGGSSNTGKLIF        36
         CAHLDSNYQLIW           1
         CALRSHNTNAGKSTF        1
         CAMSAYYGQNFVF          1
         CARGFLNYGGSQGNLIF      1
         CARNTGNQFYF           17
         CASLGAGTALIF           1
         CAVRDGQKLLF            1
         CAVRDISARLMF          16
         CAVRPIQGAQKLVF         1
         CAVSDPLVF              1
         CAVSLGGGYNKLIF         1
         CGAVAPWGGATNKLIF       1
         CILDNNNDMRF            1
         CVVNDPDKLIF            1
         CVVNRRQAGTALIF        24
dtype: int64

In [20]:
# Show the clonotype label of every row, in the current row order.
df.clonotype.values

array(['clonotype1', 'clonotype1', 'clonotype1', ..., 'clonotype97',
       'clonotype977', 'clonotype985'], dtype=object)

In [67]:
# Inspect all rows of clonotype30. In the output recorded below, this clonotype
# carries two different cdr3_TRA sequences, i.e. more than one
# (cdr3_TRA, cdr3_TRB) combination.
df[df.clonotype=='clonotype30']

Unnamed: 0,gem,clonotype,cdr3_TRA,cdr3_TRB,peptide,epitope,epitope_rank
1365,GCGAGAAAGAACTGTA-1,clonotype30,CVVSAGGNARLMF,CASSEDRGPYNGETQYF,SLAAYIPRL,CLYBL,0.0
1037,CTAGAGTCAACGATCT-1,clonotype30,CLVGYNTDKLIF,CASSEDRGPYNGETQYF,VTEHDTLLY,v15,2.0
41,AACGTTGAGATATGGT-1,clonotype30,CVVSAGGNARLMF,CASSEDRGPYNGETQYF,RLPRIFCSC,MELPEP-041,
310,AGCAGCCTCCGCAAGC-1,clonotype30,CVVSAGGNARLMF,CASSEDRGPYNGETQYF,KTWGQYWQV,MELPEP-044,
