In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import subprocess

In [5]:
plt.style.use('ggplot')

# Args

In [6]:
EXP = "exp3"
PLATFORM = "IONTORRENT"

In [7]:
MAPPING = 'BLAST' # BLAST
BARCODE_SYSTEM = 'AKB' #'AKB' #

## Input

In [8]:
IN_FILE = ("/Volumes/tuba/herpov/tcr-pmhc-sc-project/data/" +
           EXP + "_CAT_" + PLATFORM + "_" + MAPPING + "_" + BARCODE_SYSTEM +
           "/tables/tcr_barcode.clean.csv")

## Output
m1: all TCR CDR3s against all other CDR3s

m2: similarity between TCRs binding the sampe peptide against TCRs binding all other peptides

m3: similarity between peptides in cross-recognition against peptides uniquely recognized

In [6]:
OUT_DIR = ("/Volumes/tuba/herpov/tcr-pmhc-sc-project/data/" +
           EXP + "_CAT_" + PLATFORM + "_" + MAPPING + "_" + BARCODE_SYSTEM + "/similarity_assessment/")

In [7]:
m1_a = OUT_DIR + "m1.all.a.lst"
m1_b = OUT_DIR + "m1.all.b.lst"

In [8]:
## lav et dict med filehandles og filenames
#m2_a = OUT_DIR + "m2.p1.a.txt"
#m2_b = OUT_DIR + "m2.p1.b.txt"
#
#m2_a = OUT_DIR + "m2.pr.a.txt"
#m2_b = OUT_DIR + "m2.pr.b.txt"

In [9]:
## Lav et dict med filehandles
#m3_a = OUT_DIR + "m3.multiple.a.txt"
#m3_b = OUT_DIR + "m3.multiple.b.txt"
#
#peptide_singlet = OUT_DIR + "m3.singlet.txt"

# Import input data

In [9]:
df = pd.read_csv(IN_FILE)

# Main

In [11]:
df.shape

(2222, 41)

In [12]:
df.replace({'cdr3_TRA': "", 'cdr3_TRB': ""}, np.nan, inplace=True)
df.dropna(subset=(['cdr3_TRA', 'cdr3_TRB']), inplace=True)

In [13]:
df.shape

(1854, 41)

In [14]:
df.cdr3_TRA.describe()

count                 1854
unique                 556
top       CAAKSDSGGGADGLTF
freq                   211
Name: cdr3_TRA, dtype: object

In [15]:
df.cdr3_TRB.describe()

count                1854
unique                555
top       CASSAWTSNRDEQFF
freq                  216
Name: cdr3_TRB, dtype: object

In [16]:
df.columns

Index(['gem', 'clonotype', 'cdr3_TRA', 'cdr3_nt_TRA', 'cdr3_TRB',
       'cdr3_nt_TRB', 'umis_TRA', 'umis_diff_TRA', 'single_TRA',
       'umis_lst_TRA', 'cdr3_lst_TRA', 'umis_TRB', 'umis_diff_TRB',
       'single_TRB', 'umis_lst_TRB', 'cdr3_lst_TRB', 'credible_alignment_mhc',
       'credible_alignment_cd8', 'template_id_mhc', 'read_counts_mhc',
       'read_count_diff_mhc', 'single_barcode_mhc', 'read_counts_lst_mhc',
       'template_lst_mhc', 'template_id_cd8', 'read_counts_cd8',
       'read_count_diff_cd8', 'single_barcode_cd8', 'read_counts_lst_cd8',
       'template_lst_cd8', 'detected_response', 'peptide_assayed', 'peptide',
       'HLA', 'epitope', 'epitope_lst', 'Unnamed: 21', 'num_clonotype',
       'single_tcell', 'umis_tcr', 'peptide_HLA'],
      dtype='object')

In [17]:
df.sort_values(by="num_clonotype", inplace=True)

In [18]:
assert df[df.clonotype=='clonotype1'].groupby(['cdr3_TRA', 'cdr3_TRB']).size().values.shape == (1,), "clonotype was annotated with multiple cdr3 chains"

# Test

In [3]:
import os
import shutil
import subprocess
import tempfile
import threading
from contextlib import contextmanager    
import pandas as pd

@contextmanager
def named_pipes(count):
    dirname = tempfile.mkdtemp()
    try:
        paths = []
        for i in range(count):
            paths.append(os.path.join(dirname, 'named_pipe' + str(i)))
            os.mkfifo(paths[-1])
        yield paths
    finally:
        shutil.rmtree(dirname)

def write_command_input(df, path):
    df.to_csv(path, header=False,index=False, sep="\t")

dfA = pd.DataFrame(['CAVRSAYSGAG'])
dfB = pd.DataFrame(['CAARLIQGAQKLVF', 'CAGPSYNTDKLIF', 'CAMPNSGGYQKVTF', 'CAMNRDDKIIF', 'CAVRSAYSGAGSYQLTF'])

with named_pipes(2) as paths:
    p = subprocess.Popen(["cat"] + paths, stdout=subprocess.PIPE)
    with p.stdout:
        for df, path in zip([dfA, dfB], paths):
            t = threading.Thread(target=write_command_input, args=[df, path]) 
            t.daemon = True
            t.start()
        result = pd.read_csv(p.stdout, header=None, sep="\t")
p.wait()

print(result)

                   0
0        CAVRSAYSGAG
1     CAARLIQGAQKLVF
2      CAGPSYNTDKLIF
3     CAMPNSGGYQKVTF
4        CAMNRDDKIIF
5  CAVRSAYSGAGSYQLTF


## Run seq2score

In [19]:
import os
import shutil
import subprocess
import tempfile
import threading
from contextlib import contextmanager

In [56]:
#https://stackoverflow.com/questions/31589839/allowing-multiple-inputs-to-python-subprocess/31599261#31599261

@contextmanager
def named_pipes(count):
    dirname = tempfile.mkdtemp()
    try:
        paths = []
        for i in range(count):
            paths.append(os.path.join(dirname, 'named_pipe' + str(i)))
            os.mkfifo(paths[-1])
        yield paths
    finally:
        shutil.rmtree(dirname)

def write_command_input(lst, path):
    #lst.to_csv(path, header=False,index=False, sep="\t")
    np.savetxt(path, lst, fmt='%s')

In [32]:
lstS = df[df.num_clonotype==1].cdr3_TRA.unique() #pd.DataFrame([[1,2,3],[3,4,5]], columns=["A","B","C"])
lstM = df.cdr3_TRA.values #pd.DataFrame([[5,6,7],[6,7,8]], columns=["A","B","C"])

In [61]:
np.savetxt("/Volumes/tuba/herpov/tcr-pmhc-sc-project/notebooks/lstM.txt", lstM, fmt='%s')
np.savetxt("/Volumes/tuba/herpov/tcr-pmhc-sc-project/notebooks/lstS.txt", lstS, fmt='%s')

In [59]:
def run_seq2score():
    seq2score="./Volumes/tuba-nobackup/shared/seq2score_db_kernel"
    BLF="/Volumes/tuba-nobackup/shared/BLOSUM50"
    QIJ="/Volumes/tuba-nobackup/shared/blosum62.qij"
    
    cmd = seq2score + ' -blf ' + BLF + ' -blqij ' + QIJ + ' -pa '
    
    with named_pipes(2) as paths:
        p = subprocess.Popen([cmd] + paths, stdout=subprocess.PIPE, shell=True)
        with p.stdout:
            for lst, path in zip([lstS, lstM], paths):
                t = threading.Thread(target=write_command_input, args=[lst, path]) 
                t.daemon = True
                t.start()
            result = pd.read_csv(p.stdout, sep=" ", names=['seq1', 'seq2', 'similarity'], usecols=[1,2,3], comment='#')
    p.wait()
    return result

In [60]:
run_seq2score()

TypeError: can only concatenate str (not "list") to str

In [156]:
seq2score="/home/tuba/herpov/tcr-pmhc-sc-project/tools/seq2score_db_kernel"
BLF="/home/tuba/herpov/tcr-pmhc-sc-project/tools/BLOSUM50"
QIJ="/home/tuba/herpov/tcr-pmhc-sc-project/tools/blosum62.qij"

In [157]:
F1 = "/home/tuba/herpov/tcr-pmhc-sc-project/notebooks/lstS.txt"
F2 = "/home/tuba/herpov/tcr-pmhc-sc-project/notebooks/lstM.txt"

In [158]:
cmd = [seq2score, '-blf', BLF, '-blqij', QIJ, '-pa', F1, F2]

In [159]:
cmd

['/home/tuba/herpov/tcr-pmhc-sc-project/tools/seq2score_db_kernel',
 '-blf',
 '/home/tuba/herpov/tcr-pmhc-sc-project/tools/BLOSUM50',
 '-blqij',
 '/home/tuba/herpov/tcr-pmhc-sc-project/tools/blosum62.qij',
 '-pa',
 '/home/tuba/herpov/tcr-pmhc-sc-project/notebooks/lstM.txt',
 '/home/tuba/herpov/tcr-pmhc-sc-project/notebooks/lstS.txt']

In [146]:
result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)

In [149]:
result.stdout.decode('utf-8')

''

In [148]:
result.stderr

b'-blf: /Volumes/tuba/herpov/tcr-pmhc-sc-project/tools/seq2score_db_kernel: cannot execute binary file\n'

In [145]:
print(os.path.exists("/Volumes/tuba/herpov/tcr-pmhc-sc-project/tools/seq2score_db_kernel"))

True


In [154]:
def run_cmd(cmd, input_string=''):
        """Run the cmd with input_string as stdin and return output."""
        p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stdin=subprocess.PIPE,
                                                 stderr=subprocess.PIPE, universal_newlines=True, close_fds=True)
        out, stderr = p.communicate(input=input_string)
        if p.returncode:
                raise Exception('Cmd {} failed: {}'.format(cmd[0], stderr))
        return out

In [160]:
run_cmd(cmd)

FileNotFoundError: [Errno 2] No such file or directory: '/home/tuba/herpov/tcr-pmhc-sc-project/tools/seq2score_db_kernel': '/home/tuba/herpov/tcr-pmhc-sc-project/tools/seq2score_db_kernel'

In [47]:
df.cdr3_TRA.to_csv(m1_a, index=False, header=False)
df.cdr3_TRB.to_csv(m1_b, index=False, header=False)

# Stats

In [16]:
df.cdr3_TRA.drop_duplicates().describe()

count                465
unique               465
top       CAGKEVGSARQLTF
freq                   1
Name: cdr3_TRA, dtype: object

In [24]:
cdr_df = df.groupby(['epitope', 'cdr3_TRA', 'cdr3_TRB']).size().to_frame().rename(columns={0:'gem_count'})

In [39]:
cdr_df['clonotype'] = df.groupby(['epitope', 'cdr3_TRA', 'cdr3_TRB']).clonotype.apply(np.unique)
cdr_df['clonotype_count'] = df.groupby(['epitope', 'cdr3_TRA', 'cdr3_TRB']).clonotype.apply(lambda x: len(np.unique(x)))

In [40]:
cdr_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,gem_count,clonotype,clonotype_count
epitope,cdr3_TRA,cdr3_TRB,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CLYBL,CAAASGGSYIPTF,CASSAWTSNRDEQFF,1,[clonotype341],1
CLYBL,CAAKSDSGGGADGLTF,CASRLLAGVINEQFF,2,"[clonotype1107, clonotype931]",2
CLYBL,CAAKSDSGGGADGLTF,CASSAWTSNRDEQFF,90,"[clonotype1, clonotype11, clonotype13, clonoty...",10
CLYBL,CAAKSDSGGGADGLTF,CASSIAPQGWGYGYTF,1,[clonotype536],1
CLYBL,CAAKSDSGGGAEGLTF,CASSAWTSNRDEQFF,1,[clonotype719],1
CLYBL,CAASGRITGGGNKLTF,CASRVGVATEAFF,1,[clonotype90],1
CLYBL,CAFEAGNQFYF,CASRLLAGGQSTQYF,1,[clonotype925],1
CLYBL,CAFEAGNQFYF,CASRLLAGVINEQFF,1,[clonotype58],1
CLYBL,CAFRARVDYGGSQGNLIF,CSARDLLQRGGPYNEQFF,1,[clonotype29],1
CLYBL,CAGGDDYKLSF,CAPPRGREFF,1,[clonotype914],1


In [54]:
df.columns

Index(['gem', 'clonotype', 'cdr3_TRA', 'cdr3_nt_TRA', 'cdr3_TRB',
       'cdr3_nt_TRB', 'umis_TRA', 'umis_diff_TRA', 'single_TRA',
       'umis_lst_TRA', 'cdr3_lst_TRA', 'umis_TRB', 'umis_diff_TRB',
       'single_TRB', 'umis_lst_TRB', 'cdr3_lst_TRB', 'credible_alignment_mhc',
       'credible_alignment_cd8', 'template_id_mhc', 'read_counts_mhc',
       'read_count_diff_mhc', 'single_barcode_mhc', 'read_counts_lst_mhc',
       'template_lst_mhc', 'template_id_cd8', 'read_counts_cd8',
       'read_count_diff_cd8', 'single_barcode_cd8', 'read_counts_lst_cd8',
       'template_lst_cd8', 'detected_response', 'peptide_assayed', 'peptide',
       'HLA', 'epitope', 'epitope_lst', 'CLYBL', 'v13', 'v9', 'num_clonotype',
       'single_tcell', 'umis_tcr', 'peptide_HLA'],
      dtype='object')

In [59]:
df[(df.single_tcell == True) & (df.single_barcode_mhc == True) & (df.clonotype == 'clonotype1')]

Unnamed: 0,gem,clonotype,cdr3_TRA,cdr3_nt_TRA,cdr3_TRB,cdr3_nt_TRB,umis_TRA,umis_diff_TRA,single_TRA,umis_lst_TRA,...,HLA,epitope,epitope_lst,CLYBL,v13,v9,num_clonotype,single_tcell,umis_tcr,peptide_HLA
1,AAACCTGAGGATCGCA-1,clonotype1,CAAKSDSGGGADGLTF,TGTGCAGCAAAATCGGATTCAGGAGGAGGTGCTGACGGACTCACCTTT,CASSAWTSNRDEQFF,TGTGCCAGCAGCGCCTGGACTAGTAATCGGGATGAGCAGTTCTTC,10.0,1.0,True,[10],...,A0101,v9,['v9'],,,3.0,1,True,30.0,YSEHPTFTSQY A0101
2,AAACCTGAGGGATGGG-1,clonotype1,CAAKSDSGGGADGLTF,TGTGCAGCAAAATCGGATTCAGGAGGAGGTGCTGACGGACTCACCTTT,CASSAWTSNRDEQFF,TGTGCCAGCAGCGCCTGGACTAGTAATCGGGATGAGCAGTTCTTC,10.0,1.0,True,[10],...,A0101,v9,['v9'],,,2.0,1,True,44.0,YSEHPTFTSQY A0101
17,AAACGGGTCAATACCG-1,clonotype1,CAAKSDSGGGADGLTF,TGTGCAGCAAAATCGGATTCAGGAGGAGGTGCTGACGGACTCACCTTT,CASSAWTSNRDEQFF,TGTGCCAGCAGCGCCTGGACTAGTAATCGGGATGAGCAGTTCTTC,6.0,1.0,True,[6],...,A0101,v9,['v9'],,,1.0,1,True,22.0,YSEHPTFTSQY A0101
29,AAAGCAAAGCCGTCGT-1,clonotype1,CAAKSDSGGGADGLTF,TGTGCAGCAAAATCGGATTCAGGAGGAGGTGCTGACGGACTCACCTTT,CASSAWTSNRDEQFF,TGTGCCAGCAGCGCCTGGACTAGTAATCGGGATGAGCAGTTCTTC,12.0,1.0,True,[12],...,A0101,v9,['v9'],,,1.0,1,True,17.0,YSEHPTFTSQY A0101
30,AAAGCAAAGGTAAACT-1,clonotype1,CAAKSDSGGGADGLTF,TGTGCAGCAAAATCGGATTCAGGAGGAGGTGCTGACGGACTCACCTTT,CASSAWTSNRDEQFF,TGTGCCAGCAGCGCCTGGACTAGTAATCGGGATGAGCAGTTCTTC,4.0,1.0,True,[4],...,A0101,v9,['v9'],,,2.0,1,True,24.0,YSEHPTFTSQY A0101
33,AAAGCAAGTAGCAAAT-1,clonotype1,CAAKSDSGGGADGLTF,TGTGCAGCAAAATCGGATTCAGGAGGAGGTGCTGACGGACTCACCTTT,CASSAWTSNRDEQFF,TGTGCCAGCAGCGCCTGGACTAGTAATCGGGATGAGCAGTTCTTC,1.0,1.0,True,[1],...,A0101,v9,['v9'],,,1.0,1,True,10.0,YSEHPTFTSQY A0101
35,AAAGCAATCGTGGGAA-1,clonotype1,CAAKSDSGGGADGLTF,TGTGCAGCAAAATCGGATTCAGGAGGAGGTGCTGACGGACTCACCTTT,CASSAWTSNRDEQFF,TGTGCCAGCAGCGCCTGGACTAGTAATCGGGATGAGCAGTTCTTC,9.0,1.0,True,[9],...,A0101,v9,['v9'],,,6.0,1,True,32.0,YSEHPTFTSQY A0101
37,AAAGTAGAGCACACAG-1,clonotype1,CAAKSDSGGGADGLTF,TGTGCAGCAAAATCGGATTCAGGAGGAGGTGCTGACGGACTCACCTTT,CASSAWTSNRDEQFF,TGTGCCAGCAGCGCCTGGACTAGTAATCGGGATGAGCAGTTCTTC,4.0,1.0,True,[4],...,A0101,v9,['v9'],,,4.0,1,True,21.0,YSEHPTFTSQY A0101
42,AAAGTAGGTGTGTGCC-1,clonotype1,CAAKSDSGGGADGLTF,TGTGCAGCAAAATCGGATTCAGGAGGAGGTGCTGACGGACTCACCTTT,CASSAWTSNRDEQFF,TGTGCCAGCAGCGCCTGGACTAGTAATCGGGATGAGCAGTTCTTC,16.0,1.0,True,[16],...,A0101,v9,['v9'],,,5.0,1,True,50.0,YSEHPTFTSQY A0101
45,AAATGCCAGAATCTCC-1,clonotype1,CAAKSDSGGGADGLTF,TGTGCAGCAAAATCGGATTCAGGAGGAGGTGCTGACGGACTCACCTTT,CASSAWTSNRDEQFF,TGTGCCAGCAGCGCCTGGACTAGTAATCGGGATGAGCAGTTCTTC,6.0,1.0,True,[6],...,A0101,v9,['v9'],,,4.0,1,True,25.0,YSEHPTFTSQY A0101


In [64]:
df[df.clonotype == 'clonotype1'].groupby(['epitope']).apply(np.mean)

Unnamed: 0_level_0,umis_TRA,umis_diff_TRA,umis_TRB,umis_diff_TRB,credible_alignment_mhc,credible_alignment_cd8,read_counts_mhc,read_count_diff_mhc,single_barcode_mhc,read_counts_cd8,read_count_diff_cd8,single_barcode_cd8,detected_response,peptide_assayed,CLYBL,v13,v9,num_clonotype,single_tcell,umis_tcr
epitope,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
CLYBL,9.8125,0.987237,27.9625,0.987738,1.0,1.0,1.3875,0.501875,0.4375,216.0375,0.781837,0.1125,,1.0,1.3875,,,1.0,0.85,37.775
v13,9.065217,0.994087,24.543478,0.988457,1.0,1.0,1.369565,0.398043,0.369565,53.282609,0.788326,0.195652,,1.0,,1.369565,,1.0,0.869565,33.608696
v9,8.792035,0.990702,26.013274,0.983897,1.0,1.0,2.713654,0.963067,0.903919,65.342188,0.811047,0.094877,,1.0,,,2.713654,1.0,0.848925,34.80531
