The following script can only be run if pre-processed data is available. Otherwise, start from apply_stitchr_tanno.py

In [None]:
import pandas as pd
import pyrepseq as prs
import os
import numpy as np
# Step up one directory so that relative paths used later in this notebook
# (e.g. the 'data/...' output file) resolve from the parent folder.
# NOTE(review): presumably the parent is the repository root - confirm.
os.chdir('..')

In [2]:
# Folder containing the per-sample input files (this looks like a placeholder
# path - point it at the real data location before running).
f = '/path/to/folder/'

# Show which sample files are available in that folder.
os.listdir(f)

['A1 memory.txt',
 'A1 naive.txt',
 'A2 memory.txt',
 'A2 naive.txt',
 'B1 memory.txt',
 'B1 naive.txt',
 'B2 memory.txt',
 'B2 naive.txt',
 'C1 memory.txt',
 'C1 naive.txt',
 'C2 memory.txt',
 'C2 naive.txt',
 'D1 replicate.txt',
 'D1.txt',
 'D2 replicate.txt',
 'D2.txt',
 'E1 replicate.txt',
 'E1.txt',
 'E2 replicate.txt',
 'E2.txt',
 'F1.txt',
 'F2.txt',
 'X replicate.txt',
 'X.txt',
 'Y replicate.txt',
 'Y.txt',
 'Z replicate.txt',
 'Z.txt']

The following function allows me to do the same preprocessing as in Mayer and Callan, 2023. 

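This cleans up the dataset by: (1) removing invariant T cells (identified by their alpha-chain V/J gene pairs), (2) removing clonotypes from naive samples that also appear in the matching memory sample, and (3) removing repeated CDR3 nucleotide sequences, which are unlikely to have arisen independently and are probably a by-product of the PCR protocol.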

Function from: https://github.com/andim/paper_coincidences/blob/main/scripts/process_tanno_pruning.py

In [3]:
def df_pruning(f, myfile, *, invariants=True, memory_naive=True, unique_nt=True):
    """Load one tab-separated sample file and prune it.

    The table is sorted by the 'Clustered' column (largest first) and rows are
    kept only when ``prs.isvalidcdr3`` accepts both 'CDRL3_AA' and 'CDRH3_AA'.
    Up to three optional pruning steps follow, each reporting how many rows it
    removed via ``print``.

    Parameters
    ----------
    f : str
        Folder that contains the sample files.
    myfile : str
        File name of the sample inside ``f``.
    invariants : bool, keyword-only, default True
        Drop rows whose 'VL'/'JL' gene pair is one of the invariant V/J pairs
        listed in the function body.
    memory_naive : bool, keyword-only, default True
        If ``myfile`` contains 'naive', also read the file whose name has
        'naive' replaced by 'memory' and drop every row whose
        (CDRH3_AA, CDRL3_AA) pair occurs in that file.
    unique_nt : bool, keyword-only, default True
        Keep only the first row for each 'CDRH3_NT' value and then for each
        'CDRL3_NT' value. Because rows are sorted by descending 'Clustered',
        the first row is the one with the largest 'Clustered' value.

    Returns
    -------
    pandas.DataFrame
        The pruned table, with the same columns as the input file.

    Raises
    ------
    FileNotFoundError
        Propagated from ``pd.read_csv`` when the sample file (or, for naive
        samples, the matching memory file) does not exist.
    """
    # os.path.join works whether or not `f` ends with a path separator.
    df = pd.read_csv(os.path.join(f, myfile), sep='\t')
    print('Initial size: ', df.shape)

    # Sort by clone size so that later "keep first" de-duplication retains the
    # largest clone among duplicates.
    df = df.sort_values('Clustered', ascending=False)

    valid_light = df['CDRL3_AA'].apply(prs.isvalidcdr3)
    valid_heavy = df['CDRH3_AA'].apply(prs.isvalidcdr3)
    before = len(df)
    df = df[valid_light & valid_heavy]
    # rows before, invalid light CDR3s, invalid heavy CDR3s, rows after
    print(before, np.sum(~valid_light), np.sum(~valid_heavy), len(df))

    if invariants:
        # Remove the invariant cells, defined by the following alpha chain
        # V/J pairs.
        # NOTE(review): these are matched against the 'VL'/'JL' columns, so
        # the alpha chain is presumably stored in the "L" columns - confirm.
        invariant = [
            ('TRAV1-2', 'TRAJ33'),
            ('TRAV1-2', 'TRAJ12'),
            ('TRAV1-2', 'TRAJ20'),
            ('TRAV10', 'TRAJ18'),
        ]
        invariant_joined = [s1 + '_' + s2 for s1, s2 in invariant]

        mask = (df['VL'] + '_' + df['JL']).isin(invariant_joined)
        df = df[~mask]
        print('invariant filtered', mask.sum())

    if memory_naive and 'naive' in myfile:
        # Only naive samples are pruned against their memory counterpart.
        memory_file = myfile.replace('naive', 'memory')
        dfmem = pd.read_csv(os.path.join(f, memory_file), sep='\t')
        A = df['CDRH3_AA'] + '_' + df['CDRL3_AA']
        B = dfmem['CDRH3_AA'] + '_' + dfmem['CDRL3_AA']
        mask = A.isin(B)
        df = df[~mask]
        print('memory overlap filtered', mask.sum())

    if unique_nt:
        before = len(df)
        # Each chain is de-duplicated independently: a row is dropped when
        # either its heavy or its light nucleotide CDR3 was already seen.
        df = df.drop_duplicates('CDRH3_NT', keep='first')
        df = df.drop_duplicates('CDRL3_NT', keep='first')
        print('nt duplicates filtered', before - len(df))

    print('After preprocessing: ', df.shape)

    return df

In [4]:
# Prune every sample file in the folder and tag the rows with their origin.
alldf = []

for myfile in os.listdir(f):
    print(myfile)
    df = df_pruning(f, myfile)
    # The first two characters of the file name identify the subject; the
    # strip removes the trailing space or dot of one-letter subjects
    # (e.g. 'X.' or 'X ' -> 'X').
    df['subject'] = myfile[0:2].strip(' .')
    # Remove the '.txt' suffix explicitly. str.strip('.txt') would instead
    # strip any of the characters '.', 't', 'x' from BOTH ends of the name
    # (e.g. 'test.txt' -> 'es').
    df['sample'] = myfile[:-len('.txt')] if myfile.endswith('.txt') else myfile
    alldf.append(df)

A1 memory.txt
Initial size:  (28148, 12)
28148 212 72 27864
invariant filtered 136
nt duplicates filtered 2852
After preprocessing:  (24876, 12)
A1 naive.txt
Initial size:  (17274, 12)
17274 159 53 17063
invariant filtered 18
memory overlap filtered 48
nt duplicates filtered 2098
After preprocessing:  (14899, 12)
A2 memory.txt
Initial size:  (30121, 12)
30121 252 82 29788
invariant filtered 158
nt duplicates filtered 4242
After preprocessing:  (25388, 12)
A2 naive.txt
Initial size:  (42335, 12)
42335 487 156 41694
invariant filtered 53
memory overlap filtered 213
nt duplicates filtered 5510
After preprocessing:  (35918, 12)
B1 memory.txt
Initial size:  (25422, 12)
25422 146 32 25244
invariant filtered 35
nt duplicates filtered 2809
After preprocessing:  (22400, 12)
B1 naive.txt
Initial size:  (46343, 12)
46343 305 100 45940
invariant filtered 37
memory overlap filtered 159
nt duplicates filtered 5100
After preprocessing:  (40644, 12)
B2 memory.txt
Initial size:  (37531, 12)
37531 318 9

In [5]:
# Stack all per-sample tables into one. Row labels from the individual tables
# are kept as they are (no ignore_index), so index values can repeat.
DF = pd.concat(alldf)

In [6]:
# Save the combined table; pandas infers gzip compression from the '.gz'
# extension. The (possibly repeating) index is written as the first column.
DF.to_csv('data/Tanno_combined.csv.gz')