In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load data

In [3]:
df = pd.read_csv("src/clonotypes.csv", header=0) # columns: "clonotype_id", "frequency", "proportion", "cdr3s_aa", "cdr3s_nt"; the two cdr3s columns hold ";"-separated entries prefixed with "TRA:" / "TRB:"

In [5]:
# Load the barcode list as an array of strings.
# `np.str` was only an alias of the builtin `str`; it was deprecated in NumPy 1.20 and
# removed in 1.24, so the builtin is passed directly.
pmhc_barcodes = np.loadtxt("/Volumes/tuba/herpov/tcr-pmhc-sc-project/data/exp3_MHC/processed/longranger_clean/tmp/unique_gems_min_2.lst", dtype=str)

In [6]:
df.head()

Unnamed: 0,clonotype_id,frequency,proportion,cdr3s_aa,cdr3s_nt
0,clonotype1,461,0.109112,TRA:CAASNLVF;TRB:CASSLVVVDEQFF,TRA:TGTGCAGCAAGTAATCTGGTCTTT;TRB:TGTGCCAGCAGCT...
1,clonotype2,333,0.078817,TRA:CAAKSDSGGGADGLTF;TRB:CASSAWTSNRDEQFF,TRA:TGTGCAGCAAAATCGGATTCAGGAGGAGGTGCTGACGGACTC...
2,clonotype3,125,0.029586,TRA:CALYNTDKLIF;TRB:CASSPTSGSVYEQYF,TRA:TGTGCTCTTTATAACACCGACAAGCTCATCTTT;TRB:TGTG...
3,clonotype4,86,0.020355,TRA:CVYNQGGKLIF;TRB:CASSQARDPTGELFF,TRA:TGTGTTTATAACCAGGGAGGAAAGCTTATCTTC;TRB:TGTG...
4,clonotype5,73,0.017278,TRA:CAYRSEWRDDKIIF;TRB:CASSPPIAGGPYNEQFF,TRA:TGTGCTTATAGGAGCGAGTGGAGAGATGACAAGATCATCTTT...


In [7]:
pmhc_barcodes

array(['AAACCTGAGAATCTCC-1', 'AAACCTGAGAATTGTG-1', 'AAACCTGAGACAGGCT-1',
       ..., 'TTTGTCATCTTAACCT-1', 'TTTGTCATCTTCGGTC-1',
       'TTTGTCATCTTTAGTC-1'], dtype='<U18')

# Prepare data

Raw data consists of columns with multiple sequences of TRAs and TRBs. These sequences are split to form new columns. The number of sequences is not constant and therefore some of the produced columns may contain None values.

In [None]:
# Split every ";"-separated entry into its own column (no limit on the number of splits);
# rows with fewer entries than the widest row are padded with missing values.
# OBS! also split on tab?!
cdr3s_aa, cdr3s_nt = (
    df[raw_column].str.split(";", expand=True)
    for raw_column in ("cdr3s_aa", "cdr3s_nt")
)

In [None]:
# Report (rows, columns) of the raw frame and of the two split frames.
for label, frame in (
    ("Raw DF dimensions: ", df),
    ("CDR3s_aa dimensions: ", cdr3s_aa),
    ("CDR3s_nt dimensions: ", cdr3s_nt),
):
    print(label, frame.shape)

In [None]:
# Name the split columns by slot: aa0..aa5 for amino-acid entries, nt0..nt5 for
# nucleotide entries. Exactly six names are assigned, so a split that produced a
# different number of columns fails here with a length-mismatch error.
cdr3s_aa.columns = ['aa{}'.format(slot) for slot in range(6)]
cdr3s_nt.columns = ['nt{}'.format(slot) for slot in range(6)]

In [None]:
# Replace the two raw ";"-joined columns with their split versions.
# The drop is done on a temporary frame inside the expression, so `df` is only rebound
# when every step succeeds: re-running this cell fails with a KeyError (the raw columns
# are already gone) instead of first appending a second copy of the aa*/nt* columns.
df = pd.concat([df.drop(columns=["cdr3s_aa", "cdr3s_nt"]), cdr3s_aa, cdr3s_nt], axis=1)
df.head()

## Variables

In [None]:
# Positional (iloc) indexes of the split CDR3 columns inside df.
# They are looked up by column name rather than hard-coded, so they stay correct if df
# gains or loses a column in front of them. For the layout
# [clonotype_id, frequency, proportion, aa0..aa5, nt0..nt5] this yields 3..8 and 9..14.
aa_columns = [df.columns.get_loc(name) for name in cdr3s_aa.columns]
nt_columns = [df.columns.get_loc(name) for name in cdr3s_nt.columns]

# Counts of alpha & beta chains (aa) registered per clonotype

Count how many clonotypes are represented by each combination of TRA and TRB chain counts.
The counts are gathered in a dataframe df_counts.
The rows represent different numbers of TRAs.
The columns represent different numbers of TRBs.

In [None]:
# Contingency table: row = number of TRA entries, column = number of TRB entries.
# NB! Only 0-5 chains of each type fit; a clonotype with more would raise an IndexError below.
df_counts = pd.DataFrame(0, columns=[0,1,2,3,4,5], index=[0,1,2,3,4,5])

# Row positions of clonotypes that have only one chain type:
#   index_TRA -> at least one TRA entry and no TRB entry
#   index_TRB -> at least one TRB entry and no TRA entry
index_TRA = list()
index_TRB = list()

# NB! Assumes aa_columns holds the positions of the amino-acid chain columns.
for i in range(df.shape[0]):
    # Select row and columns in a single positional lookup. Indexing the row Series
    # with a list of integers (df.iloc[i][aa_columns]) depends on integer keys being
    # treated as positions on a label-indexed Series, which pandas has deprecated.
    aa_row = df.iloc[i, aa_columns]
    count_TRA = aa_row.str.startswith('TRA:', na=False).sum()
    count_TRB = aa_row.str.startswith('TRB:', na=False).sum()
    df_counts.iloc[count_TRA, count_TRB] += 1

    if count_TRA == 0 and count_TRB:
        index_TRB.append(i)
    elif count_TRB == 0 and count_TRA:
        index_TRA.append(i)

In [None]:
# Human-readable axis labels generated from the table's own shape
# ('0 TRBs', '1 TRB', '2 TRBs', ...): columns count TRBs, rows count TRAs.
df_counts.columns = ['{} TRB{}'.format(k, '' if k == 1 else 's') for k in range(df_counts.shape[1])]
df_counts.index = ['{} TRA{}'.format(k, '' if k == 1 else 's') for k in range(df_counts.shape[0])]
df_counts

In [None]:
# One group of bars per row of df_counts (TRA count), one bar per column (TRB count).
# Axis labels and a legend title are set so the figure can be read on its own.
ax = df_counts.plot.bar()
ax.set_xlabel("TRA chains per clonotype")
ax.set_ylabel("Number of clonotypes")
ax.legend(title="TRB chains per clonotype")
plt.show()

# Number of clonotypes with missing alpha chain and recurrent beta chain

In [None]:
df.iloc[index_TRA].head()

In [None]:
# For every nucleotide TRA sequence of a clonotype listed in index_TRA, collect the index
# labels of all other clonotypes (those not in index_TRA) that contain the exact same
# 'TRA:...' string in any column. One list is appended per sequence; an empty list means
# the sequence does not occur elsewhere.
# NOTE(review): index_TRA holds rows with TRA entries and no TRB entry, i.e. clonotypes
# missing the beta chain - confirm this matches the section heading this cell sits under.
recurrent_indexes = list()

# The comparison set does not change inside the loop, so it is built once; a set makes
# the membership test cheap. Like the lookups below, this treats df.index values as row
# positions (default RangeIndex).
single_chain_rows = set(index_TRA)
remaining_clones = df.iloc[[position for position in df.index if position not in single_chain_rows]]

for i in index_TRA:
    nt_row = df.iloc[i, nt_columns]
    cdr3_seqs = nt_row[nt_row.str.startswith('TRA:', na=False)].values
    for nt_seq in cdr3_seqs:
        assert nt_seq[0:4] == 'TRA:' and len(nt_seq) > 4, nt_seq[0:4]
        # axis must be passed by keyword: pandas >= 2.0 rejects the positional form any(1).
        has_sequence = remaining_clones.eq(nt_seq).any(axis=1)
        recurrent_indexes.append(list(remaining_clones[has_sequence].index))
recurrent_indexes

# Number of clonotypes with missing beta chain and recurrent alpha chain

In [None]:
# `lrange` is not defined or imported anywhere in this notebook (it was a helper in old
# pandas.compat); the builtin equivalent is used so the cell runs.
list(range(2))

# Trials and errors

# Counts of alpha chains (AA) registered per clonotype

def count_chains_per_clonotype(frame, columns, chain):
    """Count, for every row of `frame`, how many of `columns` hold a `chain` entry.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table with one clonotype per row.
    columns : list
        Labels of the columns to inspect; each holds strings or missing values.
    chain : str
        Prefix identifying the chain type, e.g. 'TRA' or 'TRB'.

    Returns
    -------
    pandas.Series
        Integer count per row, indexed like `frame`. Missing values never match.
    """
    hits = pd.Series(0, index=frame.index)
    for column in columns:
        hits = hits + frame[column].str.startswith(chain, na=False).astype(int)
    return hits


# The earlier version tested fixed prefix patterns (aa0 is a TRB, aa1 is not, ...), which
# only recognises a chain type when it occupies the leading columns. A clonotype such as
# "TRA:...;TRB:..." therefore fell into none of the beta buckets. Counting matches per row
# is independent of the position of the entries.
aa_names = ['aa{}'.format(slot) for slot in range(6)]
nt_names = ['nt{}'.format(slot) for slot in range(6)]

TRA_aa_per_clonotype = count_chains_per_clonotype(df, aa_names, 'TRA')
TRA_aa_0, TRA_aa_1, TRA_aa_2, TRA_aa_3, TRA_aa_4, TRA_aa_5, TRA_aa_6 = (
    int((TRA_aa_per_clonotype == k).sum()) for k in range(7)
)

for k, n_clonotypes in enumerate((TRA_aa_0, TRA_aa_1, TRA_aa_2, TRA_aa_3, TRA_aa_4, TRA_aa_5, TRA_aa_6)):
    print("# Clonotypes with %d alpha chain: %d" % (k, n_clonotypes))

# Counts of beta chains (AA) registered per clonotype

TRB_aa_per_clonotype = count_chains_per_clonotype(df, aa_names, 'TRB')
TRB_aa_0, TRB_aa_1, TRB_aa_2, TRB_aa_3, TRB_aa_4, TRB_aa_5, TRB_aa_6 = (
    int((TRB_aa_per_clonotype == k).sum()) for k in range(7)
)

for k, n_clonotypes in enumerate((TRB_aa_0, TRB_aa_1, TRB_aa_2, TRB_aa_3, TRB_aa_4, TRB_aa_5, TRB_aa_6)):
    print("# Clonotypes with %d beta chain: %d" % (k, n_clonotypes))

# Counts of alpha chains (nt) registered per clonotype

TRA_nt_per_clonotype = count_chains_per_clonotype(df, nt_names, 'TRA')
TRA_nt_0, TRA_nt_1, TRA_nt_2, TRA_nt_3, TRA_nt_4, TRA_nt_5, TRA_nt_6 = (
    int((TRA_nt_per_clonotype == k).sum()) for k in range(7)
)

for k, n_clonotypes in enumerate((TRA_nt_0, TRA_nt_1, TRA_nt_2, TRA_nt_3, TRA_nt_4, TRA_nt_5, TRA_nt_6)):
    print("# Clonotypes with %d alpha chain: %d" % (k, n_clonotypes))

# Number of beta chains (nt) registered in each clonotype

# As before, the TRB_nt_* names end up as boolean row masks (not totals), so they can
# also be used to select the matching clonotypes; the totals are summed when printing.
TRB_nt_per_clonotype = count_chains_per_clonotype(df, nt_names, 'TRB')
TRB_nt_0, TRB_nt_1, TRB_nt_2, TRB_nt_3, TRB_nt_4, TRB_nt_5, TRB_nt_6 = (
    TRB_nt_per_clonotype == k for k in range(7)
)

for k, row_mask in enumerate((TRB_nt_0, TRB_nt_1, TRB_nt_2, TRB_nt_3, TRB_nt_4, TRB_nt_5, TRB_nt_6)):
    print("# Clonotypes with %d beta chain: %d" % (k, sum(row_mask)))

In [None]:
import functools

# One boolean Series per amino-acid slot (True where the entry is a TRA chain),
# OR-ed together: a row is kept when any of its six slots holds a TRA entry.
tra_flags_per_slot = [
    df['aa{}'.format(slot)].str.startswith('TRA:', na=False)
    for slot in range(6)
]
mask = functools.reduce(np.logical_or, tra_flags_per_slot)
df.loc[mask].head()

In [None]:
# assuming AA seqs are in column 3-8
# NOTE(review): `i` is not assigned in this cell; it is whatever value an earlier loop
# left behind, so the result depends on which cells ran before.
# Row and columns are selected in one positional lookup: indexing the row Series with a
# list of integers relies on deprecated positional fallback.
sum(df.iloc[i, list(range(3,9))].str.startswith('TRA:', na=False))

In [None]:
cdr3s_aa.head()

In [None]:
cdr3s_aa[~cdr3s_aa.isnull()].tail()

def first_chain_sequence(parts, chain):
    """Return, per row of `parts`, the first sequence tagged with `chain`.

    Parameters
    ----------
    parts : pandas.DataFrame
        One column per chain entry; each cell is a string such as 'TRA:CAASNLVF'
        or a missing value.
    chain : str
        Chain tag without the colon, e.g. 'TRA' or 'TRB'.

    Returns
    -------
    pandas.Series
        Indexed like `parts`; the text after '<chain>:' of the first matching entry in
        column order, or None when the row has no entry of that chain type.
    """
    prefix = chain + ":"

    def pick(row):
        for entry in row:
            if isinstance(entry, str) and entry.startswith(prefix):
                return entry[len(prefix):]
        return None

    rows = parts.itertuples(index=False, name=None)
    return pd.Series([pick(row) for row in rows], index=parts.index, dtype=object)


# The chain is selected by its tag, not by column position. The split columns were
# renamed (so cdr3s_aa[0] no longer exists), and "first column = alpha, second = beta"
# does not hold for clonotypes with only a TRB entry or with several TRA entries.
df["cdr3s_aa_A"] = first_chain_sequence(cdr3s_aa, "TRA")
df["cdr3s_aa_B"] = first_chain_sequence(cdr3s_aa, "TRB")

df["cdr3s_nt_A"] = first_chain_sequence(cdr3s_nt, "TRA")
df["cdr3s_nt_B"] = first_chain_sequence(cdr3s_nt, "TRB")

In [None]:
df.head()

In [None]:
df.tail()

In [None]:
df.shape

In [None]:
# Summary of distinct values. nunique() ignores missing values, so a clonotype without an
# alpha (or beta) chain does not add a spurious extra "sequence" to the count, which
# .unique().shape[0] did.
print("# Entries:\t\t\t %d" %df.shape[0])
print("# Unique clonotypes:\t\t %d" %df["clonotype_id"].nunique())
print("# Unique alpha CDR3 (AA):\t %d" %df["cdr3s_aa_A"].nunique())
print("# Unique beta CDR3 (AA):\t %d" %df["cdr3s_aa_B"].nunique())
print("# Unique alpha CDR3 (nt):\t %d" %df["cdr3s_nt_A"].nunique())
print("# Unique beta CDR3 (nt):\t %d" %df["cdr3s_nt_B"].nunique())

In [None]:
# Number of clonotypes lacking each chain. The amino-acid and nucleotide lines used to
# print identical labels; they are now tagged (AA)/(nt) like the unique-count summary.
print("# Missing alpha (AA):\t\t %d" %df["cdr3s_aa_A"].isnull().sum())
print("# Missing alpha (nt):\t\t %d" %df["cdr3s_nt_A"].isnull().sum())
print("# Missing beta (AA):\t\t %d" %df["cdr3s_aa_B"].isnull().sum())
print("# Missing beta (nt):\t\t %d" %df["cdr3s_nt_B"].isnull().sum())

In [None]:
# Split the clonotypes on whether cdr3s_aa_B is missing, then count how many alpha
# sequences from the "beta missing" group also occur in the "beta present" group.
beta_missing = df["cdr3s_aa_B"].isnull()
without_beta = df[beta_missing]
with_beta = df[~beta_missing]

known_alpha_nt = with_beta["cdr3s_nt_A"].unique()
known_alpha_aa = with_beta["cdr3s_aa_A"].unique()

print("# recurrent TCA nt seqs:\t %d" %sum(without_beta["cdr3s_nt_A"].isin(known_alpha_nt)))
print("# recurrent TCA AA seqs:\t %d" %sum(without_beta["cdr3s_aa_A"].isin(known_alpha_aa)))

      

In [None]:
# For every clonotype, how many clonotypes carry the same nucleotide alpha / beta sequence.
# value_counts() gives one count per distinct sequence and map() looks that count up for
# each row, so the number lands on the row that owns the sequence. The previous
# groupby().size().reset_index() result was indexed 0..n_groups-1, so assigning it aligned
# the counts to unrelated rows. Rows with a missing sequence get NaN.
df['count_A_reoccurrence'] = df['cdr3s_nt_A'].map(df['cdr3s_nt_A'].value_counts())
df['count_B_reoccurrence'] = df['cdr3s_nt_B'].map(df['cdr3s_nt_B'].value_counts())

In [None]:
df.head()

In [None]:
df.sort_values('count_A_reoccurrence', ascending = False).head(40)

In [None]:
# Tiny two-row, two-column frame of single letters for trying out lookups.
test = pd.DataFrame([['a', 'a'], ['b', 'd']], columns=['A', 'B'])
test


In [None]:
# Rows in which any cell equals 'd'. The axis is passed by keyword because
# pandas >= 2.0 no longer accepts it positionally (any(1) raises TypeError).
test[test.eq('d').any(axis=1)]

In [None]:
test['A'].isin(test['B'])

In [None]:
test.iloc[[remaining_element for remaining_element in list(test.index) if remaining_element not in [1]]]

In [None]:
test.iloc[lambda x: x.index ]