In [1]:
import pandas as pd
import numpy as np
import os
os.chdir('..')
import functions.myfunctions as mf
from itertools import combinations
from sklearn.metrics import mutual_info_score as MI
import tidytcells as tt
import seaborn as sns
import matplotlib.pyplot as plt

Mutual information (MI) between CDR3a and CDR3b positions

In [6]:
def cdr3_mi(a, b):
    """Pairwise mutual information between the columns of two 2-D arrays.

    Entry ``[i, j]`` of the returned array is ``MI(a[:, i], b[:, j])``, so the
    result has shape ``(a.shape[1], b.shape[1])``.
    """
    n_cols_a, n_cols_b = a.shape[1], b.shape[1]
    scores = np.zeros(shape=(n_cols_a, n_cols_b))
    # Visit every (column of a, column of b) combination in row-major order.
    for col_a, col_b in np.ndindex(n_cols_a, n_cols_b):
        scores[col_a, col_b] = MI(a[:, col_a], b[:, col_b])
    return scores

In [None]:
def save_cdr3_MI(mydict, n_reps=10):
    """Summarise CDR3a-CDR3b MI matrices and write the flattened matrices to CSV.

    Parameters
    ----------
    mydict : dict
        Two top-level keys, ``ep`` and ``ep + '_shuffle'``. Each maps a
        subsample size to a list of 2-D MI matrices (one per replicate).
    n_reps : int, default 10
        Number of replicate matrices expected for every subsample size of the
        real (non-shuffled) entry.

    Returns
    -------
    pandas.DataFrame
        One row per top-level key of ``mydict`` and one column per subsample
        size; every cell holds the list of per-replicate matrix sums. A
        ``'type'`` column is set to ``'cdr3a-cdr3b'``.

    Side effects
    ------------
    Writes every matrix, flattened to one row, together with ``'shuffle'``
    (``'real'`` / ``'shuffle'``), ``'subsample'`` and ``'epitope'`` columns to
    ``data/output/mutual_info/mutual_info_cdr3ab_array_<ep>_subsamples_endpadding.csv``
    (relative to the current working directory). The directory is created if
    it does not exist yet.
    """
    suffix = '_shuffle'

    # I want to save both the matrix and the sum: first the sum of each matrix.
    sums = {
        key: {ss: [m.sum() for m in mats] for ss, mats in by_size.items()}
        for key, by_size in mydict.items()  # either ep or ep + '_shuffle'
    }
    r = pd.DataFrame(sums).T
    r['type'] = 'cdr3a-cdr3b'

    # Strip only a trailing '_shuffle' so names that contain '_' themselves
    # are still recognised as a single epitope.
    ep = {k[:-len(suffix)] if k.endswith(suffix) else k for k in mydict}
    assert len(ep) == 1
    ep = next(iter(ep))

    # save all arrays: one frame per (subsample size, real/shuffle), stacked once
    frames = []
    for ss, real_mats in mydict[ep].items():
        assert len(real_mats) == n_reps
        shuffle_mats = mydict[ep + suffix][ss]
        for label, mats in (('real', real_mats), ('shuffle', shuffle_mats)):
            flat = np.stack([np.array(m).ravel() for m in mats], axis=0)
            assert flat.shape == (len(real_mats), mats[0].shape[0] * mats[0].shape[1])
            block = pd.DataFrame(flat)
            block['shuffle'] = label
            block['subsample'] = ss
            block['epitope'] = ep
            frames.append(block)
    # pd.concat rejects an empty list, so fall back to an empty frame.
    midf = pd.concat(frames) if frames else pd.DataFrame()

    out_path = 'data/output/mutual_info/mutual_info_cdr3ab_array_' + ep + '_subsamples_endpadding.csv'
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    midf.to_csv(out_path)

    return(r)

Mutual information (MI) involving V and J genes (gene-gene and gene-CDR3)

In [12]:
def cdr3_with_cat_MI(cat_series, s):
    """MI between one label vector and every column of a 2-D array.

    Returns a 1-D array with ``s.shape[1]`` entries; entry ``i`` is
    ``MI(cat_series, s[:, i])``.
    """
    n_positions = s.shape[1]
    return np.array([MI(cat_series, s[:, pos]) for pos in range(n_positions)])

In [13]:
def save_gene_MI(mydict, R, typestring):
    """Append one block of MI results to a running results table.

    ``mydict`` is converted to a DataFrame with its top-level keys as rows,
    tagged with ``typestring`` in a ``'type'`` column and stacked underneath
    ``R``. The combined table is returned; ``R`` itself is left untouched.
    """
    tagged = pd.DataFrame(mydict).T.assign(type=typestring)
    return pd.concat([R, tagged])

In [14]:
def save_gene_cdr3_MI(mydict, R, MIDF, typestring, epitope, n_reps=10):
    """Append gene-vs-CDR3 MI results to the summary and per-position tables.

    Parameters
    ----------
    mydict : dict
        Two top-level keys, ``ep`` and ``ep + '_shuffle'``. Each maps a
        subsample size to a list of 1-D MI vectors (one per replicate).
    R : pandas.DataFrame
        Running summary table; the per-replicate vector sums are appended.
    MIDF : pandas.DataFrame
        Running table of raw MI vectors; one row per replicate is appended.
    typestring : str
        Value written to the ``'type'`` column of the summary rows.
    epitope : str
        Value written to the ``'epitope'`` column of the vector rows.
    n_reps : int, default 10
        Number of replicate vectors expected for every subsample size of the
        real (non-shuffled) entry.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The extended ``R`` and ``MIDF``. The inputs are not modified.

    Notes
    -----
    Real rows carry ``'real'`` in the ``'shuffle'`` column and shuffled rows
    carry ``'shuffle'``, matching the labels ``save_cdr3_MI`` writes to its
    CSV. Without the explicit label the real rows end up with NaN there.
    """
    suffix = '_shuffle'

    # I want to save both the vectors and their sums: first the sum of each.
    sums = {
        key: {ss: [v.sum() for v in vecs] for ss, vecs in by_size.items()}
        for key, by_size in mydict.items()  # either ep or ep + '_shuffle'
    }
    r = pd.DataFrame(sums).T
    r['type'] = typestring
    R = pd.concat([R, r])

    # Strip only a trailing '_shuffle' so names that contain '_' themselves
    # are still recognised as a single epitope.
    ep = {k[:-len(suffix)] if k.endswith(suffix) else k for k in mydict}
    assert len(ep) == 1
    ep = next(iter(ep))

    # save all arrays: collect the pieces and concatenate once at the end
    frames = [MIDF]
    for ss, real_vecs in mydict[ep].items():
        assert len(real_vecs) == n_reps
        shuffle_vecs = mydict[ep + suffix][ss]
        for label, vecs in (('real', real_vecs), ('shuffle', shuffle_vecs)):
            flat = np.stack([np.array(v).ravel() for v in vecs], axis=0)
            assert flat.shape == (len(vecs), vecs[0].shape[0])
            block = pd.DataFrame(flat)
            block['epitope'] = epitope
            block['subsample'] = ss
            block['shuffle'] = label
            frames.append(block)
    MIDF = pd.concat(frames)

    return(R, MIDF)

In [20]:
# Every CDR3 is right-padded with '-' up to this fixed width.
CDR3_PAD_LEN = 19

vdj = pd.read_csv('data/vdj_cleaned_subset_for_MI.csv', index_col = 0)
# KLGGALQAK is excluded because it is too big - takes forever. The explicit
# copy makes the filtered frame independent of the frame that was read, so
# the column assignments below do not operate on a slice.
vdj = vdj.loc[vdj['Epitope'] != 'KLGGALQAK'].copy()

# Gap-free sequences and their lengths (chain a first, then chain b).
for chain in ('a', 'b'):
    vdj['cdr3' + chain] = vdj['cdr3' + chain + '_IMGTgaps'].str.replace('-', '', regex=False)
    vdj['len_cdr3' + chain + '_nogaps'] = vdj['cdr3' + chain].str.len()

# Pad at the end with '-' so that all sequences share the same width.
for chain in ('a', 'b'):
    vdj['cdr3' + chain + '_endpadding'] = vdj['cdr3' + chain].str.ljust(CDR3_PAD_LEN, '-')

# mf.prepare_data is a project helper; the '<col>_padded' columns used for
# de-duplication here are presumably added by it - confirm in functions/myfunctions.
vdj = mf.prepare_data(vdj, col1 = 'cdr3a_endpadding', col2='cdr3b_endpadding', type = 'cdr3').drop_duplicates(subset = ['cdr3a_endpadding_padded', 'cdr3b_endpadding_padded', 'Epitope'])
print(vdj.shape)

print(vdj['cdr3a_endpadding'])
print(vdj['cdr3b_endpadding'])

# A sequence longer than the pad width would be left unpadded; catch it here.
assert all([len(x) == CDR3_PAD_LEN for x in vdj['cdr3a_endpadding']])
assert all([len(x) == CDR3_PAD_LEN for x in vdj['cdr3b_endpadding']])

(9691, 43)
0        CAYTVLGNEKLTF------
1        CAVAGYGGSQGNLIF----
2        CAVSFGNEKLTF-------
3        CAVTHYGGSQGNLIF----
4        CAGGGGGADGLTF------
                ...         
24520    CAVMRGTALIF--------
24521    CAVPTYSSASKIIF-----
24522    CALSATSGTYKYIF-----
24523    CAALAGTASKLTF------
24524    CADPNTGNQFYF-------
Name: cdr3a_endpadding, Length: 9691, dtype: object
0        CASSFTPYNEQFF------
1        CASSPQGLGTEAFF-----
2        CAEGQGFVGQPQHF-----
3        CASLRSAVWADTQYF----
4        CASTLTGLGQPQHF-----
                ...         
24520    CSVVGTGGPETQYF-----
24521    CASSSGGYEQYF-------
24522    CASSPLSGTSATKETQYF-
24523    CSVVPLAGPYEQYF-----
24524    CASSRRWGSSYEQYF----
Name: cdr3b_endpadding, Length: 9691, dtype: object


In [21]:
# Standardise the V/J gene names with tidytcells at gene-level precision;
# entries that are not strings become pd.NA.
for c in ('V-a', 'V-b', 'J-a', 'J-b'):
    print(c)
    vdj[c] = vdj.apply(
        lambda row: (
            tt.tcr.standardise(
                gene_name=row[c],
                species=row['Species'],
                precision='gene'
            )
            if type(row[c]) == str
            else pd.NA
        ),
        axis=1
    )
# Keep only the rows in which all four gene columns are present.
vdj00 = vdj.dropna(subset=['V-a', 'V-b', 'J-a', 'J-b'])
print('shape before tidytcells: ', vdj.shape, '; shape after tidytcells: ', vdj00.shape)
vdj = vdj00.copy()

V-a
V-b
J-a
J-b
shape before tidytcells:  (9691, 43) ; shape after tidytcells:  (9691, 43)


In [22]:
# Number of rows per epitope in the frame kept after the dropna on the gene columns.
vdj00['Epitope'].value_counts()

GILGFVFTL     1853
AVFDRKSDAK    1699
RAKFKQLL      1195
IVTDFSVIK      704
RLRAEAQVK      412
ELAGIGILTV     380
NLVPMVATV      353
SSLENFRAYV     348
GLCTLVAML      343
YLQPRTFLL      333
TTDPSFLGRY     242
HGIRNASFI      240
LLWNGPMAV      235
CINGVCWTV      226
ASNENMETM      187
SSYRRPVGI      177
SPRWYFYYL      175
LSLRNPILV      127
SSPPMFRV       125
LTDEMIAQY      124
ATDALMTGF      123
KSKRTPMGF       90
Name: Epitope, dtype: int64

In [25]:
# Collects one summary DataFrame per epitope (and one for 'background');
# the cells below fill and extend it.
all_MI = dict()

In [26]:
# for each epitope
# NOTE(review): `subsamples` is not defined in any cell shown above; it has to
# exist before this cell runs. The printed keys suggest an ascending list of
# subsample sizes starting 2, 5, 10, ... - confirm and define it in a config cell.
n_reps = 10  # replicates per size; save_cdr3_MI asserts exactly this many
for ep in vdj['Epitope'].unique():
    shuffle_key = ep + '_shuffle'
    mutualInfo = {ep: {}, shuffle_key: {}}
    epset = vdj.loc[vdj['Epitope'] == ep]
    n_rows = epset.shape[0]
    print(ep, n_rows)

    # Use every configured size that is strictly smaller than the epitope set.
    # Iterating the list (rather than indexing subsamples[i] after each pass)
    # cannot run past its end when the set is larger than the last size.
    for s_size in subsamples:
        if s_size >= n_rows:
            break
        mutualInfo[ep][s_size] = []
        mutualInfo[shuffle_key][s_size] = []
        for j in range(n_reps):
            epset_sample = epset.sample(s_size)
            # Same rows in a new random order: used as the shuffled control.
            epset_sample1 = epset_sample.sample(s_size)
            a = np.array(epset_sample.cdr3a_endpadding_padded.str.split(', ', expand=True))
            b = np.array(epset_sample.cdr3b_endpadding_padded.str.split(', ', expand=True))
            b_s = np.array(epset_sample1.cdr3b_endpadding_padded.str.split(', ', expand=True))

            mutualInfo[ep][s_size].append(cdr3_mi(a, b))
            mutualInfo[shuffle_key][s_size].append(cdr3_mi(a, b_s))

    # finally calculate on entire set
    s_size = n_rows
    a = np.array(epset.cdr3a_endpadding_padded.str.split(', ', expand=True))
    b = np.array(epset.cdr3b_endpadding_padded.str.split(', ', expand=True))
    # The real MI on the full set is identical for every replicate, so it is
    # computed once and stored n_reps times (10 points are wanted for the fitting).
    mi = cdr3_mi(a, b)
    mutualInfo[ep][s_size] = [mi.copy() for _ in range(n_reps)]
    mutualInfo[shuffle_key][s_size] = []
    for j in range(n_reps):
        epset_sample1 = epset.sample(n_rows)
        b_s = np.array(epset_sample1.cdr3b_endpadding_padded.str.split(', ', expand=True))
        mutualInfo[shuffle_key][s_size].append(cdr3_mi(a, b_s))

    print(mutualInfo[ep].keys())
    r = save_cdr3_MI(mutualInfo)
    all_MI[ep] = r

ELAGIGILTV 380
dict_keys([2, 5, 10, 15, 20, 25, 35, 50, 80, 100, 150, 200, 300, 380])
GILGFVFTL 1853
dict_keys([2, 5, 10, 15, 20, 25, 35, 50, 80, 100, 150, 200, 300, 500, 1000, 1500, 1853])
NLVPMVATV 353
dict_keys([2, 5, 10, 15, 20, 25, 35, 50, 80, 100, 150, 200, 300, 353])
GLCTLVAML 343
dict_keys([2, 5, 10, 15, 20, 25, 35, 50, 80, 100, 150, 200, 300, 343])
SSLENFRAYV 348
dict_keys([2, 5, 10, 15, 20, 25, 35, 50, 80, 100, 150, 200, 300, 348])
ASNENMETM 187
dict_keys([2, 5, 10, 15, 20, 25, 35, 50, 80, 100, 150, 187])
YLQPRTFLL 333
dict_keys([2, 5, 10, 15, 20, 25, 35, 50, 80, 100, 150, 200, 300, 333])
LLWNGPMAV 235
dict_keys([2, 5, 10, 15, 20, 25, 35, 50, 80, 100, 150, 200, 235])
CINGVCWTV 226
dict_keys([2, 5, 10, 15, 20, 25, 35, 50, 80, 100, 150, 200, 226])
KSKRTPMGF 90
dict_keys([2, 5, 10, 15, 20, 25, 35, 50, 80, 90])
ATDALMTGF 123
dict_keys([2, 5, 10, 15, 20, 25, 35, 50, 80, 100, 123])
AVFDRKSDAK 1699
dict_keys([2, 5, 10, 15, 20, 25, 35, 50, 80, 100, 150, 200, 300, 500, 1000, 1500, 169

In [27]:
# for background
# NOTE(review): `subsamples` is not defined in any cell shown above; it has to
# exist before this cell runs. The printed keys suggest an ascending list of
# subsample sizes starting 2, 5, 10, ... - confirm and define it in a config cell.

ep = 'background'
shuffle_key = ep + '_shuffle'
n_reps = 10  # replicates per size; save_cdr3_MI asserts exactly this many
mutualInfo = {ep: {}, shuffle_key: {}}
epset = vdj.copy()
n_rows = epset.shape[0]
print(ep, n_rows)

# Use every configured size that is strictly smaller than the full table.
# Iterating the list (rather than indexing subsamples[i] after each pass)
# cannot run past its end when the table is larger than the last size.
for s_size in subsamples:
    if s_size >= n_rows:
        break
    mutualInfo[ep][s_size] = []
    mutualInfo[shuffle_key][s_size] = []
    for j in range(n_reps):
        epset_sample = epset.sample(s_size)
        # Same rows in a new random order: used as the shuffled control.
        epset_sample1 = epset_sample.sample(s_size)
        a = np.array(epset_sample.cdr3a_endpadding_padded.str.split(', ', expand=True))
        b = np.array(epset_sample.cdr3b_endpadding_padded.str.split(', ', expand=True))
        b_s = np.array(epset_sample1.cdr3b_endpadding_padded.str.split(', ', expand=True))

        mutualInfo[ep][s_size].append(cdr3_mi(a, b))
        mutualInfo[shuffle_key][s_size].append(cdr3_mi(a, b_s))

# finally calculate on entire set

s_size = n_rows
a = np.array(epset.cdr3a_endpadding_padded.str.split(', ', expand=True))
b = np.array(epset.cdr3b_endpadding_padded.str.split(', ', expand=True))
# The real MI on the full set is identical for every replicate, so it is
# computed once and stored n_reps times (10 points are wanted for the fitting).
mi = cdr3_mi(a, b)
mutualInfo[ep][s_size] = [mi.copy() for _ in range(n_reps)]
mutualInfo[shuffle_key][s_size] = []
for j in range(n_reps):
    epset_sample1 = epset.sample(n_rows)
    b_s = np.array(epset_sample1.cdr3b_endpadding_padded.str.split(', ', expand=True))
    mutualInfo[shuffle_key][s_size].append(cdr3_mi(a, b_s))

print(mutualInfo[ep].keys())
r = save_cdr3_MI(mutualInfo)
all_MI[ep] = r

background 9691
dict_keys([2, 5, 10, 15, 20, 25, 35, 50, 80, 100, 150, 200, 300, 500, 1000, 1500, 2000, 2500, 3000, 5000, 9691])


In [28]:
# Display the collected summary tables (one DataFrame per key); the output is long.
all_MI

{'ELAGIGILTV':                                                                     2  \
 ELAGIGILTV          [99.81319400063211, 135.85684738974928, 49.906...   
 ELAGIGILTV_shuffle  [99.81319400063211, 135.85684738974928, 49.906...   
 
                                                                     5  \
 ELAGIGILTV          [126.42528425330619, 169.53743126468737, 125.2...   
 ELAGIGILTV_shuffle  [126.66585311149542, 173.62293431997756, 123.5...   
 
                                                                    10  \
 ELAGIGILTV          [197.6311289507056, 133.41103209830243, 160.17...   
 ELAGIGILTV_shuffle  [197.92223211673803, 140.70321588793905, 157.6...   
 
                                                                    15  \
 ELAGIGILTV          [199.39031937506383, 156.66069910815497, 156.8...   
 ELAGIGILTV_shuffle  [193.53907865180634, 146.97498984821877, 161.8...   
 
                                                                    20  \
 ELAGIGILTV     

In [29]:
# One (initially empty) accumulator table per gene-vs-CDR3 pairing.
gene_cdr3_results = {
    pairing: pd.DataFrame()
    for pairing in (
        'Va-CDR3a', 'Vb-CDR3b', 'Ja-CDR3a', 'Jb-CDR3b',
        'Vb-CDR3a', 'Va-CDR3b', 'Jb-CDR3a', 'Ja-CDR3b',
    )
}

In [30]:
# for each epitope
for ep in vdj['Epitope'].unique():
    mutualInfo_VaVb = {ep:{}, ep+'_shuffle':{}}
    mutualInfo_JaJb = {ep:{}, ep+'_shuffle':{}}
    mutualInfo_VaJa = {ep:{}, ep+'_shuffle':{}}
    mutualInfo_VbJb = {ep:{}, ep+'_shuffle':{}}
    mutualInfo_VaJb = {ep:{}, ep+'_shuffle':{}}
    mutualInfo_VbJa = {ep:{}, ep+'_shuffle':{}}
    mutualInfo_VaCDR3a = {ep:{}, ep+'_shuffle':{}}
    mutualInfo_JaCDR3a = {ep:{}, ep+'_shuffle':{}}
    mutualInfo_VbCDR3a = {ep:{}, ep+'_shuffle':{}}
    mutualInfo_JbCDR3a = {ep:{}, ep+'_shuffle':{}}
    mutualInfo_VaCDR3b = {ep:{}, ep+'_shuffle':{}}
    mutualInfo_JaCDR3b = {ep:{}, ep+'_shuffle':{}}
    mutualInfo_VbCDR3b = {ep:{}, ep+'_shuffle':{}}
    mutualInfo_JbCDR3b = {ep:{}, ep+'_shuffle':{}}

    epset = vdj.loc[vdj['Epitope'] == ep]
    print(ep, epset.shape[0])
    i = 0
    s_size = subsamples[0]
    while s_size < epset.shape[0]:
        mutualInfo_VaVb[ep][s_size] = []
        mutualInfo_JaJb[ep][s_size] = []
        mutualInfo_VaJa[ep][s_size] = []
        mutualInfo_VbJb[ep][s_size] = []
        mutualInfo_VaJb[ep][s_size] = []
        mutualInfo_VbJa[ep][s_size] = []
        mutualInfo_VaCDR3a[ep][s_size] = []
        mutualInfo_JaCDR3a[ep][s_size] = []
        mutualInfo_VbCDR3a[ep][s_size] = []
        mutualInfo_JbCDR3a[ep][s_size] = []
        mutualInfo_VaCDR3b[ep][s_size] = []
        mutualInfo_JaCDR3b[ep][s_size] = []
        mutualInfo_VbCDR3b[ep][s_size] = []
        mutualInfo_JbCDR3b[ep][s_size] = []
        
        mutualInfo_VaVb[ep+'_shuffle'][s_size] = []
        mutualInfo_JaJb[ep+'_shuffle'][s_size] = []
        mutualInfo_VaJa[ep+'_shuffle'][s_size] = []
        mutualInfo_VbJb[ep+'_shuffle'][s_size] = []
        mutualInfo_VaJb[ep+'_shuffle'][s_size] = []
        mutualInfo_VbJa[ep+'_shuffle'][s_size] = []
        mutualInfo_VaCDR3a[ep+'_shuffle'][s_size] = []
        mutualInfo_JaCDR3a[ep+'_shuffle'][s_size] = []
        mutualInfo_VbCDR3a[ep+'_shuffle'][s_size] = []
        mutualInfo_JbCDR3a[ep+'_shuffle'][s_size] = []
        mutualInfo_VaCDR3b[ep+'_shuffle'][s_size] = []
        mutualInfo_JaCDR3b[ep+'_shuffle'][s_size] = []
        mutualInfo_VbCDR3b[ep+'_shuffle'][s_size] = []
        mutualInfo_JbCDR3b[ep+'_shuffle'][s_size] = []

        # print(s_size)
        for j in range(10):
            # print(j)
            epset_sample = epset.sample(s_size)
            epset_sample1 = epset_sample.sample(s_size)
            a = np.array(epset_sample.cdr3a_endpadding_padded.str.split(', ', expand=True))
            b = np.array(epset_sample.cdr3b_endpadding_padded.str.split(', ', expand=True))
            a_s = np.array(epset_sample1.cdr3a_endpadding_padded.str.split(', ', expand=True))
            b_s = np.array(epset_sample1.cdr3b_endpadding_padded.str.split(', ', expand=True))

            mutualInfo_VaVb[ep][s_size].append(MI(epset_sample['V-a'], epset_sample['V-b']))
            mutualInfo_JaJb[ep][s_size].append(MI(epset_sample['J-a'], epset_sample['J-b']))
            mutualInfo_VaJa[ep][s_size].append(MI(epset_sample['V-a'], epset_sample['J-a']))
            mutualInfo_VbJb[ep][s_size].append(MI(epset_sample['V-b'], epset_sample['J-b']))
            mutualInfo_VaJb[ep][s_size].append(MI(epset_sample['V-a'], epset_sample['J-b']))
            mutualInfo_VbJa[ep][s_size].append(MI(epset_sample['V-b'], epset_sample['J-a']))
            mutualInfo_VaCDR3a[ep][s_size].append(cdr3_with_cat_MI(epset_sample['V-a'], a))
            mutualInfo_JaCDR3a[ep][s_size].append(cdr3_with_cat_MI(epset_sample['J-a'], a))
            mutualInfo_VbCDR3a[ep][s_size].append(cdr3_with_cat_MI(epset_sample['V-b'], a))
            mutualInfo_JbCDR3a[ep][s_size].append(cdr3_with_cat_MI(epset_sample['J-b'], a))
            mutualInfo_VaCDR3b[ep][s_size].append(cdr3_with_cat_MI(epset_sample['V-a'], b))
            mutualInfo_JaCDR3b[ep][s_size].append(cdr3_with_cat_MI(epset_sample['J-a'], b))
            mutualInfo_VbCDR3b[ep][s_size].append(cdr3_with_cat_MI(epset_sample['V-b'], b))
            mutualInfo_JbCDR3b[ep][s_size].append(cdr3_with_cat_MI(epset_sample['J-b'], b))

            mutualInfo_VaVb[ep+'_shuffle'][s_size].append(MI(epset_sample['V-a'], epset_sample1['V-b']))
            mutualInfo_JaJb[ep+'_shuffle'][s_size].append(MI(epset_sample['J-a'], epset_sample1['J-b']))
            mutualInfo_VaJa[ep+'_shuffle'][s_size].append(MI(epset_sample['V-a'], epset_sample1['J-a']))
            mutualInfo_VbJb[ep+'_shuffle'][s_size].append(MI(epset_sample['V-b'], epset_sample1['J-b']))
            mutualInfo_VaJb[ep+'_shuffle'][s_size].append(MI(epset_sample['V-a'], epset_sample1['J-b']))
            mutualInfo_VbJa[ep+'_shuffle'][s_size].append(MI(epset_sample['V-b'], epset_sample1['J-a']))
            mutualInfo_VaCDR3a[ep+'_shuffle'][s_size].append(cdr3_with_cat_MI(epset_sample['V-a'], a_s))
            mutualInfo_JaCDR3a[ep+'_shuffle'][s_size].append(cdr3_with_cat_MI(epset_sample['J-a'], a_s))
            mutualInfo_VbCDR3a[ep+'_shuffle'][s_size].append(cdr3_with_cat_MI(epset_sample['V-b'], a_s))
            mutualInfo_JbCDR3a[ep+'_shuffle'][s_size].append(cdr3_with_cat_MI(epset_sample['J-b'], a_s))
            mutualInfo_VaCDR3b[ep+'_shuffle'][s_size].append(cdr3_with_cat_MI(epset_sample['V-a'], b_s))
            mutualInfo_JaCDR3b[ep+'_shuffle'][s_size].append(cdr3_with_cat_MI(epset_sample['J-a'], b_s))
            mutualInfo_VbCDR3b[ep+'_shuffle'][s_size].append(cdr3_with_cat_MI(epset_sample['V-b'], b_s))
            mutualInfo_JbCDR3b[ep+'_shuffle'][s_size].append(cdr3_with_cat_MI(epset_sample['J-b'], b_s))

        i += 1
        s_size = subsamples[i]
    
    # finally calculate on entire set
    a = np.array(epset.cdr3a_endpadding_padded.str.split(', ', expand=True))
    b = np.array(epset.cdr3b_endpadding_padded.str.split(', ', expand=True))    

    epset_sample = epset.copy()
    s_size = epset.shape[0]
    mutualInfo_VaVb[ep][s_size] = []
    mutualInfo_JaJb[ep][s_size] = []
    mutualInfo_VaJa[ep][s_size] = []
    mutualInfo_VbJb[ep][s_size] = []
    mutualInfo_VaJb[ep][s_size] = []
    mutualInfo_VbJa[ep][s_size] = []
    mutualInfo_VaCDR3a[ep][s_size] = []
    mutualInfo_JaCDR3a[ep][s_size] = []
    mutualInfo_VbCDR3a[ep][s_size] = []
    mutualInfo_JbCDR3a[ep][s_size] = []
    mutualInfo_VaCDR3b[ep][s_size] = []
    mutualInfo_JaCDR3b[ep][s_size] = []
    mutualInfo_VbCDR3b[ep][s_size] = []
    mutualInfo_JbCDR3b[ep][s_size] = []

    mutualInfo_VaVb[ep+'_shuffle'][s_size] = []
    mutualInfo_JaJb[ep+'_shuffle'][s_size] = []
    mutualInfo_VaJa[ep+'_shuffle'][s_size] = []
    mutualInfo_VbJb[ep+'_shuffle'][s_size] = []
    mutualInfo_VaJb[ep+'_shuffle'][s_size] = []
    mutualInfo_VbJa[ep+'_shuffle'][s_size] = []
    mutualInfo_VaCDR3a[ep+'_shuffle'][s_size] = []
    mutualInfo_JaCDR3a[ep+'_shuffle'][s_size] = []
    mutualInfo_VbCDR3a[ep+'_shuffle'][s_size] = []
    mutualInfo_JbCDR3a[ep+'_shuffle'][s_size] = []
    mutualInfo_VaCDR3b[ep+'_shuffle'][s_size] = []
    mutualInfo_JaCDR3b[ep+'_shuffle'][s_size] = []
    mutualInfo_VbCDR3b[ep+'_shuffle'][s_size] = []
    mutualInfo_JbCDR3b[ep+'_shuffle'][s_size] = []
    
    for j in range(10):
        # putting also real calcs in here because I want 10 points for the fitting
        mutualInfo_VaVb[ep][s_size].append(MI(epset_sample['V-a'], epset_sample['V-b']))
        mutualInfo_JaJb[ep][s_size].append(MI(epset_sample['J-a'], epset_sample['J-b']))
        mutualInfo_VaJa[ep][s_size].append(MI(epset_sample['V-a'], epset_sample['J-a']))
        mutualInfo_VbJb[ep][s_size].append(MI(epset_sample['V-b'], epset_sample['J-b']))
        mutualInfo_VaJb[ep][s_size].append(MI(epset_sample['V-a'], epset_sample['J-b']))
        mutualInfo_VbJa[ep][s_size].append(MI(epset_sample['V-b'], epset_sample['J-a']))
        mutualInfo_VaCDR3a[ep][s_size].append(cdr3_with_cat_MI(epset_sample['V-a'], a))
        mutualInfo_JaCDR3a[ep][s_size].append(cdr3_with_cat_MI(epset_sample['J-a'], a))
        mutualInfo_VbCDR3a[ep][s_size].append(cdr3_with_cat_MI(epset_sample['V-b'], a))
        mutualInfo_JbCDR3a[ep][s_size].append(cdr3_with_cat_MI(epset_sample['J-b'], a))
        mutualInfo_VaCDR3b[ep][s_size].append(cdr3_with_cat_MI(epset_sample['V-a'], b))
        mutualInfo_JaCDR3b[ep][s_size].append(cdr3_with_cat_MI(epset_sample['J-a'], b))
        mutualInfo_VbCDR3b[ep][s_size].append(cdr3_with_cat_MI(epset_sample['V-b'], b))
        mutualInfo_JbCDR3b[ep][s_size].append(cdr3_with_cat_MI(epset_sample['J-b'], b))
        epset_sample1 = epset.sample(epset.shape[0])
        a_s = np.array(epset_sample1.cdr3a_endpadding_padded.str.split(', ', expand=True))
        b_s = np.array(epset_sample1.cdr3b_endpadding_padded.str.split(', ', expand=True))

        mutualInfo_VaVb[ep+'_shuffle'][s_size].append(MI(epset_sample['V-a'], epset_sample1['V-b']))
        mutualInfo_JaJb[ep+'_shuffle'][s_size].append(MI(epset_sample['J-a'], epset_sample1['J-b']))
        mutualInfo_VaJa[ep+'_shuffle'][s_size].append(MI(epset_sample['V-a'], epset_sample1['J-a']))
        mutualInfo_VbJb[ep+'_shuffle'][s_size].append(MI(epset_sample['V-b'], epset_sample1['J-b']))
        mutualInfo_VaJb[ep+'_shuffle'][s_size].append(MI(epset_sample['V-a'], epset_sample1['J-b']))
        mutualInfo_VbJa[ep+'_shuffle'][s_size].append(MI(epset_sample['V-b'], epset_sample1['J-a']))
        mutualInfo_VaCDR3a[ep+'_shuffle'][s_size].append(cdr3_with_cat_MI(epset_sample['V-a'], a_s))
        mutualInfo_JaCDR3a[ep+'_shuffle'][s_size].append(cdr3_with_cat_MI(epset_sample['J-a'], a_s))
        mutualInfo_VbCDR3a[ep+'_shuffle'][s_size].append(cdr3_with_cat_MI(epset_sample['V-b'], a_s))
        mutualInfo_JbCDR3a[ep+'_shuffle'][s_size].append(cdr3_with_cat_MI(epset_sample['J-b'], a_s))
        mutualInfo_VaCDR3b[ep+'_shuffle'][s_size].append(cdr3_with_cat_MI(epset_sample['V-a'], b_s))
        mutualInfo_JaCDR3b[ep+'_shuffle'][s_size].append(cdr3_with_cat_MI(epset_sample['J-a'], b_s))
        mutualInfo_VbCDR3b[ep+'_shuffle'][s_size].append(cdr3_with_cat_MI(epset_sample['V-b'], b_s))
        mutualInfo_JbCDR3b[ep+'_shuffle'][s_size].append(cdr3_with_cat_MI(epset_sample['J-b'], b_s))
    
    all_MI[ep] = save_gene_MI(mutualInfo_VaVb, all_MI[ep], 'Va-Vb')
    all_MI[ep] = save_gene_MI(mutualInfo_JaJb, all_MI[ep], 'Ja-Jb')
    all_MI[ep] = save_gene_MI(mutualInfo_VaJa, all_MI[ep], 'Va-Ja')
    all_MI[ep] = save_gene_MI(mutualInfo_VbJb, all_MI[ep], 'Vb-Jb')
    all_MI[ep] = save_gene_MI(mutualInfo_VaJb, all_MI[ep], 'Va-Jb')
    all_MI[ep] = save_gene_MI(mutualInfo_VbJa, all_MI[ep], 'Vb-Ja')
    
    
    all_MI[ep], gene_cdr3_results['Va-CDR3a'] = save_gene_cdr3_MI(mutualInfo_VaCDR3a, 
                                                all_MI[ep], gene_cdr3_results['Va-CDR3a'], 'Va-CDR3a', ep)
    all_MI[ep], gene_cdr3_results['Vb-CDR3b'] = save_gene_cdr3_MI(mutualInfo_VbCDR3b, 
                                                all_MI[ep], gene_cdr3_results['Vb-CDR3b'], 'Vb-CDR3b', ep)
    all_MI[ep], gene_cdr3_results['Ja-CDR3a'] = save_gene_cdr3_MI(mutualInfo_JaCDR3a, 
                                                all_MI[ep], gene_cdr3_results['Ja-CDR3a'], 'Ja-CDR3a', ep)
    all_MI[ep], gene_cdr3_results['Jb-CDR3b'] = save_gene_cdr3_MI(mutualInfo_JbCDR3b, 
                                                all_MI[ep], gene_cdr3_results['Jb-CDR3b'], 'Jb-CDR3b', ep)
    
    all_MI[ep], gene_cdr3_results['Va-CDR3b'] = save_gene_cdr3_MI(mutualInfo_VaCDR3b, 
                                                all_MI[ep], gene_cdr3_results['Va-CDR3b'], 'Va-CDR3b', ep)
    all_MI[ep], gene_cdr3_results['Vb-CDR3a'] = save_gene_cdr3_MI(mutualInfo_VbCDR3a, 
                                                all_MI[ep], gene_cdr3_results['Vb-CDR3a'], 'Vb-CDR3a', ep)
    all_MI[ep], gene_cdr3_results['Ja-CDR3b'] = save_gene_cdr3_MI(mutualInfo_JaCDR3b, 
                                                all_MI[ep], gene_cdr3_results['Ja-CDR3b'], 'Ja-CDR3b', ep)
    all_MI[ep], gene_cdr3_results['Jb-CDR3a'] = save_gene_cdr3_MI(mutualInfo_JbCDR3a, 
                                                all_MI[ep], gene_cdr3_results['Jb-CDR3a'], 'Jb-CDR3a', ep)

ELAGIGILTV 380
GILGFVFTL 1853
NLVPMVATV 353
GLCTLVAML 343
SSLENFRAYV 348
ASNENMETM 187
YLQPRTFLL 333
LLWNGPMAV 235
CINGVCWTV 226
KSKRTPMGF 90
ATDALMTGF 123
AVFDRKSDAK 1699
IVTDFSVIK 704
RAKFKQLL 1195
RLRAEAQVK 412
HGIRNASFI 240
SSPPMFRV 125
LSLRNPILV 127
SSYRRPVGI 177
LTDEMIAQY 124
TTDPSFLGRY 242
SPRWYFYYL 175


In [31]:
# for background

ep = 'background'

mutualInfo_VaVb = {ep:{}, ep+'_shuffle':{}}
mutualInfo_JaJb = {ep:{}, ep+'_shuffle':{}}
mutualInfo_VaJa = {ep:{}, ep+'_shuffle':{}}
mutualInfo_VbJb = {ep:{}, ep+'_shuffle':{}}
mutualInfo_VaJb = {ep:{}, ep+'_shuffle':{}}
mutualInfo_VbJa = {ep:{}, ep+'_shuffle':{}}
mutualInfo_VaCDR3a = {ep:{}, ep+'_shuffle':{}}
mutualInfo_JaCDR3a = {ep:{}, ep+'_shuffle':{}}
mutualInfo_VbCDR3a = {ep:{}, ep+'_shuffle':{}}
mutualInfo_JbCDR3a = {ep:{}, ep+'_shuffle':{}}
mutualInfo_VaCDR3b = {ep:{}, ep+'_shuffle':{}}
mutualInfo_JaCDR3b = {ep:{}, ep+'_shuffle':{}}
mutualInfo_VbCDR3b = {ep:{}, ep+'_shuffle':{}}
mutualInfo_JbCDR3b = {ep:{}, ep+'_shuffle':{}}

epset = vdj.copy()
print(ep, epset.shape[0])
i = 0
s_size = subsamples[0]
while s_size < epset.shape[0]:
    mutualInfo_VaVb[ep][s_size] = []
    mutualInfo_JaJb[ep][s_size] = []
    mutualInfo_VaJa[ep][s_size] = []
    mutualInfo_VbJb[ep][s_size] = []
    mutualInfo_VaJb[ep][s_size] = []
    mutualInfo_VbJa[ep][s_size] = []
    mutualInfo_VaCDR3a[ep][s_size] = []
    mutualInfo_JaCDR3a[ep][s_size] = []
    mutualInfo_VbCDR3a[ep][s_size] = []
    mutualInfo_JbCDR3a[ep][s_size] = []
    mutualInfo_VaCDR3b[ep][s_size] = []
    mutualInfo_JaCDR3b[ep][s_size] = []
    mutualInfo_VbCDR3b[ep][s_size] = []
    mutualInfo_JbCDR3b[ep][s_size] = []
    
    mutualInfo_VaVb[ep+'_shuffle'][s_size] = []
    mutualInfo_JaJb[ep+'_shuffle'][s_size] = []
    mutualInfo_VaJa[ep+'_shuffle'][s_size] = []
    mutualInfo_VbJb[ep+'_shuffle'][s_size] = []
    mutualInfo_VaJb[ep+'_shuffle'][s_size] = []
    mutualInfo_VbJa[ep+'_shuffle'][s_size] = []
    mutualInfo_VaCDR3a[ep+'_shuffle'][s_size] = []
    mutualInfo_JaCDR3a[ep+'_shuffle'][s_size] = []
    mutualInfo_VbCDR3a[ep+'_shuffle'][s_size] = []
    mutualInfo_JbCDR3a[ep+'_shuffle'][s_size] = []
    mutualInfo_VaCDR3b[ep+'_shuffle'][s_size] = []
    mutualInfo_JaCDR3b[ep+'_shuffle'][s_size] = []
    mutualInfo_VbCDR3b[ep+'_shuffle'][s_size] = []
    mutualInfo_JbCDR3b[ep+'_shuffle'][s_size] = []

    # print(s_size)
    for j in range(10):
        # print(j)
        epset_sample = epset.sample(s_size)
        epset_sample1 = epset_sample.sample(s_size)
        a = np.array(epset_sample.cdr3a_endpadding_padded.str.split(', ', expand=True))
        b = np.array(epset_sample.cdr3b_endpadding_padded.str.split(', ', expand=True))
        a_s = np.array(epset_sample1.cdr3a_endpadding_padded.str.split(', ', expand=True))
        b_s = np.array(epset_sample1.cdr3b_endpadding_padded.str.split(', ', expand=True))

        mutualInfo_VaVb[ep][s_size].append(MI(epset_sample['V-a'], epset_sample['V-b']))
        mutualInfo_JaJb[ep][s_size].append(MI(epset_sample['J-a'], epset_sample['J-b']))
        mutualInfo_VaJa[ep][s_size].append(MI(epset_sample['V-a'], epset_sample['J-a']))
        mutualInfo_VbJb[ep][s_size].append(MI(epset_sample['V-b'], epset_sample['J-b']))
        mutualInfo_VaJb[ep][s_size].append(MI(epset_sample['V-a'], epset_sample['J-b']))
        mutualInfo_VbJa[ep][s_size].append(MI(epset_sample['V-b'], epset_sample['J-a']))
        mutualInfo_VaCDR3a[ep][s_size].append(cdr3_with_cat_MI(epset_sample['V-a'], a))
        mutualInfo_JaCDR3a[ep][s_size].append(cdr3_with_cat_MI(epset_sample['J-a'], a))
        mutualInfo_VbCDR3a[ep][s_size].append(cdr3_with_cat_MI(epset_sample['V-b'], a))
        mutualInfo_JbCDR3a[ep][s_size].append(cdr3_with_cat_MI(epset_sample['J-b'], a))
        mutualInfo_VaCDR3b[ep][s_size].append(cdr3_with_cat_MI(epset_sample['V-a'], b))
        mutualInfo_JaCDR3b[ep][s_size].append(cdr3_with_cat_MI(epset_sample['J-a'], b))
        mutualInfo_VbCDR3b[ep][s_size].append(cdr3_with_cat_MI(epset_sample['V-b'], b))
        mutualInfo_JbCDR3b[ep][s_size].append(cdr3_with_cat_MI(epset_sample['J-b'], b))

        mutualInfo_VaVb[ep+'_shuffle'][s_size].append(MI(epset_sample['V-a'], epset_sample1['V-b']))
        mutualInfo_JaJb[ep+'_shuffle'][s_size].append(MI(epset_sample['J-a'], epset_sample1['J-b']))
        mutualInfo_VaJa[ep+'_shuffle'][s_size].append(MI(epset_sample['V-a'], epset_sample1['J-a']))
        mutualInfo_VbJb[ep+'_shuffle'][s_size].append(MI(epset_sample['V-b'], epset_sample1['J-b']))
        mutualInfo_VaJb[ep+'_shuffle'][s_size].append(MI(epset_sample['V-a'], epset_sample1['J-b']))
        mutualInfo_VbJa[ep+'_shuffle'][s_size].append(MI(epset_sample['V-b'], epset_sample1['J-a']))
        mutualInfo_VaCDR3a[ep+'_shuffle'][s_size].append(cdr3_with_cat_MI(epset_sample['V-a'], a_s))
        mutualInfo_JaCDR3a[ep+'_shuffle'][s_size].append(cdr3_with_cat_MI(epset_sample['J-a'], a_s))
        mutualInfo_VbCDR3a[ep+'_shuffle'][s_size].append(cdr3_with_cat_MI(epset_sample['V-b'], a_s))
        mutualInfo_JbCDR3a[ep+'_shuffle'][s_size].append(cdr3_with_cat_MI(epset_sample['J-b'], a_s))
        mutualInfo_VaCDR3b[ep+'_shuffle'][s_size].append(cdr3_with_cat_MI(epset_sample['V-a'], b_s))
        mutualInfo_JaCDR3b[ep+'_shuffle'][s_size].append(cdr3_with_cat_MI(epset_sample['J-a'], b_s))
        mutualInfo_VbCDR3b[ep+'_shuffle'][s_size].append(cdr3_with_cat_MI(epset_sample['V-b'], b_s))
        mutualInfo_JbCDR3b[ep+'_shuffle'][s_size].append(cdr3_with_cat_MI(epset_sample['J-b'], b_s))

    i += 1
    s_size = subsamples[i]

# Last step: repeat the MI calculations on the complete epitope set (no subsampling).
# Each padded CDR3 string is split on ', ' into one column per token.
a = np.array(epset.cdr3a_endpadding_padded.str.split(', ', expand=True))
b = np.array(epset.cdr3b_endpadding_padded.str.split(', ', expand=True))

epset_sample = epset.copy()
s_size = epset.shape[0]

# (result dict, first gene column, second gene column), listed in evaluation order.
full_gene_pairs = [
    (mutualInfo_VaVb, 'V-a', 'V-b'),
    (mutualInfo_JaJb, 'J-a', 'J-b'),
    (mutualInfo_VaJa, 'V-a', 'J-a'),
    (mutualInfo_VbJb, 'V-b', 'J-b'),
    (mutualInfo_VaJb, 'V-a', 'J-b'),
    (mutualInfo_VbJa, 'V-b', 'J-a'),
]
# (result dict, gene column) scored against the split CDR3a columns.
full_gene_cdr3a = [
    (mutualInfo_VaCDR3a, 'V-a'),
    (mutualInfo_JaCDR3a, 'J-a'),
    (mutualInfo_VbCDR3a, 'V-b'),
    (mutualInfo_JbCDR3a, 'J-b'),
]
# (result dict, gene column) scored against the split CDR3b columns.
full_gene_cdr3b = [
    (mutualInfo_VaCDR3b, 'V-a'),
    (mutualInfo_JaCDR3b, 'J-a'),
    (mutualInfo_VbCDR3b, 'V-b'),
    (mutualInfo_JbCDR3b, 'J-b'),
]

# Open an empty result list for this sample size, first under the real key and
# then under the '_shuffle' key, for every result dict.
for result_key in (ep, ep + '_shuffle'):
    for mi_store in [entry[0] for entry in full_gene_pairs + full_gene_cdr3a + full_gene_cdr3b]:
        mi_store[result_key][s_size] = []

for j in range(10):
    # The real (unshuffled) values are appended on every pass as well, so that
    # the full set also contributes 10 points to the fitting.
    for mi_store, left_gene, right_gene in full_gene_pairs:
        mi_store[ep][s_size].append(MI(epset_sample[left_gene], epset_sample[right_gene]))
    for mi_store, gene_col in full_gene_cdr3a:
        mi_store[ep][s_size].append(cdr3_with_cat_MI(epset_sample[gene_col], a))
    for mi_store, gene_col in full_gene_cdr3b:
        mi_store[ep][s_size].append(cdr3_with_cat_MI(epset_sample[gene_col], b))

    # Shuffled control: draw every row of epset in a fresh random order (no seed
    # is set here) and split its CDR3 strings the same way as above.
    epset_sample1 = epset.sample(epset.shape[0])
    a_s = np.array(epset_sample1.cdr3a_endpadding_padded.str.split(', ', expand=True))
    b_s = np.array(epset_sample1.cdr3b_endpadding_padded.str.split(', ', expand=True))

    # The first argument stays in original row order; the second comes from the
    # reshuffled rows. NOTE(review): this presumes MI / cdr3_with_cat_MI pair
    # values by position rather than by index label - confirm.
    for mi_store, left_gene, right_gene in full_gene_pairs:
        mi_store[ep + '_shuffle'][s_size].append(MI(epset_sample[left_gene], epset_sample1[right_gene]))
    for mi_store, gene_col in full_gene_cdr3a:
        mi_store[ep + '_shuffle'][s_size].append(cdr3_with_cat_MI(epset_sample[gene_col], a_s))
    for mi_store, gene_col in full_gene_cdr3b:
        mi_store[ep + '_shuffle'][s_size].append(cdr3_with_cat_MI(epset_sample[gene_col], b_s))

# Pass each gene-gene result dict to save_gene_MI together with the current
# all_MI[ep] and a type label, storing the returned value back in all_MI[ep].
for pair_label, pair_results in [
    ('Va-Vb', mutualInfo_VaVb),
    ('Ja-Jb', mutualInfo_JaJb),
    ('Va-Ja', mutualInfo_VaJa),
    ('Vb-Jb', mutualInfo_VbJb),
    ('Va-Jb', mutualInfo_VaJb),
    ('Vb-Ja', mutualInfo_VbJa),
]:
    all_MI[ep] = save_gene_MI(pair_results, all_MI[ep], pair_label)


# Same for the gene-CDR3 results: save_gene_cdr3_MI returns two values, which
# replace all_MI[ep] and the matching gene_cdr3_results entry. Same-chain
# pairings are processed first, then the cross-chain ones.
for cdr3_label, cdr3_results in [
    ('Va-CDR3a', mutualInfo_VaCDR3a),
    ('Vb-CDR3b', mutualInfo_VbCDR3b),
    ('Ja-CDR3a', mutualInfo_JaCDR3a),
    ('Jb-CDR3b', mutualInfo_JbCDR3b),
    ('Va-CDR3b', mutualInfo_VaCDR3b),
    ('Vb-CDR3a', mutualInfo_VbCDR3a),
    ('Ja-CDR3b', mutualInfo_JaCDR3b),
    ('Jb-CDR3a', mutualInfo_JbCDR3a),
]:
    all_MI[ep], gene_cdr3_results[cdr3_label] = save_gene_cdr3_MI(
        cdr3_results, all_MI[ep], gene_cdr3_results[cdr3_label], cdr3_label, ep)

background 9691


In [32]:
# Write one CSV per key of all_MI, with the key embedded in the file name.
for ep, ep_summary in all_MI.items():
    summary_path = 'data/output/mutual_info/mutual_info_' + ep + '_endpadding.csv'
    ep_summary.to_csv(summary_path)

In [33]:
# Write one CSV per entry of gene_cdr3_results, with the entry's key as file-name prefix.
for stringtype in gene_cdr3_results:
    stringtype_table = gene_cdr3_results[stringtype]
    stringtype_path = 'data/output/mutual_info/' + stringtype + '_mutual_info_by_pos_vdjdb_endpadding.csv'
    stringtype_table.to_csv(stringtype_path)