In [1]:
import sys

import pandas
import matplotlib.pyplot
import seaborn
import numpy

import functions.io
import functions.metrics
import functions.graphs
import config

# Notebook-wide seaborn styling: the 'white' style with fonts scaled by 1.25.
seaborn.set(style='white', font_scale=1.25)
# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline

def get_sample_status(x):
    """Convenience function to create column with sample status.

    The sample number is read from the third underscore-separated field of
    the sample name (e.g. ``'dcr_LAC_0004_00_alpha'`` -> ``4``).  Sample
    numbers up to and including 3 map to ``'CTRL'``; numbers 4 through 6 map
    to ``'DIABETES'``.

    Parameters
    ----------
    x : str
        Sample name with at least three underscore-separated fields, the
        third of which parses as an integer.

    Returns
    -------
    str
        ``'CTRL'`` or ``'DIABETES'``.

    Raises
    ------
    ValueError
        If the third field is not an integer, or if the sample number is
        greater than 6 and therefore has no status assigned.
    IndexError
        If the name has fewer than three underscore-separated fields.
    """

    status_int = int(x.split("_")[2])

    if status_int <= 3:
        return 'CTRL'
    if status_int <= 6:
        return 'DIABETES'

    # The original fell through here with `status` unassigned and crashed
    # with an UnboundLocalError; fail with a message naming the sample instead.
    raise ValueError(
        "Cannot determine status for sample {!r}: sample number {} is "
        "outside the known range (<= 6)".format(x, status_int)
    )


def apply_color(x):
    """Convenience function to create a column of strings of colors.

    Parameters
    ----------
    x : scalar
        A single cell value (intended for use with ``Series.apply``).

    Returns
    -------
    str
        ``'black'`` when ``x`` is a missing value, ``'red'`` otherwise.
    """

    # The original compared with `x is np.nan`, but this notebook imports
    # numpy without the `np` alias (NameError), and an identity check misses
    # NaN values that are not the very same object (e.g. numpy.float64 NaNs
    # pulled out of a float column).  pandas.isna handles every scalar
    # missing-value marker (NaN, None, NaT, pandas.NA).
    if pandas.isna(x):
        return 'black'

    return 'red'

In [2]:
# Read all samples from the configured data directory; `data` is the input
# to every analysis cell below.
data = functions.io.read_all_samples(config.Config.DATA_DIRECTORY)

In [3]:
## Distribution of Frequencies
# `samples` (the sample names derived from `data`) is reused by the Jaccard
# and Shannon cells further down, so this cell must run before them.

samples = functions.io.get_sample_names(data)
# The output directory is passed in — presumably the plot is written there; verify in functions.metrics.
functions.metrics.plot_frequency_distribution(data, samples, config.Config.OUTPUT_DIRECTORY)

In [4]:
## Jaccard Index
# Pairwise comparison of every sample against every other sample.
# Depends on `samples` from the frequency-distribution cell.

functions.metrics.pairwise_jaccard_heatmap(data, samples, config.Config.OUTPUT_DIRECTORY)

Unnamed: 0,dcr_LAC_0001_00_alpha,dcr_LAC_0002_00_alpha,dcr_LAC_0003_00_alpha,dcr_LAC_0004_00_alpha,dcr_LAC_0005_00_alpha,dcr_LAC_0006_00_alpha
dcr_LAC_0001_00_alpha,1.0,0.232768,0.04664,0.114973,0.188253,0.095189
dcr_LAC_0002_00_alpha,0.232768,1.0,0.121637,0.13732,0.177458,0.111796
dcr_LAC_0003_00_alpha,0.04664,0.121637,1.0,0.044228,0.050997,0.036351
dcr_LAC_0004_00_alpha,0.114973,0.13732,0.044228,1.0,0.212375,0.143182
dcr_LAC_0005_00_alpha,0.188253,0.177458,0.050997,0.212375,1.0,0.18642
dcr_LAC_0006_00_alpha,0.095189,0.111796,0.036351,0.143182,0.18642,1.0


In [5]:
## Shannon Entropy
# Compute the Shannon entropy of each sample, label each sample with its
# status, and save a swarm plot of entropy by status to the output directory.

shannon_results = functions.metrics.get_shannon_entropy(data, samples)

# get_sample_status already takes a single sample name, so it can be passed
# to apply directly without a lambda wrapper.
shannon_results['status'] = shannon_results['sample'].apply(get_sample_status)

fig, ax = matplotlib.pyplot.subplots(figsize=(2.5, 5))
# Draw on the axes created above explicitly instead of relying on pyplot's
# implicit "current axes".
seaborn.swarmplot(x='status', y='shannon', data=shannon_results, size=10, hue='status', ax=ax)
ax.set_xlabel('')
ax.set_ylabel('Shannon Entropy')

# hue duplicates the x variable, so a legend adds nothing.  Remove the one
# seaborn drew, if any; calling ax.legend() just to hide it would build a
# new legend (and warn when there are no labelled artists).
legend = ax.get_legend()
if legend is not None:
    legend.remove()

seaborn.despine(fig=fig)
# NOTE(review): plain string concatenation assumes OUTPUT_DIRECTORY ends with
# a path separator — confirm in config.
fig.savefig(config.Config.OUTPUT_DIRECTORY+'shannon.png', bbox_inches='tight')
# The figure is only needed on disk; close it so it is not kept in memory.
matplotlib.pyplot.close(fig)

In [6]:
## Compare CDR3s in samples with known specificities from VDJdb
# Load the specificity table, keep the HLA-A*02 entries for the configured
# chain, match them against the sample CDR3s and plot the result.

chain = config.Config.CHAIN

vdjdb_columns = ['Gene', 'CDR3', 'V', 'J', 'MHC A', 'Epitope species']
specs = pandas.read_csv('data/specificities.tsv', sep='\t')
specs = specs[vdjdb_columns].replace('HomoSapiens', numpy.nan)

# Gene label used in the table for each supported chain.  Any other chain
# value leaves the table unfiltered.
gene_for_chain = {'alpha': 'TRA', 'beta': 'TRB'}
gene = gene_for_chain.get(chain)

if gene is not None:
    is_gene = specs['Gene'] == gene
    is_hla_a02 = specs['MHC A'] == 'HLA-A*02'
    specs = specs[is_gene & is_hla_a02]

seq_specs = functions.metrics.match_CDR3s_to_known_specificities(data, specs)
functions.metrics.plot_frequency_distribution_with_specificities(
    seq_specs,
    config.Config.OUTPUT_DIRECTORY,
    top_n=50,
)