In [None]:
import ipywidgets
import xomx
import numpy as np
import pandas as pd
import scanpy as sc
import logomaker
import matplotlib.pyplot as plt
from scipy.stats import entropy
import os
import joblib

In [None]:
# To display interactive plots (the `widget` backend relies on the ipympl package):
%matplotlib widget

We define `save_dir`, the folder in which everything will be saved.

In [None]:
# Everything downloaded or produced by this notebook is stored under
# ~/results/xomx/tutos/xomx_tcr; the folder is created if it does not exist yet.
_save_subfolders = ('results', 'xomx', 'tutos', 'xomx_tcr')
save_dir = os.path.join(os.path.expanduser('~'), *_save_subfolders)
os.makedirs(save_dir, exist_ok=True)

We download both the VDJdb and the McPAS-TCR databases:

In [None]:
vdjdb_file = 'vdjdb-2021-09-05.zip'
vdjdb_url = os.path.join('https://github.com/antigenomics/vdjdb-db/releases/download/2021-09-05/', vdjdb_file)
mcpas_tcr_file = 'McPAS-TCR.csv'
mcpas_tcr_url = 'http://friedmanlab.weizmann.ac.il/McPAS-TCR/session/1a02e7df9f10319305422b2d96c18f12/download/downloadDB'

if not os.path.isfile(os.path.join(save_dir, vdjdb_file)):
    !wget {vdjdb_url} --directory-prefix={save_dir}
    !unzip {os.path.join(save_dir, vdjdb_file)} -d {save_dir}
    
if not os.path.isfile(os.path.join(save_dir, mcpas_tcr_file)):
    !wget {mcpas_tcr_url} --output-document={os.path.join(save_dir, mcpas_tcr_file)}

We load the VDJdb and the McPAS-TCR databases as pandas DataFrames, go through these DataFrames, and construct two dictionaries:  
`dic_cdr3beta` and `dic_epitopes`.

For every CDR3 beta sequence `s`, `dic_cdr3beta[s]` is the set of epitopes to which it is associated.  
For every epitope sequence `s`, `dic_epitopes[s]` is the set of CDR3 beta sequences to which it is associated.

**Remark: we only take into account CDR3 sequences of length at most 22.**

In [None]:
dic_epitopes_file = 'dic_epitopes.joblib'
dic_cdr3beta_file = 'dic_cdr3beta.joblib'
cdr3_max_length = 22

dic_epitopes_path = os.path.join(save_dir, dic_epitopes_file)
dic_cdr3beta_path = os.path.join(save_dir, dic_cdr3beta_file)

# Rebuild both dictionaries if at least one cache file is missing; otherwise load
# them from disk.
# NOTE: the cache is not invalidated when cdr3_max_length changes; delete the two
# .joblib files in save_dir to force a rebuild.
if not (os.path.isfile(dic_epitopes_path) and os.path.isfile(dic_cdr3beta_path)):
    vdjdb_df = pd.read_csv(os.path.join(save_dir, 'vdjdb_full.txt'), delimiter="\t", low_memory=False)
    # Read the McPAS-TCR table downloaded into save_dir (not a machine-specific path).
    mcpas_tcr_df = pd.read_csv(
        os.path.join(save_dir, mcpas_tcr_file), encoding='cp1252', delimiter=",", low_memory=False
    )

    dic_cdr3beta = {}
    dic_epitopes = {}

    def dic_iteration(cdr3beta_seq, epitope_seq):
        """Register a (CDR3 beta, epitope) pair in both dictionaries.

        The pair is ignored when either value is not a string (missing values
        are read as NaN), when either sequence contains a symbol outside
        `xomx.tl.aminoacids`, or when the CDR3 beta sequence is longer than
        `cdr3_max_length`.
        """
        # Filter NaNs (and any other non-string value).
        if not (isinstance(cdr3beta_seq, str) and isinstance(epitope_seq, str)):
            return
        # Filter undefined symbols.
        if set(epitope_seq).difference(xomx.tl.aminoacids):
            return
        if set(cdr3beta_seq).difference(xomx.tl.aminoacids):
            return
        # Filter long sequences.
        if len(cdr3beta_seq) > cdr3_max_length:
            return
        dic_cdr3beta.setdefault(cdr3beta_seq, set()).add(epitope_seq)
        dic_epitopes.setdefault(epitope_seq, set()).add(cdr3beta_seq)

    # Iterate over the two relevant columns in lockstep, row by row.
    for cdr3beta, epitope in zip(vdjdb_df["cdr3.beta"], vdjdb_df["antigen.epitope"]):
        dic_iteration(cdr3beta, epitope)

    for cdr3beta, epitope in zip(mcpas_tcr_df["CDR3.beta.aa"], mcpas_tcr_df["Epitope.peptide"]):
        dic_iteration(cdr3beta, epitope)

    joblib.dump(dic_epitopes, dic_epitopes_path)
    joblib.dump(dic_cdr3beta, dic_cdr3beta_path)
else:
    dic_epitopes = joblib.load(dic_epitopes_path)
    dic_cdr3beta = joblib.load(dic_cdr3beta_path)

In [None]:
print(f'{len(dic_cdr3beta)} CDR3 beta sequences in total')
# Keep the CDR3 beta sequences whose set of associated epitopes has exactly one element.
cdr3_single_epitope = {
    cdr3beta_seq
    for cdr3beta_seq, epitope_set in dic_cdr3beta.items()
    if len(epitope_set) == 1
}
print(f'{len(cdr3_single_epitope)} CDR3 beta sequences that recognize a unique epitope')

Optionally, we recompute `dic_epitopes` to keep only the CDR3 beta sequences that recognize a unique epitope:

In [None]:
# Rebuild dic_epitopes from the CDR3 beta sequences associated with exactly one epitope.
dic_epitopes = {}
for cdr3beta_seq, epitope_set in dic_cdr3beta.items():
    if len(epitope_set) != 1:
        continue
    (epitope,) = epitope_set  # the single epitope of this sequence
    dic_epitopes.setdefault(epitope, set()).add(cdr3beta_seq)
    # Consistency check against the set computed in the previous cell.
    assert cdr3beta_seq in cdr3_single_epitope

We sort the epitopes by decreasing number of associated CDR3 beta sequences:

In [None]:
# Epitopes ordered from the largest to the smallest number of associated CDR3 beta sequences.
sorted_epitopes = [
    epitope
    for epitope, cdr3beta_set in sorted(
        dic_epitopes.items(), key=lambda item: len(item[1]), reverse=True
    )
]

We will use one-hot encodings to represent the CDR3 sequences. Their dimension is `cdr3_max_length` x `len(xomx.tl.aminoacids)`:

In [None]:
dimension = cdr3_max_length * len(xomx.tl.aminoacids)
dimension

We construct annotated data with the one-hot encodings of the CDR3 sequences corresponding to the `K=30` most recognized epitopes.

In [None]:
K = 30
# Total number of CDR3 beta sequences associated with the K top-ranked epitopes.
nr_samples = sum(len(dic_epitopes[sorted_epitopes[rank]]) for rank in range(K))
nr_samples

In [None]:
# The K top-ranked epitopes, in ranking order.
top_epitopes = [sorted_epitopes[rank] for rank in range(K)]

xd = sc.AnnData(shape=(nr_samples, dimension))
# Observations are named after the CDR3 beta sequences, grouped per epitope
# (alphabetically sorted inside each group).
xd.obs_names = np.hstack([sorted(dic_epitopes[epitope]) for epitope in top_epitopes])
# Each observation is labelled with its epitope, in the same grouped order.
xd.obs['labels'] = np.hstack(
    [[epitope] * len(dic_epitopes[epitope]) for epitope in top_epitopes]
)
xd.uns['all_labels'] = xomx.tl.all_labels(xd.obs['labels'])
xd.uns['obs_indices_per_label'] = xomx.tl.indices_per_label(xd.obs['labels'])
# Fill the data matrix row by row with the encoding of each sequence.
xd.X = np.zeros((xd.n_obs, xd.n_vars))
for row, cdr3beta_seq in enumerate(xd.obs_names):
    xd.X[row, :] = xomx.tl.onehot(cdr3beta_seq, cdr3_max_length)

We separate the training and test sets:

In [None]:
# Seeded random generator, reused by the following cells.
rng = np.random.RandomState(seed=0)
# Split the samples into training and test sets.
xomx.tl.train_and_test_indices(
    xd,
    "obs_indices_per_label",
    test_train_ratio=0.25,
    rng=rng,
)

A UMAP plot based on 1000 randomly chosen samples:

In [None]:
# Draw 1000 distinct observation indices, then plot the UMAP of that subset.
umap_subset_indices = rng.choice(xd.n_obs, size=1000, replace=False)
xomx.pl.umap_plot(xd, subset_indices=umap_subset_indices)

In [None]:
# Classifiers are stored in this dictionary, keyed by the label they are built for.
classifier = dict()

In [None]:
# Display the labels stored in xd.uns['all_labels'] (a label is picked from it by position below).
xd.uns['all_labels']

We construct binary classifiers (using the Extra-Trees algorithm) to learn to discriminate between CDR3 sequences associated with a given epitope, and CDR3 sequences associated with other epitopes.  
Here we define a classifier for the epitope GILGFVFTL:

In [None]:
index = 2
# Label (epitope) at position `index` in xd.uns['all_labels'].
ref_label = xd.uns['all_labels'][index]
classifier[ref_label] = xomx.fs.RFEExtraTrees(xd, ref_label, n_estimators=450, random_state=rng)
ref_label

We train the classifier on the training set:

In [None]:
# Fetch the classifier of the selected label, then initialize it.
ref_label = xd.uns['all_labels'][index]
classifier[ref_label].init()

We plot the result of the classifier on 6000 random samples from the test set (points above the red line are classified as GILGFVFTL-related):

In [None]:
# Plot the classifier output on a random subset of 6000 samples.
ref_label = xd.uns['all_labels'][index]
classifier[ref_label].plot(random_subset_size=6000, rng=rng)

We compute the Matthews Correlation Coefficient on the test set:

In [None]:
# Matthews coefficient computed from the confusion matrix of the selected classifier.
ref_classifier = classifier[xd.uns['all_labels'][index]]
xomx.tl.matthews_coef(ref_classifier.confusion_matrix)

Here are all the predictions on the test set (`True` means that the classifier considers the sequence to be GILGFVFTL-related):

In [None]:
# Rows of the data matrix that belong to the test set.
test_rows = xd.X[xd.uns['test_indices']]
ref_classifier = classifier[xd.uns['all_labels'][index]]
# Boolean mask: True where the classifier outputs 1 for the test sample.
predictions_on_test_set = ref_classifier.predict(test_rows) == 1
predictions_on_test_set

We gather the indices of the CDR3 sequences in the test set that are classified as GILGFVFTL-related:

In [None]:
# Indices (in xd) of the test samples for which the prediction is True.
ok_samples_test = [
    test_index
    for test_index, is_positive in zip(xd.uns['test_indices'], predictions_on_test_set)
    if is_positive
]

We plot the logo computed from all these CDR3 sequences:

In [None]:
# Sequence logo computed from the test-set samples gathered in ok_samples_test.
xomx.pl.plot_logo(xd, ok_samples_test)

We can also filter to plot the logo for sequences of a fixed length.  
Here, sequences of length 12 (in the test set and classified as GILGFVFTL-related):

In [None]:
# Same logo, restricted to the sequences of length 12.
xomx.pl.plot_logo(xd, ok_samples_test, fixed_length=12)

And sequences of length 13:

In [None]:
# Same logo, restricted to the sequences of length 13.
xomx.pl.plot_logo(xd, ok_samples_test, fixed_length=13)

We can compare these logos to the logos computed from the CDR3 sequences in the training set that are labelled as GILGFVFTL:

In [None]:
# Training-set indices stored for the selected label.
ref_label = xd.uns['all_labels'][index]
train_samples_ref_label = xd.uns['train_indices_per_label'][ref_label]

In [None]:
# Logo of the length-12 training-set sequences carrying the selected label.
# Uses train_samples_ref_label (defined in the previous cell): the name
# all_samples_ref_label is never defined in this notebook and raised a NameError.
xomx.pl.plot_logo(xd, train_samples_ref_label, fixed_length=12)

In [None]:
# Logo of the length-13 training-set sequences carrying the selected label.
xomx.pl.plot_logo(xd, train_samples_ref_label, fixed_length=13)