In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from tqdm import trange
import random
import math
from scipy import interp
import statistics 
import os

from tcrmodels.ergo2.model import ERGO2
from tcrmodels.nettcr2.model import NetTCR2
from sklearn.model_selection import train_test_split

from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, precision_recall_curve, roc_curve, auc

from matplotlib import collections
from matplotlib import colors
from numpy.random import normal

Using TensorFlow backend.


In [2]:
# Display names of the reported metrics. get_scores() pairs this list
# positionally with the scores it computes, so the order here must match
# the order in which get_scores() builds its `scores` list.
metrics = ['AUROC', 'Accuracy', 'Recall', 'Precision', 'F1 score', 'AUPR']

def pr_auc(y_true, y_prob):
    """
    Area under the precision-recall curve.

    The curve points are taken from ``precision_recall_curve(y_true, y_prob)``
    and the area is computed by ``auc`` with recall as x and precision as y.
    """
    curve_precision, curve_recall, _ = precision_recall_curve(y_true, y_prob)
    return auc(curve_recall, curve_precision)

def get_scores(y_true, y_prob, y_pred):
    """
    Build a DataFrame with one row per classification metric.

    ``y_prob`` is passed to the score-based metrics (AUROC, AUPR) and
    ``y_pred`` to the label-based ones (accuracy, recall, precision, F1).

    Returns a DataFrame with two columns: 'score' (the metric values) and
    'metrics' (the names, taken from the module-level ``metrics`` list).
    """
    auroc = roc_auc_score(y_true, y_prob)
    accuracy = accuracy_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    aupr = pr_auc(y_true, y_prob)

    # Positional pairing: this order must mirror the module-level `metrics` list.
    scores = [auroc, accuracy, recall, precision, f1, aupr]

    return pd.DataFrame(data={'score': scores, 'metrics': metrics})

In [3]:
def set_random_seed(random_seed):
    """Seed Python's ``random`` module and NumPy's global RNG with ``random_seed``."""
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(random_seed)

In [4]:
# os.getlogin() reports the user of the process's controlling terminal and
# raises OSError when there is none (e.g. a kernel started by a service or
# inside a container). Fall back to getpass.getuser() in that case so the
# notebook still runs; the os.getlogin() result is kept whenever it works.
try:
    login = os.getlogin()
except OSError:
    import getpass
    login = getpass.getuser()

# Directory holding the train/ and test/ split CSVs read by the hard-split experiment.
DATA_BASE = "/mnt/container-nle-tcr/tc-hard-data/tc-hard/ds.vdjdb-high-scores.hard-splits/pep+cdr3b/"
# Directory the per-experiment prediction CSVs and the score summaries are written to.
RESULTS_BASE = f"/home/{login}/Git/tc-hard/notebooks/notebooks.classification.vdjdb-high-scores/results/"

In [22]:
def make_ergo_train_df(df):
    """
    Convert a raw dataset frame into the column layout used for ERGO II training.

    Steps, in order:
      1. drop the annotation columns that are not used and reset the index;
      2. rename the remaining columns to the lower-case ERGO names
         (``tcra``, ``tcrb``, ``peptide``, ``mhc``, ``sign``);
      3. add the all-NA placeholder columns that the ERGO II implementation
         expects to be present even though they are not used;
      4. overwrite ``tcra`` with "UNK", the identifier for a missing CDR3α;
      5. replace the letter 'O' with 'X' in ``tcrb`` and ``peptide``.

    Returns a new DataFrame; the input frame is not modified.
    Raises KeyError if a column to drop is missing, or if any remaining
    column has no entry in the rename mapping.
    """
    unused_columns = [
        "negative.source", "mhc.a",
        'v.alpha', 'j.alpha',
        'v.beta', 'd.beta', 'j.beta',
        'license',
    ]
    ergo_names = {
        'cdr3.alpha': 'tcra',
        'cdr3.beta': 'tcrb',
        'antigen.epitope': 'peptide',
        'mhc.seq': 'mhc',
        'label': 'sign',
    }
    placeholder_columns = ('va', 'vb', 'ja', 'jb', 't_cell_type', 'protein')

    trimmed = df.drop(columns=unused_columns).reset_index(drop=True)

    # Every surviving column is looked up explicitly, so an unexpected
    # column surfaces as a KeyError instead of passing through unrenamed.
    renaming = dict((col, ergo_names[col]) for col in trimmed.columns)
    ergo_df = trimmed.rename(columns=renaming)

    for placeholder in placeholder_columns:
        ergo_df[placeholder] = pd.NA

    # "UNK" is the identifier used for a missing CDR3α
    ergo_df['tcra'] = "UNK"

    for sequence_column in ('tcrb', 'peptide'):
        ergo_df[sequence_column] = ergo_df[sequence_column].str.replace('O','X')

    return ergo_df

def make_ergo_test_df(df):
    """
    Convert a raw dataset frame into the column layout used for ERGO II testing.

    Steps, in order:
      1. drop the annotation columns that are not used and reset the index;
      2. rename the remaining columns to the ERGO test names
         (``TRA``, ``TRB``, ``Peptide``, ``MHC``, ``sign``);
      3. add the all-NA placeholder columns that the ERGO II implementation
         expects to be present even though they are not used;
      4. overwrite ``TRA`` with "UNK", the identifier for a missing CDR3α;
      5. replace the letter 'O' with 'X' in ``TRB`` and ``Peptide``.

    Returns a new DataFrame; the input frame is not modified.
    Raises KeyError if a column to drop is missing, or if any remaining
    column has no entry in the rename mapping.
    """
    df = df.drop(columns=["negative.source", "mhc.a", 'v.alpha', 'j.alpha', 'v.beta', 'd.beta', 'j.beta', 'license']).reset_index(drop=True)

    map_keys = {
        'cdr3.alpha': 'TRA',
        'cdr3.beta': 'TRB',
        'antigen.epitope': 'Peptide',
        'mhc.seq': 'MHC',
        'label': 'sign'
    }
    df = df.rename(columns={c: map_keys[c] for c in df.columns})

    # the ERGO II implementation expects the following columns to be present
    # in the dataframe even if they are not used
    df['TRAV'] = pd.NA
    df['TRBV'] = pd.NA
    df['TRAJ'] = pd.NA
    df['TRBJ'] = pd.NA
    df['T-Cell-Type'] = pd.NA
    df['Protein'] = pd.NA

    # using "UNK" for identifier of missing CDR3α.
    # In this layout the alpha-chain column is 'TRA' (see map_keys); writing to
    # 'tcra' would leave the real CDR3α values in place and add a stray column.
    df['TRA'] = "UNK"
    df['TRB'] = df['TRB'].str.replace('O','X')
    df['Peptide'] = df['Peptide'].str.replace('O','X')

    return df

# ERGO II - Hard split - Train and test: only randomized negatives

In [23]:
# Hard split: each of the five experiments reads its own pre-built
# train-{i}.csv / test-{i}.csv pair and uses i as the model's random seed.
results_ergo2 = []

for i in trange(5):
    train_csv = DATA_BASE + "train/only-sampled-negs/" + f"train-{i}.csv"
    test_csv = DATA_BASE + "test/only-sampled-negs/" + f"test-{i}.csv"

    df_train = make_ergo_train_df(pd.read_csv(train_csv))
    df_test = make_ergo_test_df(pd.read_csv(test_csv))

    model = ERGO2(gpu=[0], use_alpha=False, random_seed=i, train_val_ratio=.2)
    model.train(df_train, epochs=1000)
    prediction_df = model.test(df_test)

    # Hard labels are obtained by rounding the predicted scores.
    probabilities = prediction_df['prediction'].to_numpy()
    scores_df = get_scores(
        y_true=prediction_df['sign'].to_numpy(),
        y_prob=probabilities,
        y_pred=probabilities.round(),
    )
    scores_df['experiment'] = i
    results_ergo2.append(scores_df)

    # Persist the per-sample predictions next to the test inputs.
    # NOTE(review): this assignment aligns on index; it assumes prediction_df
    # keeps df_test's row index -- confirm against ERGO2.test.
    df_test['prediction'] = prediction_df['prediction']
    predictions_csv = RESULTS_BASE + f"ergo2.pep+cdr3b.only-sampled-negs.hard-split.{i}.csv"
    df_test.to_csv(predictions_csv, index=False)

# One summary file with the metric rows of all experiments.
results_ergo2 = pd.concat(results_ergo2)
results_ergo2.to_csv(
    RESULTS_BASE + "ergo2.pep+cdr3b.only-sampled-negs.hard-split.csv",
    index=False,
)

  0%|          | 0/5 [00:00<?, ?it/s]GPU available: True, used: True
TPU available: False, using: 0 TPU cores
CUDA_VISIBLE_DEVICES: [0]

  | Name          | Type         | Params
-----------------------------------------------
0 | tcrb_encoder  | LSTM_Encoder | 3 M   
1 | pep_encoder   | LSTM_Encoder | 3 M   
2 | hidden_layer1 | Linear       | 31 K  
3 | relu          | LeakyReLU    | 0     
4 | output_layer1 | Linear       | 32    
5 | dropout       | Dropout      | 0     


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

 20%|██        | 1/5 [00:40<02:42, 40.67s/it]GPU available: True, used: True
TPU available: False, using: 0 TPU cores
CUDA_VISIBLE_DEVICES: [0]

  | Name          | Type         | Params
-----------------------------------------------
0 | tcrb_encoder  | LSTM_Encoder | 3 M   
1 | pep_encoder   | LSTM_Encoder | 3 M   
2 | hidden_layer1 | Linear       | 31 K  
3 | relu          | LeakyReLU    | 0     
4 | output_layer1 | Linear       | 32    
5 | dropout       | Dropout      | 0     


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

 40%|████      | 2/5 [01:10<01:43, 34.39s/it]GPU available: True, used: True
TPU available: False, using: 0 TPU cores
CUDA_VISIBLE_DEVICES: [0]

  | Name          | Type         | Params
-----------------------------------------------
0 | tcrb_encoder  | LSTM_Encoder | 3 M   
1 | pep_encoder   | LSTM_Encoder | 3 M   
2 | hidden_layer1 | Linear       | 31 K  
3 | relu          | LeakyReLU    | 0     
4 | output_layer1 | Linear       | 32    
5 | dropout       | Dropout      | 0     


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

 60%|██████    | 3/5 [01:41<01:05, 32.60s/it]GPU available: True, used: True
TPU available: False, using: 0 TPU cores
CUDA_VISIBLE_DEVICES: [0]

  | Name          | Type         | Params
-----------------------------------------------
0 | tcrb_encoder  | LSTM_Encoder | 3 M   
1 | pep_encoder   | LSTM_Encoder | 3 M   
2 | hidden_layer1 | Linear       | 31 K  
3 | relu          | LeakyReLU    | 0     
4 | output_layer1 | Linear       | 32    
5 | dropout       | Dropout      | 0     


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

 80%|████████  | 4/5 [02:54<00:48, 48.65s/it]GPU available: True, used: True
TPU available: False, using: 0 TPU cores
CUDA_VISIBLE_DEVICES: [0]

  | Name          | Type         | Params
-----------------------------------------------
0 | tcrb_encoder  | LSTM_Encoder | 3 M   
1 | pep_encoder   | LSTM_Encoder | 3 M   
2 | hidden_layer1 | Linear       | 31 K  
3 | relu          | LeakyReLU    | 0     
4 | output_layer1 | Linear       | 32    
5 | dropout       | Dropout      | 0     


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

100%|██████████| 5/5 [03:24<00:00, 40.86s/it]


# ERGO II - Random split - Train and test: only randomized negatives

In [25]:
# Random split: the same de-duplicated dataset is split 80/20 five times,
# using i both as the split seed and as the model's random seed.
results_ergo2 = []

# Loading and de-duplicating do not depend on the seed, so do them once
# instead of on every iteration. Neither train_test_split nor the
# make_ergo_*_df helpers modify the frame they receive, so reuse is safe.
df = pd.read_csv("/mnt/container-nle-tcr/tc-hard-data/tc-hard/ds.vdjdb-high-scores.csv")
df = df.drop_duplicates(
    subset=["antigen.epitope", "cdr3.beta", "label"], keep="first"
).reset_index(drop=True)

for i in tqdm(range(5)):
    df_train, df_test = train_test_split(df, test_size=0.2, random_state=i)
    df_train = make_ergo_train_df(df_train)
    df_test = make_ergo_test_df(df_test)

    model = ERGO2(
        gpu=[0],
        use_alpha=False,
        random_seed=i,
        train_val_ratio=.2,
    )
    model.train(df_train, epochs=1000)
    prediction_df = model.test(df_test)

    # Hard labels are obtained by rounding the predicted scores.
    scores_df = get_scores(
        y_true=prediction_df['sign'].to_numpy(),
        y_prob=prediction_df['prediction'].to_numpy(),
        y_pred=prediction_df['prediction'].to_numpy().round(),
    )
    scores_df['experiment'] = i
    results_ergo2.append(scores_df)

    # Persist the per-sample predictions next to the test inputs.
    # NOTE(review): this assignment aligns on index; it assumes prediction_df
    # keeps df_test's row index -- confirm against ERGO2.test.
    df_test['prediction'] = prediction_df['prediction']
    df_test.to_csv(RESULTS_BASE+f"ergo2.pep+cdr3b.only-sampled-negs.random-split.{i}.csv", index=False)

# One summary file with the metric rows of all experiments.
results_ergo2 = pd.concat(results_ergo2)
results_ergo2.to_csv(RESULTS_BASE+"ergo2.pep+cdr3b.only-sampled-negs.random-split.csv", index=False)

  0%|          | 0/5 [00:00<?, ?it/s]GPU available: True, used: True
TPU available: False, using: 0 TPU cores
CUDA_VISIBLE_DEVICES: [0]

  | Name          | Type         | Params
-----------------------------------------------
0 | tcrb_encoder  | LSTM_Encoder | 3 M   
1 | pep_encoder   | LSTM_Encoder | 3 M   
2 | hidden_layer1 | Linear       | 31 K  
3 | relu          | LeakyReLU    | 0     
4 | output_layer1 | Linear       | 32    
5 | dropout       | Dropout      | 0     


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

 20%|██        | 1/5 [00:22<01:31, 22.97s/it]GPU available: True, used: True
TPU available: False, using: 0 TPU cores
CUDA_VISIBLE_DEVICES: [0]

  | Name          | Type         | Params
-----------------------------------------------
0 | tcrb_encoder  | LSTM_Encoder | 3 M   
1 | pep_encoder   | LSTM_Encoder | 3 M   
2 | hidden_layer1 | Linear       | 31 K  
3 | relu          | LeakyReLU    | 0     
4 | output_layer1 | Linear       | 32    
5 | dropout       | Dropout      | 0     


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

 40%|████      | 2/5 [00:53<01:22, 27.44s/it]GPU available: True, used: True
TPU available: False, using: 0 TPU cores
CUDA_VISIBLE_DEVICES: [0]

  | Name          | Type         | Params
-----------------------------------------------
0 | tcrb_encoder  | LSTM_Encoder | 3 M   
1 | pep_encoder   | LSTM_Encoder | 3 M   
2 | hidden_layer1 | Linear       | 31 K  
3 | relu          | LeakyReLU    | 0     
4 | output_layer1 | Linear       | 32    
5 | dropout       | Dropout      | 0     


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

 60%|██████    | 3/5 [01:24<00:58, 29.15s/it]GPU available: True, used: True
TPU available: False, using: 0 TPU cores
CUDA_VISIBLE_DEVICES: [0]

  | Name          | Type         | Params
-----------------------------------------------
0 | tcrb_encoder  | LSTM_Encoder | 3 M   
1 | pep_encoder   | LSTM_Encoder | 3 M   
2 | hidden_layer1 | Linear       | 31 K  
3 | relu          | LeakyReLU    | 0     
4 | output_layer1 | Linear       | 32    
5 | dropout       | Dropout      | 0     


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

 80%|████████  | 4/5 [01:56<00:30, 30.27s/it]GPU available: True, used: True
TPU available: False, using: 0 TPU cores
CUDA_VISIBLE_DEVICES: [0]

  | Name          | Type         | Params
-----------------------------------------------
0 | tcrb_encoder  | LSTM_Encoder | 3 M   
1 | pep_encoder   | LSTM_Encoder | 3 M   
2 | hidden_layer1 | Linear       | 31 K  
3 | relu          | LeakyReLU    | 0     
4 | output_layer1 | Linear       | 32    
5 | dropout       | Dropout      | 0     


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

100%|██████████| 5/5 [02:28<00:00, 29.62s/it]
