In [1]:
import getpass
import logging
import operator
import os
import random
import sys

import numpy as np
import pandas as pd
# Send root-logger records to stdout at INFO level.
# The stdout handler is attached only if one is not already present, so
# re-running this cell does not make every log line appear multiple times.
logger = logging.getLogger()
if not any(
        isinstance(handler, logging.StreamHandler) and getattr(handler, "stream", None) is sys.stdout
        for handler in logger.handlers
):
    logger.addHandler(logging.StreamHandler(stream=sys.stdout))
logger.setLevel('INFO')

from typing import Tuple, List

In [2]:
def set_random_seed(random_seed):
    """Seed Python's `random` module and NumPy's global RNG with `random_seed`."""
    # Same order as before: stdlib generator first, then NumPy's.
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(random_seed)

In [3]:
# Resolve the current user name for the per-user results path.
# os.getlogin() needs a controlling terminal and raises OSError when the kernel
# was started without one (e.g. by a notebook server or inside a container),
# so fall back to getpass.getuser(), which consults the environment / passwd db.
try:
    login = os.getlogin()
except OSError:
    login = getpass.getuser()

# Root folder of the datasets and folder where classification results are written.
DATA_BASE = '/mnt/container-nle-tcr/tc-hard-data/tc-hard/'
RESULTS_BASE = f"/home/{login}/Git/tc-hard/notebooks/notebooks.classification/results/"

In [4]:
def hard_split_df(
        df: pd.DataFrame, target_col: str, min_ratio: float, random_state: float, low: int, high: int
) -> Tuple[pd.DataFrame, pd.DataFrame, List[str]]:
    """Split `df` so that all rows sharing a `target_col` value end up on the same side.

    Procedure:
        1) Count the rows of every distinct `target_col` value; only values whose
           count lies in [low, high] are candidates for the test set.
        2) Draw one candidate at random and move all of its rows from train to test.
        3) Repeat until the test set holds at least `round(min_ratio * len(df))` rows.

    Args:
        df: input frame; it is not modified.
        target_col: column whose values must not be shared between train and test.
        min_ratio: minimum fraction of `len(df)` that the test set must reach.
        random_state: unused; kept for backward compatibility. Candidates are drawn
            with `np.random.choice`, so seed NumPy's global RNG before calling to
            get a repeatable split.
        low: minimum number of rows a value needs to be a test candidate.
        high: maximum number of rows a value may have to be a test candidate.

    Returns:
        `(train_df, test_df, selected_target_val)`: the remaining rows, the rows
        moved to test (original index labels preserved, in selection order) and
        the list of `target_col` values that were moved to test.

    Raises:
        Exception: if the candidates are exhausted before the test budget is reached.
    """
    log = logging.getLogger()  # root logger, the same object the notebook configures
    min_test_len = round(min_ratio * len(df))

    # groupby yields the values in sorted order; that order is kept so a given
    # seed keeps selecting the same targets.
    target_counts = df.groupby(target_col).size()
    eligible = target_counts[(target_counts >= low) & (target_counts <= high)]
    possible_target_val = list(eligible.index)

    print(f"Tot pep: {len(df[target_col].unique())}, Possible test pep: {len(possible_target_val)}")

    train_df = df.copy()
    test_parts = []  # one frame per selected value, concatenated once at the end
    test_len = 0
    selected_target_val = []

    while test_len < min_test_len:
        # Only give up when the budget is still unmet AND nothing is left to draw;
        # a split whose last candidate fills the budget is a valid result.
        if not possible_target_val:
            log.info(f"Possible targets left {possible_target_val}")
            raise Exception('No more values to sample from.')

        picked = np.random.choice(possible_target_val)
        # Unwrap NumPy scalars (e.g. np.str_) into the native Python value.
        target_val = picked.item() if isinstance(picked, np.generic) else picked

        is_target = train_df[target_col] == target_val
        test_parts.append(train_df[is_target])
        train_df = train_df[~is_target]
        test_len += int(is_target.sum())

        selected_target_val.append(target_val)
        possible_target_val.remove(target_val)

    test_df = pd.concat(test_parts) if test_parts else pd.DataFrame()

    log.info(f"Target {target_col} sequences: {selected_target_val}")

    return train_df, test_df, selected_target_val

# Pep+CDR3b

In [9]:
# Build five hard train/test splits for the pep+cdr3b setting and write them to CSV.
# Load the dataset, force integer labels and keep one row per
# (epitope, CDR3 beta, label) combination; the index is reset to 0..n-1.
df = pd.read_csv(DATA_BASE+f"ds.vdjdb-high-scores.csv")
df.label = df.label.apply(lambda x: int(x))
df = df.drop_duplicates(subset=["antigen.epitope", "cdr3.beta", "label"], keep="first").reset_index(drop=True)

# Size and class balance of the de-duplicated frame.
print("Samples (FULL): ", len(df))
print("Pos: ", len(df[df.label==1]))
print("Neg: ", len(df[df.label==0]))

# One split per seed 0..4; the seed fixes the np.random draws made inside hard_split_df.
for i in range(5):
    set_random_seed(i)
    # NOTE(review): `only_sampled_negs_df` is not defined in any cell visible in this
    # notebook — presumably left over from a deleted/unshown cell. On a fresh kernel
    # this raises NameError; define it explicitly before this loop.
    train_df, test_df, test_peps = hard_split_df(
        only_sampled_negs_df, target_col="antigen.epitope", min_ratio=0.15, random_state=i, low=10, high=100
    )
    # training set considering also real negatives
    # Rows are dropped from `df` by the index labels of `test_df`; this assumes
    # `only_sampled_negs_df` shares its index with `df` — TODO confirm.
    train_df_full = df.drop(test_df.index)
    # Also remove rows whose `negative.source` is one of the test epitopes
    # (presumably the epitope a negative pair was generated from — verify).
    train_df_full = train_df_full[~train_df_full["negative.source"].isin(test_peps)]
    # Sanity check: no test epitope remains as a negative source in the full train set.
    for p in test_peps:
        assert not p in train_df_full["negative.source"].unique()

    # Persist the train split, the "full" train split and the test split for this seed.
    train_df.to_csv(DATA_BASE+f"ds.vdjdb-high-scores.hard-splits/pep+cdr3b/train/only-sampled-negs/train-{i}.csv", index=False)
    train_df_full.to_csv(DATA_BASE+f"ds.vdjdb-high-scores.hard-splits/pep+cdr3b/train/only-sampled-negs.full/train-{i}.csv", index=False)
    test_df.to_csv(DATA_BASE+f"ds.vdjdb-high-scores.hard-splits/pep+cdr3b/test/only-sampled-negs/test-{i}.csv", index=False)
    
    # Test size relative to each of the two training sets.
    print("Actual test/train ratio (full): ", len(test_df) / len(train_df_full))
    print("Actual test/train ratio (only sampled negs): ", len(test_df) / len(train_df))

Samples (FULL):  10134
Pos:  3453
Neg:  6681
Tot pep: 266, Possible test pep: 81
Target antigen.epitope sequences: ['LPRRSGAAGA', 'NLSALGIFST', 'SPRWYFYYL', 'TPGPGVRYPL', 'VAANIVLTV', 'CRVLCCYVL', 'FPTKDVAL', 'KLSALGINAV', 'YLEPGPVTA', 'EAAGIGILTV', 'SLLMWITQV', 'VTEHDTLLY', 'LPPIVAKEI', 'QYDPVAALF', 'LLWNGPMAV', 'GTSGSPIIDK', 'CTELKLSDY', 'FYGKTILWF', 'RPPIFIRRL', 'FLKETGGL', 'TAFTIPSI', 'APRGPHGGAASGL', 'FLRGRAYGL', 'RPHERNGFTVL', 'AAFKRSCLK', 'GPGMKARVL', 'QIKVRVDMV', 'ISPRTLNAW', 'VVMSWAPPV', 'YSEHPTFTSQY', 'MLNIPSINV', 'HPKVSSEVHI', 'HPVGEADYFEY', 'FPRPWLHGL']
Actual test/train ratio (full):  0.18043098427489807
Actual test/train ratio (only sampled negs):  0.18043098427489807
Tot pep: 266, Possible test pep: 81
Target antigen.epitope sequences: ['KASEKIFYV', 'DATYQRTRALVR', 'VSFIEFVGW', 'CRVLCCYVL', 'YPLHEQHGM', 'APRGPHGGAASGL', 'TLNAWVKVV', 'FLKETGGL', 'AAGIGILTV', 'YSEHPTFTSQY', 'CLGGLLTMV', 'HPVGEADYFEY', 'RLRPGGKKR', 'GLNKIVRMY', 'FYGKTILWF', 'FLGKIWPSHK', 'KRWIIMGLNK', 'LLWN

In [13]:
# For every saved pep+cdr3b test split, print each epitope with its
# (positive, negative) sample counts, epitopes in sorted order.
for i in range(5):
    df = pd.read_csv(DATA_BASE+f"ds.vdjdb-high-scores.hard-splits/pep+cdr3b/test/only-sampled-negs/test-{i}.csv")
    peps = sorted(df['antigen.epitope'].unique())
    summary_parts = []
    for pep in peps:
        pep_labels = df.loc[df['antigen.epitope'] == pep, 'label']
        n_pos = int((pep_labels == 1).sum())
        n_neg = int((pep_labels == 0).sum())
        summary_parts.append(pep + f' ({n_pos},{n_neg}), ')
    to_print = ''.join(summary_parts)
    print(f"Split {i}", to_print)


Split 0 AAFKRSCLK (5,10), APRGPHGGAASGL (5,10), CRVLCCYVL (31,60), CTELKLSDY (4,8), EAAGIGILTV (27,52), FLKETGGL (4,7), FLRGRAYGL (15,33), FPRPWLHGL (30,59), FPTKDVAL (10,20), FYGKTILWF (4,8), GPGMKARVL (4,8), GTSGSPIIDK (19,38), HPKVSSEVHI (25,48), HPVGEADYFEY (24,59), ISPRTLNAW (22,43), KLSALGINAV (5,10), LLWNGPMAV (18,36), LPPIVAKEI (20,40), LPRRSGAAGA (7,14), MLNIPSINV (27,51), NLSALGIFST (18,36), QIKVRVDMV (7,14), QYDPVAALF (11,22), RPHERNGFTVL (22,46), RPPIFIRRL (28,55), SLLMWITQV (5,10), SPRWYFYYL (14,30), TAFTIPSI (13,34), TPGPGVRYPL (33,65), VAANIVLTV (14,27), VTEHDTLLY (10,20), VVMSWAPPV (8,15), YLEPGPVTA (4,8), YSEHPTFTSQY (20,40), 
Split 1 AAGIGILTV (5,10), APRGPHGGAASGL (5,10), ARMILMTHF (14,26), CLGGLLTMV (4,8), CRVLCCYVL (31,60), DATYQRTRALVR (27,59), EAAGIGILTV (27,52), EPLPQGQLTAY (28,69), FLGKIWPSHK (8,16), FLKETGGL (4,7), FPRPWLHGL (30,59), FPTKDVAL (10,20), FYGKTILWF (4,8), GLNKIVRMY (13,26), HPVGEADYFEY (24,59), ILKEPVHGV (7,14), ISPRTLNAW (22,43), IVTDFSVIK (21,41

# Pep+CDR3b+CDR3a+MHC

In [12]:
# Build five hard train/test splits for the pep+cdr3b+cdr3a+MHC setting and write them to CSV.
# Load the dataset, force integer labels, keep one row per
# (epitope, CDR3 beta, CDR3 alpha, MHC sequence, label) combination and drop
# rows missing any of those fields; the index is reset to 0..n-1.
df = pd.read_csv(DATA_BASE+f"ds.csv")
df.label = df.label.apply(lambda x: int(x))
df = df.drop_duplicates(subset=["antigen.epitope", "cdr3.beta", "cdr3.alpha", "mhc.seq", "label"], keep="first")
df = df.dropna(subset=["antigen.epitope", "cdr3.beta", "cdr3.alpha", "mhc.seq", "label"]).reset_index(drop=True)

# Size and class balance of the cleaned frame.
print("Samples (FULL): ", len(df))
print("Pos: ", len(df[df.label==1]))
print("Neg: ", len(df[df.label==0]))

# One split per seed 0..4; the seed fixes the np.random draws made inside hard_split_df.
for i in range(5):
    set_random_seed(i)
    # NOTE(review): `only_sampled_negs_df` is not defined in any cell visible in this
    # notebook, so this fails with NameError on a fresh kernel. The recorded output of
    # this cell lists the same target epitopes and the same only-sampled-negs ratio as
    # the pep+cdr3b run, which suggests a stale frame from another dataset was split
    # here rather than one derived from the `df` loaded above — TODO confirm and
    # define the frame explicitly in this cell.
    train_df, test_df, test_peps = hard_split_df(
        only_sampled_negs_df, target_col="antigen.epitope", min_ratio=0.15, random_state=i, low=10, high=100
    )
    # training set considering also real negatives
    # Rows are dropped from `df` by the index labels of `test_df`; if `test_df` comes
    # from a frame with a different index, unrelated rows are removed — TODO confirm.
    train_df_full = df.drop(test_df.index)
    # Also remove rows whose `negative.source` is one of the test epitopes
    # (presumably the epitope a negative pair was generated from — verify).
    train_df_full = train_df_full[~train_df_full["negative.source"].isin(test_peps)]
    # Sanity check: no test epitope remains as a negative source in the full train set.
    for p in test_peps:
        assert not p in train_df_full["negative.source"].unique()

    # Persist the train split, the "full" train split and the test split for this seed.
    # NOTE(review): the output folder is named after ds.vdjdb-high-scores although this
    # cell reads ds.csv — verify the intended destination.
    train_df.to_csv(DATA_BASE+f"ds.vdjdb-high-scores.hard-splits/pep+cdr3b+cdr3a+MHC/train/only-sampled-negs/train-{i}.csv", index=False)
    train_df_full.to_csv(DATA_BASE+f"ds.vdjdb-high-scores.hard-splits/pep+cdr3b+cdr3a+MHC/train/only-sampled-negs.full/train-{i}.csv", index=False)
    test_df.to_csv(DATA_BASE+f"ds.vdjdb-high-scores.hard-splits/pep+cdr3b+cdr3a+MHC/test/only-sampled-negs/test-{i}.csv", index=False)
    
    # Test size relative to each of the two training sets.
    print("Actual test/train ratio (full): ", len(test_df) / len(train_df_full))
    print("Actual test/train ratio (only sampled negs): ", len(test_df) / len(train_df))

  interactivity=interactivity, compiler=compiler, result=result)


Samples (FULL):  110266
Pos:  28229
Neg:  82037
Tot pep: 266, Possible test pep: 81
Target antigen.epitope sequences: ['LPRRSGAAGA', 'NLSALGIFST', 'SPRWYFYYL', 'TPGPGVRYPL', 'VAANIVLTV', 'CRVLCCYVL', 'FPTKDVAL', 'KLSALGINAV', 'YLEPGPVTA', 'EAAGIGILTV', 'SLLMWITQV', 'VTEHDTLLY', 'LPPIVAKEI', 'QYDPVAALF', 'LLWNGPMAV', 'GTSGSPIIDK', 'CTELKLSDY', 'FYGKTILWF', 'RPPIFIRRL', 'FLKETGGL', 'TAFTIPSI', 'APRGPHGGAASGL', 'FLRGRAYGL', 'RPHERNGFTVL', 'AAFKRSCLK', 'GPGMKARVL', 'QIKVRVDMV', 'ISPRTLNAW', 'VVMSWAPPV', 'YSEHPTFTSQY', 'MLNIPSINV', 'HPKVSSEVHI', 'HPVGEADYFEY', 'FPRPWLHGL']
Actual test/train ratio (full):  0.014248001692467599
Actual test/train ratio (only sampled negs):  0.18043098427489807
Tot pep: 266, Possible test pep: 81
Target antigen.epitope sequences: ['KASEKIFYV', 'DATYQRTRALVR', 'VSFIEFVGW', 'CRVLCCYVL', 'YPLHEQHGM', 'APRGPHGGAASGL', 'TLNAWVKVV', 'FLKETGGL', 'AAGIGILTV', 'YSEHPTFTSQY', 'CLGGLLTMV', 'HPVGEADYFEY', 'RLRPGGKKR', 'GLNKIVRMY', 'FYGKTILWF', 'FLGKIWPSHK', 'KRWIIMGLNK', '

In [14]:
# For every saved pep+cdr3b+cdr3a+MHC test split, print each epitope with its
# (positive, negative) sample counts, epitopes in sorted order.
for i in range(5):
    df = pd.read_csv(DATA_BASE+f"ds.vdjdb-high-scores.hard-splits/pep+cdr3b+cdr3a+MHC/test/only-sampled-negs/test-{i}.csv")
    peps = sorted(df['antigen.epitope'].unique())
    summary_parts = []
    for pep in peps:
        pep_labels = df.loc[df['antigen.epitope'] == pep, 'label']
        n_pos = int((pep_labels == 1).sum())
        n_neg = int((pep_labels == 0).sum())
        summary_parts.append(pep + f' ({n_pos},{n_neg}), ')
    to_print = ''.join(summary_parts)
    print(f"Split {i}", to_print)


Split 0 AAFKRSCLK (5,10), APRGPHGGAASGL (5,10), CRVLCCYVL (31,60), CTELKLSDY (4,8), EAAGIGILTV (27,52), FLKETGGL (4,7), FLRGRAYGL (15,33), FPRPWLHGL (30,59), FPTKDVAL (10,20), FYGKTILWF (4,8), GPGMKARVL (4,8), GTSGSPIIDK (19,38), HPKVSSEVHI (25,48), HPVGEADYFEY (24,59), ISPRTLNAW (22,43), KLSALGINAV (5,10), LLWNGPMAV (18,36), LPPIVAKEI (20,40), LPRRSGAAGA (7,14), MLNIPSINV (27,51), NLSALGIFST (18,36), QIKVRVDMV (7,14), QYDPVAALF (11,22), RPHERNGFTVL (22,46), RPPIFIRRL (28,55), SLLMWITQV (5,10), SPRWYFYYL (14,30), TAFTIPSI (13,34), TPGPGVRYPL (33,65), VAANIVLTV (14,27), VTEHDTLLY (10,20), VVMSWAPPV (8,15), YLEPGPVTA (4,8), YSEHPTFTSQY (20,40), 
Split 1 AAGIGILTV (5,10), APRGPHGGAASGL (5,10), ARMILMTHF (14,26), CLGGLLTMV (4,8), CRVLCCYVL (31,60), DATYQRTRALVR (27,59), EAAGIGILTV (27,52), EPLPQGQLTAY (28,69), FLGKIWPSHK (8,16), FLKETGGL (4,7), FPRPWLHGL (30,59), FPTKDVAL (10,20), FYGKTILWF (4,8), GLNKIVRMY (13,26), HPVGEADYFEY (24,59), ILKEPVHGV (7,14), ISPRTLNAW (22,43), IVTDFSVIK (21,41