In [1]:
import pandas as pd
import numpy as np
import random
import os
import operator

import sys
import logging
logger = logging.getLogger()
logger.addHandler(logging.StreamHandler(stream=sys.stdout))
logger = logging.getLogger()
logger.setLevel('INFO')

from typing import Tuple, List

In [2]:
def set_random_seed(random_seed):
    random.seed(random_seed)
    np.random.seed(random_seed)

In [3]:
login = os.getlogin( )
DATA_BASE = f"/home/{login}/Git/tc-hard/tc-hard-data/tc-hard/"
RESULTS_BASE = f"/home/{login}/Git/tc-hard/notebooks/notebooks.classification/results/"

In [4]:
def hard_split_df(
        df: pd.DataFrame, target_col: str, min_ratio: float, random_state: float, low: int, high: int
) -> Tuple[pd.DataFrame, pd.DataFrame, List[str]]:
    """ Assume a target column, e.g. `peptide`.
    Then:
        1) Select random sample
        2) All samples sharing the same value of that column
        with the randomly selected sample are used for test
        3)Repeat until test budget (defined by train/test ratio) is
        filled.
    """
    min_test_len = round(min_ratio * len(df))
    test_len = 0
    selected_target_val = []

    train_df = df.copy()
    test_df = pd.DataFrame()
    
    target_count_df = df.groupby([target_col]).size().reset_index(name='counts')
    target_count_df = target_count_df[target_count_df['counts'] >= low]
    target_count_df = target_count_df[target_count_df['counts'] <= high]
    possible_target_val = list(target_count_df[target_col].unique())
    max_target_len = len(possible_target_val)

    while test_len < min_test_len:
#         sample = train_df.sample(n=1, random_state=random_state)
#         target_val = sample[target_col].values[0]
        target_val = np.random.choice(possible_target_val)

        if target_val not in selected_target_val:
            to_test = train_df[train_df[target_col] == target_val]

            train_df = train_df.drop(to_test.index)
            test_df = test_df.append(to_test)
            test_len = len(test_df)

            selected_target_val.append(target_val)
            possible_target_val.remove(target_val)

        if len(selected_target_val) == max_target_len:
            logger.info(f"Possible targets left {possible_target_val}")
            raise Exception('No more values to sample from.')

    logger.info(f"Target {target_col} sequences: {selected_target_val}")

    return train_df, test_df, selected_target_val

# Pep+CDR3b

In [30]:
df = pd.read_csv(DATA_BASE+f"ds.csv")
df.label = df.label.apply(lambda x: int(x))
df = df.drop_duplicates(subset=["antigen.epitope", "cdr3.beta", "label"], keep="first").reset_index(drop=True)

print("Samples (FULL): ", len(df))
print("Pos: ", len(df[df.label==1]))
print("Neg: ", len(df[df.label==0]))

# dataframe with only randomized negative samples
only_sampled_negs_df = df[df["negative.source"] != "mira"]
only_sampled_negs_df = only_sampled_negs_df[only_sampled_negs_df["negative.source"] != "iedb"]
only_sampled_negs_df = only_sampled_negs_df[only_sampled_negs_df["negative.source"] != "nettcr-2.0"]    

print("Samples (ONLY RANDOMIZED NEGS): ", len(only_sampled_negs_df))
print("Pos: ", len(only_sampled_negs_df[only_sampled_negs_df.label==1]))
print("Neg: ", len(only_sampled_negs_df[only_sampled_negs_df.label==0]))

for i in range(5):
    set_random_seed(i)
    train_df, test_df, test_peps = hard_split_df(
        only_sampled_negs_df, target_col="antigen.epitope", min_ratio=0.15, random_state=i, low=500, high=10000
    )
    # training set considering also real negatives
    train_df_full = df.drop(test_df.index)
    train_df_full = train_df_full[~train_df_full["negative.source"].isin(test_peps)]
    for p in test_peps:
        assert not p in train_df_full["negative.source"].unique()

    train_df.to_csv(DATA_BASE+f"ds.hard-splits/pep+cdr3b/train/only-sampled-negs/train-{i}.csv", index=False)
    train_df_full.to_csv(DATA_BASE+f"ds.hard-splits/pep+cdr3b/train/only-sampled-negs.full/train-{i}.csv", index=False)
    test_df.to_csv(DATA_BASE+f"ds.hard-splits/pep+cdr3b/test/only-sampled-negs/test-{i}.csv", index=False)
    
    print("Actual test/train ratio (full): ", len(test_df) / len(train_df_full))
    print("Actual test/train ratio (only sampled negs): ", len(test_df) / len(train_df))

  interactivity=interactivity, compiler=compiler, result=result)


Samples (FULL):  528020
Pos:  142244
Neg:  385776
Samples (ONLY RANDOMIZED NEGS):  401303
Pos:  142244
Neg:  259059
Target antigen.epitope sequences: ['KRWIILGLNK', 'LITLATCELYHYQECV', 'RPHERNGFTVL', 'SELVIGAVIL', 'SEVGPEHSLAEY', 'CINGVCWTV', 'VPHVGEIPVAYRKVLL', 'FRCPRRFCF', 'KAYNVTQAF', 'YFPLQSYGF', 'STLPETAVVRR', 'YLNTLTLAV', 'YLQPRTFLL', 'EIYKRWII', 'RLRAEAQVK', 'SPFHPLADNKFAL', 'KPLEFGATSAAL', 'LSPRWYFYYL', 'YIFFASFYY', 'KLSYGIATV', 'GTSGSPIINR', 'YEDFLEYHDVRVVL', 'VLWAHGFEL', 'CRVLCCYVL', 'FVDGVPFVV', 'VLPPLLTDEMIAQYT', 'NRDVDTDFVNEFYAY', 'TTDPSFLGRY', 'RAKFKQLL', 'ITEEVGHTDLMAAY', 'FTISVTTEIL', 'GDAALALLLLDRLNQL']
Actual test/train ratio (full):  0.1297179449326153
Actual test/train ratio (only sampled negs):  0.17796779325689663
Target antigen.epitope sequences: ['KAFSPEVIPMF', 'EAAGIGILTV', 'SNEKQEILGTVSWNL', 'CINGVCWTV', 'SYFIASFRLFA', 'APKEIIFLEGETL', 'VLHSYFTSDYYQLY', 'RSVASQSIIAYTMSL', 'FLPFFSNVTWFHAI', 'ALRKVPTDNYITTY', 'VLPFNDGVYFASTEK', 'TLIGDCATV', 'AYKTFPPTEPK', 'ILGLP

In [21]:
for i in range(5):
    to_print = ''
    df = pd.read_csv(DATA_BASE+f"ds.hard-splits/pep+cdr3b/test/only-sampled-negs/test-{i}.csv")
    peps = sorted(df['antigen.epitope'].unique())
    for p in peps:
        temp_df = df[df['antigen.epitope']==p]
        pos = len(temp_df[temp_df.label == 1])
        neg = len(temp_df[temp_df.label == 0])
        to_print += p + f' ({pos},{neg}), '
    print(f"Split {i}", to_print)


Split 0 CINGVCWTV (186,385), CRVLCCYVL (435,859), EIYKRWII (180,359), FRCPRRFCF (266,529), FTISVTTEIL (198,395), FVDGVPFVV (2705,5093), GDAALALLLLDRLNQL (609,1191), GTSGSPIINR (173,345), ITEEVGHTDLMAAY (180,354), KAYNVTQAF (807,1579), KLSYGIATV (2458,4643), KPLEFGATSAAL (362,713), KRWIILGLNK (401,837), LITLATCELYHYQECV (251,499), LSPRWYFYYL (1751,3376), NRDVDTDFVNEFYAY (285,566), RAKFKQLL (996,3036), RLRAEAQVK (464,953), RPHERNGFTVL (207,414), SELVIGAVIL (900,1731), SEVGPEHSLAEY (270,534), SPFHPLADNKFAL (248,492), STLPETAVVRR (924,1802), TTDPSFLGRY (244,483), VLPPLLTDEMIAQYT (674,1325), VLWAHGFEL (731,1446), VPHVGEIPVAYRKVLL (528,1042), YEDFLEYHDVRVVL (874,1689), YFPLQSYGF (398,786), YIFFASFYY (353,703), YLNTLTLAV (432,860), YLQPRTFLL (687,1433), 
Split 1 ALRKVPTDNYITTY (346,682), APKEIIFLEGETL (1783,3356), AYKTFPPTEPK (337,663), CINGVCWTV (186,385), CTFEYVSQPFLM (196,389), EAAGIGILTV (505,1021), ELAGIGILTV (2074,4066), FIAGLIAIV (204,407), FLPFFSNVTWFHAI (299,592), FLPRVFSAV (867,1704

# Pep+CDR3b+CDR3a+MHC

In [34]:
df = pd.read_csv(DATA_BASE+f"ds.csv")
df.label = df.label.apply(lambda x: int(x))
df = df.drop_duplicates(subset=["antigen.epitope", "cdr3.beta", "cdr3.alpha", "mhc.seq", "label"], keep="first")
df = df.dropna(subset=["antigen.epitope", "cdr3.beta", "cdr3.alpha", "mhc.seq", "label"]).reset_index(drop=True)

print("Samples (FULL): ", len(df))
print("Pos: ", len(df[df.label==1]))
print("Neg: ", len(df[df.label==0]))

# dataframe with only randomized negative samples
only_sampled_negs_df = df[df["negative.source"] != "mira"]
only_sampled_negs_df = only_sampled_negs_df[only_sampled_negs_df["negative.source"] != "iedb"]
only_sampled_negs_df = only_sampled_negs_df[only_sampled_negs_df["negative.source"] != "nettcr-2.0"]    

print("Samples (ONLY RANDOMIZED NEGS): ", len(only_sampled_negs_df))
print("Pos: ", len(only_sampled_negs_df[only_sampled_negs_df.label==1]))
print("Neg: ", len(only_sampled_negs_df[only_sampled_negs_df.label==0]))

for i in range(5):
    set_random_seed(i)
    train_df, test_df, test_peps = hard_split_df(
        only_sampled_negs_df, target_col="antigen.epitope", min_ratio=0.15, random_state=i, low=100, high=5000
    )
    # training set considering also real negatives
    train_df_full = df.drop(test_df.index)
    train_df_full = train_df_full[~train_df_full["negative.source"].isin(test_peps)]
    for p in test_peps:
        assert not p in train_df_full["negative.source"].unique()

    train_df.to_csv(DATA_BASE+f"ds.hard-splits/pep+cdr3b+cdr3a+MHC/train/only-sampled-negs/train-{i}.csv", index=False)
    train_df_full.to_csv(DATA_BASE+f"ds.hard-splits/pep+cdr3b+cdr3a+MHC/train/only-sampled-negs.full/train-{i}.csv", index=False)
    test_df.to_csv(DATA_BASE+f"ds.hard-splits/pep+cdr3b+cdr3a+MHC/test/only-sampled-negs/test-{i}.csv", index=False)
    
    print("Actual test/train ratio (full): ", len(test_df) / len(train_df_full))
    print("Actual test/train ratio (only sampled negs): ", len(test_df) / len(train_df))

  interactivity=interactivity, compiler=compiler, result=result)


Samples (FULL):  110266
Pos:  28229
Neg:  82037
Samples (ONLY RANDOMIZED NEGS):  64681
Pos:  28229
Neg:  36452
Target antigen.epitope sequences: ['AVFDRKSDAK', 'DATYQRTRALVR', 'EAAGIGILTV', 'FLRGRAYGL', 'LLWNGPMAV', 'NLVPMVATV', 'PKYVKQNTLKLAT', 'FEDLRVSSF', 'RPRGEVRFL', 'RTLNAWVKV', 'IVTDFSVIK']
Actual test/train ratio (full):  0.11402303495655688
Actual test/train ratio (only sampled negs):  0.21136810562786779
Target antigen.epitope sequences: ['VLFGLGFAI', 'FLRGRAYGL', 'FEDLRVLSF', 'FLASKIGRLV', 'FTSDYYQLY', 'EAAGIGILTV', 'KLVALGINAV', 'AVFDRKSDAK', 'LLWNGPMAV', 'CINGVCWTV', 'KLSALGINAV', 'FLYALALLL', 'LTDEMIAQY', 'YVLDHLIVV', 'FLCMKALLL', 'VVMSWAPPV', 'RLRAEAQVK']
Actual test/train ratio (full):  0.10933821606068532
Actual test/train ratio (only sampled negs):  0.2019586345306896
Target antigen.epitope sequences: ['YLQPRTFLL', 'GLCTLVAML', 'FEDLRVLSF', 'LTDEMIAQY', 'KLVALGINAV', 'FLRGRAYGL', 'FEDLRLLSF', 'YVLDHLIVV', 'VLFGLGFAI', 'FTSDYYQLY', 'NYNYLYRLF', 'KMVAVFYTT', 'SPRWYFYYL',

In [22]:
for i in range(5):
    to_print = ''
    df = pd.read_csv(DATA_BASE+f"ds.hard-splits/pep+cdr3b+cdr3a+MHC/test/only-sampled-negs/test-{i}.csv")
    peps = sorted(df['antigen.epitope'].unique())
    for p in peps:
        temp_df = df[df['antigen.epitope']==p]
        pos = len(temp_df[temp_df.label == 1])
        neg = len(temp_df[temp_df.label == 0])
        to_print += p + f' ({pos},{neg}), '
    print(f"Split {i}", to_print)


Split 0 AVFDRKSDAK (1852,3009), DATYQRTRALVR (100,200), EAAGIGILTV (42,84), FEDLRVSSF (34,68), FLRGRAYGL (43,86), IVTDFSVIK (747,1332), LLWNGPMAV (671,1306), NLVPMVATV (348,677), PKYVKQNTLKLAT (62,124), RPRGEVRFL (116,232), RTLNAWVKV (51,102), 
Split 1 AVFDRKSDAK (1852,3009), CINGVCWTV (84,166), EAAGIGILTV (42,84), FEDLRVLSF (45,90), FLASKIGRLV (35,70), FLCMKALLL (136,270), FLRGRAYGL (43,86), FLYALALLL (39,78), FTSDYYQLY (38,76), KLSALGINAV (45,90), KLVALGINAV (66,132), LLWNGPMAV (671,1306), LTDEMIAQY (131,261), RLRAEAQVK (442,832), VLFGLGFAI (35,69), VVMSWAPPV (41,82), YVLDHLIVV (141,281), 
Split 2 CTELKLSDY (61,122), EAAGIGILTV (42,84), ELAGIGILTV (530,1028), FEDLRLLSF (43,86), FEDLRVLSF (45,90), FLASKIGRLV (35,70), FLCMKALLL (136,270), FLRGRAYGL (43,86), FTSDYYQLY (38,76), GLCTLVAML (399,761), KLVALGINAV (66,132), KMVAVFYTT (42,83), LTDEMIAQY (131,261), NYNYLYRLF (35,70), PKYVKQNTLKLAT (62,124), RAKFKQLL (1268,2297), SPRWYFYYL (142,281), VLFGLGFAI (35,69), YLQPRTFLL (267,528), YVLDH