In [15]:
import pandas as pd
import numpy as np
import random
import os
import operator

import sys
import logging
logger = logging.getLogger()
logger.addHandler(logging.StreamHandler(stream=sys.stdout))
logger = logging.getLogger()
logger.setLevel('INFO')

from typing import Tuple, List

In [16]:
def set_random_seed(random_seed):
    random.seed(random_seed)
    np.random.seed(random_seed)

In [17]:
login = os.getlogin( )
DATA_BASE = f"/home/{login}/Git/tc-hard/tc-hard-data/tc-hard/"
RESULTS_BASE = f"/home/{login}/Git/tc-hard/notebooks/notebooks.classification/results/"

In [22]:
def hard_split_df(
        df: pd.DataFrame, target_col: str, min_ratio: float, random_state: float, low: int, high: int
) -> Tuple[pd.DataFrame, pd.DataFrame, List[str]]:
    """ Assume a target column, e.g. `peptide`.
    Then:
        1) Select random sample
        2) All samples sharing the same value of that column
        with the randomly selected sample are used for test
        3)Repeat until test budget (defined by train/test ratio) is
        filled.
    """
    min_test_len = round(min_ratio * len(df))
    test_len = 0
    selected_target_val = []

    train_df = df.copy()
    test_df = pd.DataFrame()
    
    target_count_df = df.groupby([target_col]).size().reset_index(name='counts')
    target_count_df = target_count_df[target_count_df['counts'] >= low]
    target_count_df = target_count_df[target_count_df['counts'] <= high]
    possible_target_val = list(target_count_df[target_col].unique())
    max_target_len = len(possible_target_val)

    while test_len < min_test_len:
#         sample = train_df.sample(n=1, random_state=random_state)
#         target_val = sample[target_col].values[0]
        target_val = np.random.choice(possible_target_val)

        if target_val not in selected_target_val:
            to_test = train_df[train_df[target_col] == target_val]

            train_df = train_df.drop(to_test.index)
            test_df = test_df.append(to_test)
            test_len = len(test_df)

            selected_target_val.append(target_val)
            possible_target_val.remove(target_val)

        if len(selected_target_val) == max_target_len:
            logger.info(f"Possible targets left {possible_target_val}")
            raise Exception('No more values to sample from.')

    logger.info(f"Target {target_col} sequences: {selected_target_val}")

    return train_df, test_df, selected_target_val

# Pep+CDR3b

In [23]:
df = pd.read_csv(DATA_BASE+f"ds.csv")
df.label = df.label.apply(lambda x: int(x))
df = df.drop_duplicates(subset=["antigen.epitope", "cdr3.beta", "label"], keep="first").reset_index(drop=True)

print("Samples (FULL): ", len(df))
print("Pos: ", len(df[df.label==1]))
print("Neg: ", len(df[df.label==0]))

# dataframe with negative samples from assays
only_neg_assays_df = df[df["negative.source"] != "randomized"] 

print("Samples (ONLY RANDOMIZED NEGS): ", len(only_neg_assays_df))
print("Pos: ", len(only_neg_assays_df[only_neg_assays_df.label==1]))
print("Neg: ", len(only_neg_assays_df[only_neg_assays_df.label==0]))

for i in range(5):
    set_random_seed(i)
    train_df, test_df, test_peps = hard_split_df(
        only_neg_assays_df, target_col="antigen.epitope", min_ratio=0.15, random_state=i, low=500, high=10000
    )

    # training set considering also randomized negatives
    train_df_full = df.drop(test_df.index)
    train_df_full = train_df_full[~train_df_full["negative.source"].isin(test_peps)]
    for p in test_peps:
        assert not p in train_df_full["negative.source"].unique()

    train_df.to_csv(DATA_BASE+f"ds.hard-splits/pep+cdr3b/train/only-neg-assays/train-{i}.csv", index=False)
    train_df_full.to_csv(DATA_BASE+f"ds.hard-splits/pep+cdr3b/train/only-neg-assays.full/train-{i}.csv", index=False)
    test_df.to_csv(DATA_BASE+f"ds.hard-splits/pep+cdr3b/test/only-neg-assays/test-{i}.csv", index=False)
    
    print("Actual test/train ratio (full): ", len(test_df) / len(train_df_full))
    print("Actual test/train ratio (only neg assays): ", len(test_df) / len(train_df))

  interactivity=interactivity, compiler=compiler, result=result)


Samples (FULL):  528020
Pos:  142244
Neg:  385776
Samples (ONLY RANDOMIZED NEGS):  268961
Pos:  142244
Neg:  126717
Target antigen.epitope sequences: ['WICLLQFAY', 'YVLDHLIVV', 'AELAKNVSLDNVL', 'ELAGIGILTV', 'FLNGSCGSV', 'VPHVGEIPVAYRKVLL', 'HTTDPSFLGRY', 'LSPRWYFYYL', 'MPASWVMRI', 'VQELYSPIFLIV', 'RAKFKQLL', 'FVDGVPFVV', 'RTQSPRRR', 'SEHDYQIGGYTEKW', 'KLPDDFTGCV', 'AVFDRKSDAK', 'STLPETAVVRR', 'LLWNGPMAV', 'TVLSFCAFAV']
Target antigen.epitope sequences: ['WICLLQFAY', 'YVLDHLIVV', 'AELAKNVSLDNVL', 'ELAGIGILTV', 'FLNGSCGSV', 'VPHVGEIPVAYRKVLL', 'HTTDPSFLGRY', 'LSPRWYFYYL', 'MPASWVMRI', 'VQELYSPIFLIV', 'RAKFKQLL', 'FVDGVPFVV', 'RTQSPRRR', 'SEHDYQIGGYTEKW', 'KLPDDFTGCV', 'AVFDRKSDAK', 'STLPETAVVRR', 'LLWNGPMAV', 'TVLSFCAFAV']
Actual test/train ratio (full):  0.08302908479304262
Actual test/train ratio (only neg assays):  0.1771700929180107
Target antigen.epitope sequences: ['TPRVTGGGAM', 'WICLLQFAY', 'HTTDPSFLGRY', 'FPPTSFGPL', 'GDAALALLLLDRLNQL', 'IMLIIFWFSL', 'FLNGSCGSV', 'LLLDDFVEII', '

In [8]:
for i in range(5):
    to_print = ''
    df = pd.read_csv(DATA_BASE+f"ds.hard-splits/pep+cdr3b/test/only-neg-assays/test-{i}.csv")
    peps = sorted(df['antigen.epitope'].unique())
    for p in peps:
        temp_df = df[df['antigen.epitope']==p]
        pos = len(temp_df[temp_df.label == 1])
        neg = len(temp_df[temp_df.label == 0])
        to_print += p + f' ({pos},{neg}), '
    print(f"Split {i}", to_print)

Split 0 AELAKNVSLDNVL (1794,0), AVFDRKSDAK (1967,0), ELAGIGILTV (2074,2), FLNGSCGSV (2568,0), FVDGVPFVV (2705,0), HTTDPSFLGRY (5787,0), KLPDDFTGCV (1319,0), LLWNGPMAV (2559,0), LSPRWYFYYL (1751,0), MPASWVMRI (777,0), RAKFKQLL (996,0), RTQSPRRR (51,763), SEHDYQIGGYTEKW (3424,0), STLPETAVVRR (924,41), TVLSFCAFAV (613,0), VPHVGEIPVAYRKVLL (528,0), VQELYSPIFLIV (1063,0), WICLLQFAY (590,0), YVLDHLIVV (8184,0), 
Split 1 AELAKNVSLDNVL (1794,0), AVFDRKSDAK (1967,0), FLNGSCGSV (2568,0), FLYALALLL (32,823), FPPTSFGPL (681,0), GDAALALLLLDRLNQL (609,0), GMEVTPSGTWLTY (995,0), HTTDPSFLGRY (5787,0), IMDQVPFSV (62,522), IMLIIFWFSL (1278,0), LLFGYPVYV (79,1358), LLLDDFVEII (968,0), LLWNGPMAV (2559,0), LPRRSGAAGA (2138,0), LSPRWYFYYL (1751,0), QLMCQPILLL (980,0), RQLLFVVEV (892,0), RTQSPRRR (51,763), SELVIGAVIL (900,0), TLIGDCATV (568,0), TPRVTGGGAM (2557,0), TVLSFCAFAV (613,0), WICLLQFAY (590,0), YEQYIKWPWYI (537,0), YLQPRTFLL (687,0), YVLDHLIVV (8184,0), 
Split 2 EAAGIGILTV (505,0), FLNGSCGSV (2568,0

# Pep+CDR3b+CDR3a+MHC

In [9]:
df = pd.read_csv(DATA_BASE+f"ds.csv")
df.label = df.label.apply(lambda x: int(x))
df = df.drop_duplicates(subset=["antigen.epitope", "cdr3.beta", "cdr3.alpha", "mhc.seq", "label"], keep="first")
df = df.dropna(subset=["antigen.epitope", "cdr3.beta", "cdr3.alpha", "mhc.seq", "label"]).reset_index(drop=True)

print("Samples (FULL): ", len(df))
print("Pos: ", len(df[df.label==1]))
print("Neg: ", len(df[df.label==0]))

# dataframe with negative samples from assays
only_neg_assays_df = df[df["negative.source"] != "randomized"]    

print("Samples (ONLY RANDOMIZED NEGS): ", len(only_neg_assays_df))
print("Pos: ", len(only_neg_assays_df[only_neg_assays_df.label==1]))
print("Neg: ", len(only_neg_assays_df[only_neg_assays_df.label==0]))

for i in range(5):
    set_random_seed(i)
    train_df, test_df, test_peps = hard_split_df(
        only_neg_assays_df, target_col="antigen.epitope", min_ratio=0.15, random_state=i, low=100, high=5000
    )
    
    # training set considering also randomized negatives
    train_df_full = df.drop(test_df.index)
    train_df_full = train_df_full[~train_df_full["negative.source"].isin(test_peps)]
    for p in test_peps:
        assert not p in train_df_full["negative.source"].unique()
        
    train_df.to_csv(DATA_BASE+f"ds.hard-splits/pep+cdr3b+cdr3a+MHC/train/only-neg-assays/train-{i}.csv", index=False)
    train_df_full.to_csv(DATA_BASE+f"ds.hard-splits/pep+cdr3b+cdr3a+MHC/train/only-neg-assays.full/train-{i}.csv", index=False)
    test_df.to_csv(DATA_BASE+f"ds.hard-splits/pep+cdr3b+cdr3a+MHC/test/only-neg-assays/test-{i}.csv", index=False)
    

    print("Actual test/train ratio (full): ", len(test_df) / len(train_df_full))
    print("Actual test/train ratio (only neg assays): ", len(test_df) / len(train_df))

  interactivity=interactivity, compiler=compiler, result=result)


Samples (FULL):  110266
Pos:  28229
Neg:  82037
Samples (ONLY RANDOMIZED NEGS):  73814
Pos:  28229
Neg:  45585
Target antigen.epitope sequences: ['MLDLQPETT', 'RPRGEVRFL', 'YLQPRTFLL', 'AVFDRKSDAK', 'FLYALALLL', 'IMDQVPFSV', 'LLWNGPMAV', 'RLRAEAQVK', 'KTWGQYWQV', 'RAKFKQLL', 'LTDEMIAQY', 'SLFNTVATL', 'SLLMWITQV', 'YLLEMLWRL', 'ELAGIGILTV', 'RTLNAWVKV', 'TTDPSFLGRY', 'YVLDHLIVV', 'SPRWYFYYL', 'DATYQRTRALVR', 'IVTDFSVIK', 'KVLEYVIKV', 'LLFGYPVYV']
Actual test/train ratio (full):  0.11668557076885684
Actual test/train ratio (only neg assays):  0.18496757207988185
Target antigen.epitope sequences: ['IMDQVPFSV', 'MLDLQPETT', 'RLRAEAQVK', 'LLFGYPVYV', 'LTDEMIAQY', 'RPRGEVRFL', 'IVTDFSVIK', 'YLLEMLWRL', 'AVFDRKSDAK', 'DATYQRTRALVR', 'FLCMKALLL', 'YLQPRTFLL', 'RTLNAWVKV', 'RMFPNAPYL', 'TTDPSFLGRY', 'KTWGQYWQV', 'RAKFKQLL', 'SPRWYFYYL', 'SLLMWITQV', 'KVLEYVIKV', 'YVLDHLIVV', 'SLFNTVATL', 'ELAGIGILTV', 'FLYALALLL']
Actual test/train ratio (full):  0.11434952653333467
Actual test/train ratio (onl

In [10]:
for i in range(5):
    to_print = ''
    df = pd.read_csv(DATA_BASE+f"ds.hard-splits/pep+cdr3b+cdr3a+MHC/test/only-neg-assays/test-{i}.csv")
    peps = sorted(df['antigen.epitope'].unique())
    for p in peps:
        temp_df = df[df['antigen.epitope']==p]
        pos = len(temp_df[temp_df.label == 1])
        neg = len(temp_df[temp_df.label == 0])
        to_print += p + f' ({pos},{neg}), '
    print(f"Split {i}", to_print)


Split 0 AVFDRKSDAK (1852,0), DATYQRTRALVR (100,0), ELAGIGILTV (530,2), FLYALALLL (39,827), IMDQVPFSV (23,483), IVTDFSVIK (747,0), KTWGQYWQV (18,411), KVLEYVIKV (8,166), LLFGYPVYV (74,1178), LLWNGPMAV (671,0), LTDEMIAQY (131,0), MLDLQPETT (14,333), RAKFKQLL (1268,0), RLRAEAQVK (442,0), RPRGEVRFL (116,0), RTLNAWVKV (51,572), SLFNTVATL (22,210), SLLMWITQV (12,146), SPRWYFYYL (142,0), TTDPSFLGRY (254,0), YLLEMLWRL (13,259), YLQPRTFLL (267,0), YVLDHLIVV (141,0), 
Split 1 AVFDRKSDAK (1852,0), DATYQRTRALVR (100,0), ELAGIGILTV (530,2), FLCMKALLL (136,0), FLYALALLL (39,827), IMDQVPFSV (23,483), IVTDFSVIK (747,0), KTWGQYWQV (18,411), KVLEYVIKV (8,166), LLFGYPVYV (74,1178), LTDEMIAQY (131,0), MLDLQPETT (14,333), RAKFKQLL (1268,0), RLRAEAQVK (442,0), RMFPNAPYL (14,314), RPRGEVRFL (116,0), RTLNAWVKV (51,572), SLFNTVATL (22,210), SLLMWITQV (12,146), SPRWYFYYL (142,0), TTDPSFLGRY (254,0), YLLEMLWRL (13,259), YLQPRTFLL (267,0), YVLDHLIVV (141,0), 
Split 2 AVFDRKSDAK (1852,0), DATYQRTRALVR (100,0), ELA

In [14]:
len(test_df[test_df.label==0]) / len(test_df[test_df.label==1])

0.001065200158541419