# Precision Oncology: Genomic Drug Sensitivity Prediction

# Splitting

In [1]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from rdkit import Chem
from rdkit.Chem.Scaffolds import MurckoScaffold
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold

In [2]:
# import cleaned data
# Load the cleaned, merged dataset from CSV (path is relative to the working directory).
# Later cells read the `smiles` column and every column whose name contains "FP_" from this frame.
df = pd.read_csv("data/merged_data_clean.csv")

## Step 1: Scaffold Split

In [3]:
# scaffold splitting (preparation)

# define function
def get_scaffold(smiles):
    """Return the Murcko scaffold of a molecule as a SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES representation of the molecule.

    Returns
    -------
    str or None
        SMILES of the scaffold produced by ``MurckoScaffold.GetScaffoldForMol``,
        or ``None`` when RDKit cannot parse ``smiles`` into a molecule.
    """
    parsed = Chem.MolFromSmiles(smiles)
    if parsed is not None:
        # Reduce the molecule to its scaffold, then serialise it back to SMILES.
        return Chem.MolToSmiles(MurckoScaffold.GetScaffoldForMol(parsed))
    # RDKit signals an unparsable SMILES by returning None; propagate that.
    return None

# compute one scaffold SMILES per row from the `smiles` column
scaffold_column = df['smiles'].apply(get_scaffold)
df['scaffold'] = scaffold_column

# show the five most frequent scaffolds with their row counts
top_scaffolds = df['scaffold'].value_counts().head()
print(top_scaffolds)

scaffold
                                                                                                       2944
c1ccccc1                                                                                               2645
O=C1Cc2cc3n(c(=O)c2CO1)Cc1cc2ccccc2nc1-3                                                               2643
O=C(NCCCNC[C@@H]1CC[C@H](n2ccc3cncnc32)O1)Nc1ccccc1                                                    1911
O=C1CC/C=C/CC(=O)C[C@@H](CCC2CCCCC2)OC(=O)[C@@H]2CCCCN2C(=O)C(=O)C2CCC[C@@H](CC/C=C/C=C/C=C/CCC1)O2    1872
Name: count, dtype: int64


In [4]:
# scaffold splitting (split)
# Whole scaffolds are assigned to either train or test, so no scaffold is
# shared between the two partitions.

# get unique scaffolds
unique_scaffolds = df['scaffold'].unique()

# shuffle (seeded so the assignment order, and therefore the split, is reproducible)
np.random.seed(42)
np.random.shuffle(unique_scaffolds)

# count rows per scaffold ONCE; the previous version filtered the full frame
# inside the loop for every scaffold, which is needlessly slow on a wide frame.
# value_counts() skips missing scaffolds, so they count as 0 rows below,
# exactly as the elementwise `==` comparison did before.
scaffold_sizes = df['scaffold'].value_counts().to_dict()

# create lists
train_scaffolds = []
test_scaffolds = []

# fill the training set greedily up to 80% of all rows; any scaffold that
# would push train past the target goes to test instead (~20% for test)
train_count = 0
target_train_size = len(df) * 0.8

for scaffold in unique_scaffolds:
    n_molecules = scaffold_sizes.get(scaffold, 0)

    if train_count + n_molecules < target_train_size:
        train_scaffolds.append(scaffold)
        train_count += n_molecules
    else:
        test_scaffolds.append(scaffold)

# filter rows by the partition their scaffold was assigned to
df_train = df[df['scaffold'].isin(train_scaffolds)].copy()
df_test = df[df['scaffold'].isin(test_scaffolds)].copy()

print(f"Train Shape: {df_train.shape}")
print(f"Test Shape: {df_test.shape}")

Train Shape: (159752, 1790)
Test Shape: (40212, 1790)


In [5]:
# test for leakage
# Sanity check: no scaffold value may appear in both partitions.
train_unique = set(df_train['scaffold'].unique())
test_unique = set(df_test['scaffold'].unique())

# scaffolds present on both sides (should be empty)
overlap = train_unique & test_unique

if not overlap:
    print("Scaffold splitting successful. No leakage detected.")
else:
    print("Scaffold splitting failed. Leakage detected.")

Scaffold splitting successful. No leakage detected.


## Step 2: Feature Selection

In [6]:
# feature selection
# Drop low-variance fingerprint columns. The selector is fitted on the
# training rows only, so the test set has no influence on which columns are
# kept; the same column subset is then applied to both partitions.
# (VarianceThreshold is imported in the notebook's import cell.)

# fingerprint columns are identified by "FP_" in the column name
fp_columns = [c for c in df_train.columns if "FP_" in c]

selector = VarianceThreshold(threshold=0.01)

X_train_fp = df_train[fp_columns]
selector.fit(X_train_fp)

# columns whose training-set variance passed the threshold
good_fp_columns = X_train_fp.columns[selector.get_support()]

print(f"Original FPs: {len(fp_columns)}")
print(f"Selected FPs: {len(good_fp_columns)}")

# filter
# BUG FIX: the previous version re-attached `fp_columns` (all fingerprint
# columns) here, so the selection above was computed but never applied.
# Keep every non-fingerprint column plus only the selected fingerprint columns.
fp_column_set = set(fp_columns)
non_fp_columns = [c for c in df_train.columns if c not in fp_column_set]
keep_columns = non_fp_columns + list(good_fp_columns)

df_train = df_train[keep_columns]
df_test = df_test[keep_columns]

Original FPs: 1024
Selected FPs: 875


## Step 3: Saving Training & Testing Sets

In [7]:
# save final train and test sets
# Write both partitions to CSV; index=False leaves the DataFrame row index out of the files.
df_train.to_csv('data/train_scaffold_split.csv', index=False)
df_test.to_csv('data/test_scaffold_split.csv', index=False)