# Train & Test Splitting

This code splits the dataset into training (85%) and test (15%) sets using stratified sampling. Stratification is based on three potency labels: high potency, medium potency, and low potency.

Then from the training set, the data is split into training and validation folds using stratified K-fold cross-validation.

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split

seed=345 # Random seed intended to be shared by every split in this notebook so the splits are reproducible

In [2]:
# Read descriptors
id_column = 'Molecule ChEMBL ID'
descriptors_3D = pd.read_csv('../2_descriptors/3D-descriptors-standardized.csv')
descriptors_quantum = pd.read_csv('../2_descriptors/quantum-descriptors-standardized.csv')
descriptors_topological = pd.read_csv('../2_descriptors/topological-descriptors-standardized.csv')
descriptors_physicochemical = pd.read_csv('../2_descriptors/physicochemical-descriptors-standardized.csv')

# The tables are joined positionally (axis=1) after the ID column is dropped from all but
# the 3D table, so every file must list the same compounds in the same order. Fail loudly
# instead of silently pairing descriptors with the wrong molecule.
for family, table in [
    ('quantum', descriptors_quantum),
    ('topological', descriptors_topological),
    ('physicochemical', descriptors_physicochemical),
]:
    if not table[id_column].equals(descriptors_3D[id_column]):
        raise ValueError(
            f"'{id_column}' values in the {family} descriptor file do not match the 3D descriptor file row-for-row"
        )

descriptors_quantum = descriptors_quantum.drop(columns=[id_column])
descriptors_topological = descriptors_topological.drop(columns=[id_column])
descriptors_physicochemical = descriptors_physicochemical.drop(columns=[id_column])
descriptors_all = pd.concat([descriptors_3D,descriptors_quantum,descriptors_topological,descriptors_physicochemical],axis=1)

# Read -log(IC50) values
# .copy() makes the column subset an independent frame, so adding a column below does not
# trigger pandas' SettingWithCopyWarning.
ic50 = pd.read_csv('../1_preprocess/TRPM8-homosapien-compounds-activities-processed.csv')[['Molecule ChEMBL ID','Standard Value','Potency']].copy()
# NOTE(review): np.log is the NATURAL log, so this column is -ln(value * 1e-9), not the
# conventional log10-based pIC50. The 1e-9 factor presumably converts nM to M -- confirm.
ic50['-logIC50'] = -np.log(ic50['Standard Value'] * 1e-9)

# Merge with descriptors (inner join on the compound ID; 'Standard Value' is not carried over)
dataset = ic50.drop(columns='Standard Value').merge(descriptors_all, on='Molecule ChEMBL ID')
# The DataFrame index is written too (pandas default), as an unnamed first column.
dataset.to_csv('descriptors_all.csv')

dataset.head()

Unnamed: 0,Molecule ChEMBL ID,Potency,-logIC50,PMI1,PMI2,PMI3,Asphericity,Eccentricity,InertialShapeFactor,NPR1,...,HAcceptors,HDonors,heteroatoms,rotatableBonds,saturatedCarbocycles,saturatedHeterocycles,satureatedRings,ringCount,molLogP,molMR
0,CHEMBL3235962,High Potency \n(less than 1 $\mu$M),16.304425,0.369589,-0.453167,-0.359669,-0.786832,-0.492323,-0.507247,0.67437,...,-1.25873,-0.131926,-0.580284,-1.503645,-0.374196,-0.315596,-0.499087,0.265767,0.851695,-0.158908
1,CHEMBL3235983,High Potency \n(less than 1 $\mu$M),18.420681,-0.045649,-0.442597,-0.347368,-0.371923,0.097744,-0.407798,0.154252,...,-1.742971,-0.131926,0.362978,-1.503645,-0.374196,-0.315596,-0.499087,-0.787636,0.615532,-0.912202
2,CHEMBL1650511,High Potency \n(less than 1 $\mu$M),21.607574,-0.542816,0.502877,0.305028,0.800887,0.884693,-0.05805,-0.883505,...,-0.774489,-0.131926,0.362978,-1.503645,2.229254,-0.315596,1.579913,1.319169,1.89026,-0.192731
3,CHEMBL2443068,High Potency \n(less than 1 $\mu$M),15.283449,-1.170932,1.289978,1.01896,1.930956,1.151781,0.722147,-1.599513,...,-0.774489,-0.131926,0.048557,-1.945814,-0.374196,2.960593,1.579913,0.265767,0.567644,-0.405199
4,CHEMBL3959823,High Potency \n(less than 1 $\mu$M),13.954773,-0.491711,-1.122955,-1.256339,-1.154136,-1.342932,-0.188332,1.264532,...,-1.25873,-0.131926,-1.523545,0.265031,-0.374196,-0.315596,-0.499087,-0.787636,-0.617269,-0.365892


In [3]:
# Columns that identify a compound or hold a label/target; everything else is a model feature.
non_feature_columns = ['Molecule ChEMBL ID', '-logIC50', 'Potency']
feature_columns = dataset.columns.drop(non_feature_columns)

X = dataset.drop(columns=non_feature_columns).values  # Features array
y = dataset['-logIC50'].values  # Continuous target for regression
potency = dataset['Potency'].values  # Class labels, used only to stratify the splits


def features_with_target(features, target):
    """Build the frame that gets saved: the feature columns followed by '-logIC50'.

    features: 2D array whose columns are in the order of `feature_columns`.
    target:   1D array of '-logIC50' values, one per row of `features`.
    """
    return pd.concat(
        [pd.DataFrame(features, columns=feature_columns), pd.Series(target, name='-logIC50')],
        axis=1,
    )


# Perform 85%-15% train-test split ensuring stratification by 'Potency'.
# The potency labels are split alongside X and y so the training-set labels are available
# for stratifying the cross-validation folds below. The notebook-wide `seed` is used here
# (it was previously declared but ignored in favour of a hard-coded 42).
X_train, X_test, y_train, y_test, potency_train, potency_test = train_test_split(
    X, y, potency, test_size=0.15, random_state=seed, stratify=potency
)

# Save the train and test sets as CSV
train_df = features_with_target(X_train, y_train)
test_df = features_with_target(X_test, y_test)
train_df.to_csv('train_set.csv', index=False)
test_df.to_csv('test_set.csv', index=False)

# Apply stratified 5-fold cross-validation on the training set, so every fold keeps
# roughly the same potency-class proportions as the full training set.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

for fold, (train_index, val_index) in enumerate(kf.split(X_train, potency_train), start=1):
    X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
    y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

    # Save each fold as CSV
    fold_train_df = features_with_target(X_train_fold, y_train_fold)
    fold_val_df = features_with_target(X_val_fold, y_val_fold)

    fold_train_df.to_csv(f'train_fold_{fold}.csv', index=False)
    fold_val_df.to_csv(f'val_fold_{fold}.csv', index=False)