# Train & Test Splitting

This code splits the dataset into training and test sets using a stratified split on the potency class. The training set is then split into training and validation folds using stratified K-fold cross-validation. Finally, oversampling is performed on the training data only (the full training set and each fold's training portion) using SMOTE (Synthetic Minority Over-sampling Technique), which generates synthetic examples for the minority classes (“low potency” and “medium potency” data) by interpolating between existing samples.

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import StratifiedKFold


In [2]:
# Load the four standardized descriptor tables. Only the 3D table keeps its
# 'Molecule ChEMBL ID' column, so the ID appears exactly once after the
# column-wise concatenation below.
descriptors_3D = pd.read_csv('../2_descriptors/3D-descriptors-standardized.csv')
descriptors_quantum = pd.read_csv(
    '../2_descriptors/quantum-descriptors-standardized.csv'
).drop('Molecule ChEMBL ID', axis=1)
descriptors_topological = pd.read_csv(
    '../2_descriptors/topological-descriptors-standardized.csv'
).drop('Molecule ChEMBL ID', axis=1)
descriptors_physicochemical = pd.read_csv(
    '../2_descriptors/physicochemical-descriptors-standardized.csv'
).drop('Molecule ChEMBL ID', axis=1)

# NOTE(review): concatenating along axis=1 pairs rows by index, so this
# presumes every descriptor file lists the molecules in the same order --
# confirm against the descriptor-generation step.
descriptor_tables = [
    descriptors_3D,
    descriptors_quantum,
    descriptors_topological,
    descriptors_physicochemical,
]
descriptors_all = pd.concat(descriptor_tables, axis=1)

# Activity table: keep the ID, the raw IC50 value and the potency class.
ic50 = pd.read_csv(
    '../1_preprocess/TRPM8-homosapien-compounds-activities-processed.csv'
)[['Molecule ChEMBL ID', 'Standard Value', 'Potency']]

# np.log is the natural logarithm, so this column is -ln(value * 1e-9),
# not the base-10 pIC50.
# NOTE(review): the 1e-9 factor presumably converts nM to M -- confirm units.
ic50['-logIC50'] = ic50['Standard Value'].map(lambda value: -np.log(value * 1e-9))

# Attach the descriptors to each activity row (inner join on the ID) and
# persist the combined table; the raw IC50 column is not carried over.
dataset = pd.merge(
    ic50.drop(columns='Standard Value'),
    descriptors_all,
    on='Molecule ChEMBL ID',
)
dataset.to_csv('descriptors_all.csv')

dataset

Unnamed: 0,Molecule ChEMBL ID,Potency,-logIC50,PMI1,PMI2,PMI3,Asphericity,Eccentricity,InertialShapeFactor,NPR1,...,JGI10,JGT,VAdjMat,WPATH,WPOL,Zagreb,HOMO,LUMO,Electronegativity,Hardness
0,CHEMBL3235962,High Potency,16.304425,0.369589,-0.453167,-0.359669,-0.786832,-0.492323,-0.507247,0.674370,...,0.410879,-0.437691,0.066286,-0.248265,-0.025226,-0.041012,-0.564357,0.385010,0.239013,0.988571
1,CHEMBL3235983,High Potency,18.420681,-0.045649,-0.442597,-0.347368,-0.371923,0.097744,-0.407798,0.154252,...,1.801845,1.057619,-0.077703,-0.490616,-0.025226,-0.148730,-0.577388,0.462273,0.216764,1.061151
2,CHEMBL1650511,High Potency,21.607574,-0.542816,0.502877,0.305028,0.800887,0.884693,-0.058050,-0.883505,...,0.012615,0.706414,0.340829,0.041677,0.341398,0.605297,-0.216594,-0.024576,0.161722,0.253996
3,CHEMBL2443068,High Potency,15.283449,-1.170932,1.289978,1.018960,1.930956,1.151781,0.722147,-1.599513,...,0.603190,1.044686,-0.077703,-0.191888,-0.025226,-0.041012,-0.597355,-0.609858,0.666173,0.306054
4,CHEMBL3959823,High Potency,13.954773,-0.491711,-1.122955,-1.256339,-1.154136,-1.342932,-0.188332,1.264532,...,-0.931233,-1.661493,-0.540368,-0.943834,-0.831800,-0.848899,-0.128914,0.491027,-0.109105,0.519126
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
649,CHEMBL207433,Medium Potency,13.345507,-1.118508,1.026894,0.804390,1.791258,1.133224,0.590406,-1.521891,...,0.113261,0.295245,-0.706096,-0.774701,-1.125100,-0.848899,-0.396951,0.330364,0.143925,0.738670
650,CHEMBL1377190,High Potency,14.413348,-1.587974,-0.957778,-1.156982,1.548930,1.093272,2.451851,-1.381422,...,2.562260,-0.096277,-2.320871,-1.744103,-2.004998,-2.087658,-0.417137,-0.744077,0.594426,-0.017832
651,CHEMBL5275535,Medium Potency,13.178934,-1.669100,-1.047621,-1.271443,1.721990,1.123047,3.539572,-1.483223,...,-0.220777,-1.307609,-2.083448,-1.679672,-1.931673,-1.818362,-0.325860,-0.743732,0.530339,-0.132157
652,CHEMBL5269450,High Potency,15.056839,-1.550170,-1.082088,-1.294950,1.218241,1.020134,2.143348,-1.176419,...,-0.477191,0.191819,-2.083448,-1.675279,-1.931673,-1.926081,-0.398230,-0.763832,0.589204,-0.055940


In [11]:
def _smote_with_target(sampler, features, target, class_labels):
    """Oversample ``features`` and the continuous ``target`` in a single pass.

    The sampler only returns the resampled matrix and the resampled class
    labels, so the continuous target is appended as the last column of the
    matrix. Each synthetic row then carries a target value interpolated
    between the same two parent samples, with the same weight, as its
    features. Original rows keep their original target.

    Because the target is part of the resampled matrix, it also takes part
    in the sampler's nearest-neighbour search.

    Args:
        sampler: Object exposing ``fit_resample(X, y)`` (here: ``SMOTE``).
        features: 2-D array of descriptors, one row per sample.
        target: 1-D array of continuous target values, aligned with
            ``features``.
        class_labels: 1-D array of class labels used to balance the classes.

    Returns:
        Tuple ``(features_resampled, target_resampled, labels_resampled)``
        whose rows are aligned with each other.
    """
    stacked = np.column_stack([features, target])
    stacked_resampled, labels_resampled = sampler.fit_resample(stacked, class_labels)
    return stacked_resampled[:, :-1], stacked_resampled[:, -1], labels_resampled


def _as_frame(features, target, class_labels):
    """Return a DataFrame of ``features`` plus 'logIC50' and 'class' columns."""
    frame = pd.DataFrame(features)
    frame['logIC50'] = target
    frame['class'] = class_labels
    return frame


X = dataset.drop(columns=['Molecule ChEMBL ID', '-logIC50', 'Potency']).values  # Convert to NumPy array
y = dataset['-logIC50'].values  # Continuous target for regression, converted to NumPy array
y_class_labels = dataset['Potency'].values  # Labels for classification, converted to NumPy array

# Initial Stratified Train-Test Split (n_splits=1, so take the only split)
strat_split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_index, test_index = next(strat_split.split(X, y_class_labels))
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]
y_class_train, y_class_test = y_class_labels[train_index], y_class_labels[test_index]

# Apply SMOTE on the Training Set Only; the test set is never oversampled.
# Features and target are resampled together so that every synthetic row
# gets a target value consistent with its synthetic features.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote, y_class_train_smote = _smote_with_target(
    smote, X_train, y_train, y_class_train
)

# Saving Split Datasets to CSV
train_data = _as_frame(X_train_smote, y_train_smote, y_class_train_smote)
test_data = _as_frame(X_test, y_test, y_class_test)

train_data.to_csv('train_data_smote.csv', index=False)
test_data.to_csv('test_data.csv', index=False)

# For stratified k-fold cross-validation with SMOTE on each fold's training data
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_class_train), start=1):
    X_train_fold, X_val_fold = X_train[train_idx], X_train[val_idx]
    y_train_fold, y_val_fold = y_train[train_idx], y_train[val_idx]
    y_class_train_fold, y_class_val_fold = y_class_train[train_idx], y_class_train[val_idx]

    # Oversample only this fold's training portion; the validation portion
    # stays untouched so it contains real samples only.
    X_train_fold_smote, y_train_fold_smote, y_class_train_fold_smote = _smote_with_target(
        smote, X_train_fold, y_train_fold, y_class_train_fold
    )

    # Save each fold's training and validation sets as CSV files
    train_fold_data = _as_frame(
        X_train_fold_smote, y_train_fold_smote, y_class_train_fold_smote
    )
    val_fold_data = _as_frame(X_val_fold, y_val_fold, y_class_val_fold)

    train_fold_data.to_csv(f'train_fold_{fold}_smote.csv', index=False)
    val_fold_data.to_csv(f'val_fold_{fold}.csv', index=False)
    