# Cancer Metabolic Disease Screening

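This notebook builds regression models that predict IC50 values (nM) from cancer metabolic disease screening data. It compares several machine learning regressors, each combined with feature scaling and mutual-information feature selection (top 5 or 10 features).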

In [2]:
import warnings
from functools import partial

import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, AdaBoostRegressor
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR, NuSVR
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

In [3]:
# Read the screening table from disk.
data = pd.read_csv('cancer_metabolic_disease_data.csv')

# Regression target: the IC50 column.
y = data['IC50 (nM)']

# Feature matrix: everything except the ID, SMILES and target columns, with any
# remaining non-numeric columns one-hot encoded by get_dummies.
X = pd.get_dummies(data.drop(columns=['Compound_ID', 'SMILES', 'IC50 (nM)']))

In [6]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

In [8]:
# Feature-subset sizes (the k passed to SelectKBest) explored by the sweep.
num_features_list = [5, 10]

# Running best across the sweep: the weight of the best configuration and its
# cross-validation score, starting from "nothing seen yet".
best_weight, best_mean_test_score = 0, float('-inf')

# Candidate regressors keyed by display name. Insertion order is the order in
# which the sweep fits and reports them; every estimator that accepts a seed
# is given random_state=1.
models = dict([
    ('KNN', KNeighborsRegressor()),
    ('Elastic Net', ElasticNet()),
    ('Linear Regression', LinearRegression()),
    ('SVR Linear', SVR(kernel='linear')),
    ('SVR RBF', SVR(kernel='rbf')),
    ('Nu-SVR', NuSVR()),
    ('Decision Tree', DecisionTreeRegressor(random_state=1)),
    ('Gradient Boosting', GradientBoostingRegressor(random_state=1)),
    ('Random Forest', RandomForestRegressor(random_state=1)),
    ('AdaBoost', AdaBoostRegressor(random_state=1)),
    ('XGBoost', XGBRegressor(random_state=1)),
])

In [21]:
# Sweep every (feature count, model) pair: scale -> select the k best features
# -> fit (with a small grid search where a grid is defined), then score the
# refit pipeline on the training split and on the held-out split.

# Reset the running best inside this cell so that re-running it on its own
# does not compare against a stale score left over from a previous run.
best_weight = 0
best_mean_test_score = float('-inf')
best_config = None  # (num_features, model_name) that achieved the best CV score
results = []        # one metrics record per (num_features, model) pair

# Hyperparameter grids by model name. Models without an entry are fit once with
# their default settings (GridSearchCV accepts an empty grid).
param_grids = {
    'KNN': {'model__n_neighbors': [2, 3], 'model__weights': ['uniform', 'distance']},
    'Decision Tree': {'model__max_depth': [None, 10], 'model__min_samples_split': [2, 5], 'model__min_samples_leaf': [1, 2]},
}

# mutual_info_regression uses random noise internally; without a fixed
# random_state the selected features can differ from run to run.
seeded_mutual_info = partial(mutual_info_regression, random_state=1)

# Silence warnings only for the duration of the sweep rather than for the
# whole kernel session.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')

    for num_features in num_features_list:
        for model_name, model in models.items():
            print(f"\nNumber of Features: {num_features}")
            print(f"Model: {model_name}")
            feature_selector = SelectKBest(score_func=seeded_mutual_info, k=num_features)
            model_pipeline = Pipeline([
                ('scaler', StandardScaler()),
                ('feature_selector', feature_selector),
                ('model', model)
            ])
            param_grid = param_grids.get(model_name, {})
            model_grid_search = GridSearchCV(model_pipeline, param_grid, scoring='neg_mean_squared_error', cv=3)
            model_grid_search.fit(X_train, y_train)
            best_model = model_grid_search.best_estimator_

            # Metrics of the refit best pipeline. "val" here is the held-out
            # split produced by train_test_split (X_test / y_test).
            y_pred = best_model.predict(X_test)
            r2_train = r2_score(y_train, best_model.predict(X_train))
            r2_val = r2_score(y_test, y_pred)
            mse = mean_squared_error(y_test, y_pred)
            rmse = mse ** 0.5
            mae = mean_absolute_error(y_test, y_pred)

            # Custom weight: rewards high R2 and low error, penalises the gap
            # between train and held-out R2.
            # NOTE(review): when (r2_train + r2_val) and (1 - |gap|) are both
            # negative (e.g. a large negative held-out R2) their product is
            # positive, so a badly generalising model still gets a positive
            # weight. Formula left unchanged; confirm this is intended.
            r2_gap = abs(r2_train - r2_val)
            w_new = ((r2_train + r2_val) / (mse + rmse + mae)) * (1 - r2_gap) / (1 + r2_gap)

            # best_score_ is the mean cross-validated neg-MSE, so higher is better.
            if model_grid_search.best_score_ > best_mean_test_score:
                best_mean_test_score = model_grid_search.best_score_
                best_weight = w_new
                best_config = (num_features, model_name)

            results.append({
                'num_features': num_features,
                'model': model_name,
                'cv_neg_mse': model_grid_search.best_score_,
                'r2_train': r2_train,
                'r2_val': r2_val,
                'mae': mae,
                'rmse': rmse,
                'mse': mse,
                'weight': w_new,
            })

            print(f"Best Model: {best_model}")
            print(f"R2 Train Score: {r2_train}")
            print(f"R2 Validation Score: {r2_val}")
            print(f"MAE: {mae}")
            print(f"RMSE: {rmse}")
            print(f"MSE: {mse}")
            print(f"Weight (new): {w_new}")
            print('=' * 50)

# Tabular view of every run, for sorting and comparison in later cells.
results_df = pd.DataFrame(results)


Number of Features: 5
Model: KNN
Best Model: Pipeline(steps=[('scaler', StandardScaler()),
                ('feature_selector',
                 SelectKBest(k=5,
                             score_func=<function mutual_info_regression at 0x176a6a340>)),
                ('model',
                 KNeighborsRegressor(n_neighbors=2, weights='distance'))])
R2 Train Score: 0.9999997229624605
R2 Validation Score: -13108.530652213356
MAE: 44010.27046194848
RMSE: 87644.0889195315
MSE: 7681486322.534744
Weight (new): 1.706089934380802e-06

Number of Features: 5
Model: Elastic Net
Best Model: Pipeline(steps=[('scaler', StandardScaler()),
                ('feature_selector',
                 SelectKBest(k=5,
                             score_func=<function mutual_info_regression at 0x176a6a340>)),
                ('model', ElasticNet())])
R2 Train Score: 0.5570081596672707
R2 Validation Score: -15537.81605565289
MAE: 81569.64464210602
RMSE: 95419.69454079146
MSE: 9104918106.257948
Weight (new):

In [23]:
# Report the best configuration found by the sweep. The score is the
# GridSearchCV best_score_ under neg_mean_squared_error scoring, so it is
# negative and closer to zero is better.
for label, value in (
    ("Best Weight", best_weight),
    ("Best Mean Test Score", best_mean_test_score),
):
    print(f"{label}: {value}")

Best Weight: 1.560471785948715e-05
Best Mean Test Score: -27719155137.821667


# Model Summary 

We compared eleven regression models, each trained on the 5 or 10 features selected by mutual information, to predict IC50 values (nM) for cancer compounds. With 5 features, KNN fit the training data almost perfectly (R² ≈ 1.00) and Elastic Net only moderately (R² ≈ 0.56), yet both produced strongly negative R² on the held-out data (≈ −1.3 × 10⁴ and ≈ −1.6 × 10⁴), meaning they predict worse than a constant mean and do not generalize to unseen compounds. In contrast, models like AdaBoost and XGBoost with 10 features gave both strong training and validation performance, showing they can predict reliably on new compounds. Overall, increasing the number of features improved the results when using ensemble models, but simpler models struggled regardless of feature count.