In [3]:
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    AdaBoostClassifier,
    ExtraTreesClassifier,
    BaggingClassifier,
)
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
import joblib
import numpy as np
import pandas as pd

In [2]:
pd_train = pd.read_csv("data/training_class.CSV")
pd_train["label"] = pd_train["Class"].apply(lambda x: 1 if x == "Hepatotoxicity" else 0)

In [4]:
pd_test = pd.read_csv("data/testing_class.CSV")
pd_test["label"] = pd_test["Class"].apply(lambda x: 1 if x == "Hepatotoxicity" else 0)

In [5]:
feature_sets = {
    20: ["FP890", "FP277", "FP937", "FP130", "FP823", "FP932"],
    50: [
        "FP349",
        "FP890",
        "FP277",
        "FP937",
        "KRFP298",
        "FP1007",
        "EStateFP33",
        "FP130",
        "KRFP297",
        "FP802",
        "FP1006",
        "FP823",
        "FP289",
        "FP932",
        "KRFPC3884",
    ],
    80: [
        "FP168",
        "KRFP297",
        "FP1006",
        "FP802",
        "FP823",
        "KRFPC4757",
        "FP890",
        "FP349",
        "KRFP298",
        "FP289",
        "FP277",
        "FP937",
        "FP1007",
        "EStateFP33",
        "FP932",
        "FP598",
        "KRFPC3389",
        "FP130",
        "KRFPC3884",
    ],
    100: [
        "FP168",
        "KRFP297",
        "FP1006",
        "FP802",
        "FP823",
        "KRFPC4757",
        "FP890",
        "FP349",
        "KRFP298",
        "FP289",
        "FP277",
        "FP937",
        "KRFP4757",
        "FP1007",
        "EStateFP33",
        "FP187",
        "FP932",
        "FP355",
        "FP598",
        "KRFPC3389",
        "FP316",
        "FP130",
        "KRFPC3884",
    ],
}

In [6]:
models = {
    "LogisticRegression": LogisticRegression(),
    "RidgeClassifier": RidgeClassifier(),
    "SGDClassifier": SGDClassifier(),
    "RandomForest": RandomForestClassifier(),
    "GradientBoosting": GradientBoostingClassifier(),
    "AdaBoost": AdaBoostClassifier(),
    "ExtraTrees": ExtraTreesClassifier(),
    "BaggingClassifier": BaggingClassifier(),
    "SVM": SVC(probability=True),
    "LinearSVC": LinearSVC(),
    "KNN": KNeighborsClassifier(),
    "GaussianNB": GaussianNB(),
    "DecisionTree": DecisionTreeClassifier(),
    "XGBoost": XGBClassifier(),
    "LightGBM": LGBMClassifier(),
    "CatBoost": CatBoostClassifier(verbose=0),
}

In [7]:
def select_top_k_features(pd_input_train, pd_input_test, selected_features):
    # train val split with sklearn
    X_train, X_val, y_train, y_val = train_test_split(
        pd_input_train[selected_features],
        pd_input_train["label"],
        test_size=0.2,
        random_state=42,
    )
    X_test = pd_input_test[selected_features]
    y_test = pd_input_test["label"]

    return X_train, y_train, X_val, y_val, X_test, y_test

In [9]:
# Prepare logging results
results = []

# Loop over each feature set and model
for k in feature_sets:
    X_train_k, y_train, X_val_k, y_val, X_test_k, y_test = select_top_k_features(
        pd_train, pd_test, feature_sets[k]
    )

    for model_name, model in models.items():
        # Train model
        model.fit(X_train_k, y_train)

        # Predictions
        train_pred = model.predict(X_train_k)
        val_pred = model.predict(X_val_k)
        test_pred = model.predict(X_test_k)

        # Calculate accuracy
        train_acc = accuracy_score(y_train, train_pred)
        val_acc = accuracy_score(y_val, val_pred)
        test_acc = accuracy_score(y_test, test_pred)

        # Calculate AUC (check if probability is available)
        if hasattr(model, "predict_proba"):
            train_prob = model.predict_proba(X_train_k)[:, 1]
            val_prob = model.predict_proba(X_val_k)[:, 1]
            test_prob = model.predict_proba(X_test_k)[:, 1]

            train_auc = roc_auc_score(y_train, train_prob)
            val_auc = roc_auc_score(y_val, val_prob)
            test_auc = roc_auc_score(y_test, test_prob)
        else:
            train_auc, val_auc, test_auc = None, None, None

        # Log the results
        results.append(
            {
                "top_K": k,
                "model": model_name,
                "train_auc": train_auc,
                "val_auc": val_auc,
                "test_auc": test_auc,
                "train_accuracy": train_acc,
                "val_accuracy": val_acc,
                "test_accuracy": test_acc,
            }
        )

        # Save the best model (you can modify the criterion for "best" as needed)
        if val_auc is not None and val_auc == max(
            [result["val_auc"] for result in results if result["val_auc"] is not None]
        ):
            best_model_path = f"models/best_model_k{k}_{model_name}.pkl"
            joblib.dump(model, best_model_path)



[LightGBM] [Info] Number of positive: 540, number of negative: 452
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000486 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 58
[LightGBM] [Info] Number of data points in the train set: 992, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.544355 -> initscore=0.177887
[LightGBM] [Info] Start training from score 0.177887




[LightGBM] [Info] Number of positive: 540, number of negative: 452
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000619 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 66
[LightGBM] [Info] Number of data points in the train set: 992, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.544355 -> initscore=0.177887
[LightGBM] [Info] Start training from score 0.177887


In [10]:
# Export the results as a CSV report
df_results = pd.DataFrame(results)
df_results.to_csv("models/model_performance_report.csv", index=False)

# Display the report
df_results.sort_values(by="test_auc", ascending=False)

Unnamed: 0,top_K,model,train_auc,val_auc,test_auc,train_accuracy,val_accuracy,test_accuracy
63,100,CatBoost,0.912361,0.701676,0.859659,0.822581,0.630522,0.779720
47,80,CatBoost,0.880449,0.703787,0.858580,0.773185,0.666667,0.779720
61,100,XGBoost,0.958934,0.668294,0.852628,0.889113,0.622490,0.776224
51,100,RandomForest,0.970952,0.659883,0.851619,0.905242,0.602410,0.776224
45,80,XGBoost,0.921649,0.692176,0.849460,0.828629,0.606426,0.758741
...,...,...,...,...,...,...,...,...
34,80,SGDClassifier,,,,0.654234,0.622490,0.702797
41,80,LinearSVC,,,,0.671371,0.626506,0.706294
49,100,RidgeClassifier,,,,0.661290,0.622490,0.730769
50,100,SGDClassifier,,,,0.633065,0.578313,0.660839
