In [12]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, roc_auc_score
import xgboost as xgb
from joblib import dump
from itertools import combinations
import time

In [2]:
pd_train = pd.read_csv("data/training_class.CSV")
pd_train["label"] = pd_train["Class"].apply(lambda x: 1 if x == "Hepatotoxicity" else 0)

In [3]:
pd_test = pd.read_csv("data/testing_class.CSV")
pd_test["label"] = pd_test["Class"].apply(lambda x: 1 if x == "Hepatotoxicity" else 0)

In [4]:
# train val split with sklearn
features = pd_train.columns[1:-1]
X_train, X_val, y_train, y_val = train_test_split(
    pd_train[features], pd_train["label"], test_size=0.2, random_state=42
)
X_test = pd_test[features]
y_test = pd_test["label"]

print("Train size: ", X_train.shape[0])
print("Val size: ", X_val.shape[0])
print("Test size: ", X_test.shape[0])

Train size:  992
Val size:  249
Test size:  286


In [5]:
# Check the actual column names in the dataset
actual_columns = X_train.columns

# Update the fingerprints dictionary with existing column names
feature_groups = {
    "FP": [col for col in actual_columns if col.startswith("FP")],
    "ExtFP": [col for col in actual_columns if col.startswith("ExtFP")],
    "EstateFP": [col for col in actual_columns if col.startswith("EstateFP")],
    "GraphFP": [col for col in actual_columns if col.startswith("GraphFP")],
    "MACCSFP": [col for col in actual_columns if col.startswith("MACCSFP")],
    "PubchemFP": [col for col in actual_columns if col.startswith("PubchemFP")],
    "SubFP": [col for col in actual_columns if col.startswith("SubFP")],
    "SubFPC": [col for col in actual_columns if col.startswith("SubFPC")],
    "KRFP": [col for col in actual_columns if col.startswith("KRFP")],
    "KRFPC": [col for col in actual_columns if col.startswith("KRFPC")],
    "AD2D": [col for col in actual_columns if col.startswith("AD2D")],
    "APC2D": [col for col in actual_columns if col.startswith("APC2D")],
}
len(feature_groups["FP"])

1024

In [6]:
# Create directories if they don't exist
def create_directories(iteration):
    model_dir = f"models/iteration_{iteration}"
    log_dir = f"logs/iteration_{iteration}"
    feat_imp_dir = f"feat_imp/iteration_{iteration}"

    os.makedirs(model_dir, exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(feat_imp_dir, exist_ok=True)

    return model_dir, log_dir, feat_imp_dir


# Prepare the dataset by dropping feature groups
def drop_feature_groups(X, feature_groups, drop_list):
    features_to_drop = [item for i in drop_list for item in feature_groups[i]]
    return X.drop(features_to_drop, axis=1)

In [15]:
# Train the XGBoost model and save the logs, model, and feature importance
def train_xgboost(X_train, y_train, X_val, y_val, X_test, y_test, drop_list_str):
    param_init = {
        "objective": "binary:logistic",
        "booster": "gbtree",
        "max_depth": 4,
        "n_estimators": 300,
        "learning_rate": 0.025,
        "subsample": 0.7,
        "colsample_bytree": 0.3,
        "colsample_bylevel": 0.5,
        # "silent": True,
        "n_jobs": 14,
        #
        "tree_method": "hist",
        "grow_policy": "lossguide",
        #
        "eval_metric": "auc",
        "early_stopping_rounds": 100,
    }

    param_fit = {
        "verbose": 100,
        "eval_set": [(X_train, y_train), (X_val, y_val), (X_test, y_test)],
    }

    # Train 5 times for each drop set
    for run in range(5):
        print(f"Iteration {run}")
        start_time = time.time()

        model_dir, log_dir, feat_imp_dir = create_directories(run)

        # Define the XGBoost model
        param_init["random_state"] = run + 42
        model = xgb.XGBClassifier(**param_init)

        # Train the model
        model.fit(X_train, y_train, **param_fit)

        # Make predictions
        y_train_pred = model.predict_proba(X_train)[:, 1]
        y_val_pred = model.predict_proba(X_val)[:, 1]
        y_test_pred = model.predict_proba(X_test)[:, 1]

        # Calculate log loss and AUC
        train_logloss = log_loss(y_train, y_train_pred)
        val_logloss = log_loss(y_val, y_val_pred)
        test_logloss = log_loss(y_test, y_test_pred)
        train_auc = roc_auc_score(y_train, y_train_pred)
        val_auc = roc_auc_score(y_val, y_val_pred)
        test_auc = roc_auc_score(y_test, y_test_pred)

        # Save model
        model_path = f"{model_dir}/xgboost_drop_{drop_list_str}.joblib"
        dump(model, model_path)

        # Save evaluation metrics
        log_path = f"{log_dir}/xgboost_drop_{drop_list_str}.txt"
        with open(log_path, "w") as f:
            f.write(f"Train Logloss: {train_logloss}\n")
            f.write(f"Validation Logloss: {val_logloss}\n")
            f.write(f"Test Logloss: {test_logloss}\n")
            f.write(f"Train AUC: {train_auc}\n")
            f.write(f"Validation AUC: {val_auc}\n")
            f.write(f"Test AUC: {test_auc}\n")

        # Save feature importance
        feat_imp_path = f"{feat_imp_dir}/xgboost_drop_{drop_list_str}.csv"
        feature_importances = pd.DataFrame(
            {"Feature": X_train.columns, "Importance": model.feature_importances_}
        )
        feature_importances.to_csv(feat_imp_path, index=False)

        # save time
        end_time = time.time()
        print(f"Time taken: {end_time - start_time:.2f} seconds")
        log_path = f"{log_dir}/running_time_{drop_list_str}.txt"
        with open(log_path, "a") as f:
            f.write(f"Time taken: {end_time - start_time:.2f} seconds\n")

In [16]:
# Run backward selection with XGBoost
# Feature group names and sizes
feature_group_names = list(feature_groups.keys())

max_num_groups_to_drop = int(len(feature_group_names) / 2)
for num_groups_to_drop in range(1, max_num_groups_to_drop):
    print(f"Number of groups to drop: {num_groups_to_drop}")

    # Get all combinations of feature groups to drop
    for drop_indices in combinations(
        range(len(feature_group_names)), num_groups_to_drop
    ):
        drop_list = [feature_group_names[i] for i in drop_indices]
        drop_list_str = "_".join(drop_list)
        print(f"Dropping: {drop_list_str}")

        # Drop the selected feature groups
        X_train_dropped = drop_feature_groups(X_train, feature_groups, drop_list)
        X_val_dropped = drop_feature_groups(X_val, feature_groups, drop_list)
        X_test_dropped = drop_feature_groups(X_test, feature_groups, drop_list)

        # Train and evaluate the model with the dropped feature groups
        train_xgboost(
            X_train_dropped,
            y_train,
            X_val_dropped,
            y_val,
            X_test_dropped,
            y_test,
            drop_list_str,
        )

[299]	validation_0-auc:0.98314	validation_1-auc:0.77352	validation_2-auc:0.90115
Time taken: 114.17 seconds
Iteration 4
[0]	validation_0-auc:0.71988	validation_1-auc:0.66054	validation_2-auc:0.57699
[100]	validation_0-auc:0.94215	validation_1-auc:0.77260	validation_2-auc:0.88395
[200]	validation_0-auc:0.97019	validation_1-auc:0.77787	validation_2-auc:0.89245
[299]	validation_0-auc:0.98249	validation_1-auc:0.77827	validation_2-auc:0.90017
Time taken: 121.43 seconds
Dropping: MACCSFP_KRFP
Iteration 0
[0]	validation_0-auc:0.70641	validation_1-auc:0.68578	validation_2-auc:0.64427
[100]	validation_0-auc:0.94160	validation_1-auc:0.78163	validation_2-auc:0.87929
