In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
from save_and_compare_results import *
import shap

In [None]:
# Load the two input matrices from pickles in the working directory.
# Presumably X holds the per-strain protein features and Y the target — confirm
# against the script that produced these files.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
features_path = "X_matrix_proteins_nb.pkl"
targets_path = "Y_matrix_proteins_nb.pkl"
X = pd.read_pickle(features_path)
Y = pd.read_pickle(targets_path)

In [None]:
# Preprocess the data.
# X and Y are rebound in place, so the drops use errors="ignore": once
# "Yeast_ID" has been removed, re-running this cell is a no-op instead of a
# KeyError.
# NOTE(review): rows of X and Y are assumed to be in the same order (the
# identifier column is dropped without being compared) — confirm upstream.
X = X.drop(columns=["Yeast_ID"], errors="ignore").fillna(0)  # missing features -> 0

# Drop the identifier once, then impute missing targets with the column mean.
Y = Y.drop(columns=["Yeast_ID"], errors="ignore")
Y = Y.fillna(Y.mean())

In [None]:
# Search space for GridSearchCV: hyperparameter name -> candidate values.
# 3 * 3 * 3 * 3 * 3 * 3 * 2 = 1458 combinations; with cv=5 below that is
# 7290 model fits before the final refit, so expect a long run.
param_grid = dict(
    learning_rate=[0.01, 0.1, 0.2],      # Learning rate
    n_estimators=[100, 300, 500],        # Number of boosting rounds
    max_depth=[3, 6, 10],                # Maximum tree depth
    subsample=[0.6, 0.8, 1.0],           # Subsample ratio of training data
    colsample_bytree=[0.6, 0.8, 1.0],    # Subsample ratio of features
    reg_alpha=[0, 0.1, 1],               # L1 regularization term
    reg_lambda=[1, 10],                  # L2 regularization term
)

# Base regressor: squared-error objective, fixed seed, library logging silenced
xgb = XGBRegressor(objective="reg:squarederror", random_state=42, verbosity=0)

# Exhaustive search over param_grid, scored by R² with 5-fold cross-validation,
# using all available cores (n_jobs=-1) and per-fit progress output (verbose=2)
grid_search = GridSearchCV(
    xgb,
    param_grid,
    scoring="r2",
    cv=5,
    n_jobs=-1,
    verbose=2,
)

In [None]:
# Run the hyperparameter search. The target frame is flattened to a 1-D array
# first; this assumes Y has a single column — TODO confirm.
print("Training the XGBoost model with hyperparameter tuning...")
y_flat = Y.to_numpy().ravel()
grid_search.fit(X, y_flat)

In [None]:
# Pull the winning hyperparameters and the estimator fitted with them out of
# the completed search.
best_params = grid_search.best_params_
best_xgb = grid_search.best_estimator_
print("\nBest hyperparameters:", best_params)

In [None]:
# Evaluate the tuned model.
# best_xgb was fitted on the same X it predicts here, so r2/mse measure fit on
# the training data and will look better than performance on unseen samples.
y_pred = best_xgb.predict(X)
r2 = r2_score(Y, y_pred)
mse = mean_squared_error(Y, y_pred)
print(f"\nIn-sample (training) R² Score: {r2:.4f}, Mean Squared Error: {mse:.4f}")

# Held-out estimate: mean R² across the CV folds for the selected parameters.
# Still slightly optimistic, since it is the score the parameters were chosen on.
cv_r2 = grid_search.best_score_
print(f"Cross-validated R² Score (mean over CV folds): {cv_r2:.4f}")

# Model feature importance

In [None]:
# Persist the importances reported by the fitted XGBoost model via the
# project helper save_feature_importance.
print("Saving XBG feature importances...")
save_feature_importance(
    features=X.columns,
    importance_scores=best_xgb.feature_importances_,
    method="model",
    model_name="XGBoost",
)

# Build a (Feature, Importance) table, rank it from most to least important,
# and keep the first ten rows for display and plotting.
print("Displaying top 10 features...")
importance_table = pd.DataFrame(
    {"Feature": X.columns, "Importance": best_xgb.feature_importances_}
)
feature_importances_df = importance_table.sort_values("Importance", ascending=False)

top_features = feature_importances_df.iloc[:10]

print("\nTop 10 Features Impacting YPD Doubling Time:")
print(top_features)


In [None]:
# Horizontal bar chart of the ten highest-ranked features.
# The x-axis shows XGBoost feature importance scores (not model coefficients).
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(top_features["Feature"], top_features["Importance"], color="skyblue")
ax.set_xlabel("Feature Importance")
ax.set_ylabel("Feature")
ax.set_title("Top 10 Features (XGB)")
ax.invert_yaxis()  # top_features is sorted descending, so this puts the largest bar on top
plt.show()

# SHAP feature importance

In [None]:
# SHAP analysis: build an explainer for the tuned model (X is passed as the
# explainer's second argument) and explain every row of X.
print("Calculating SHAP values...")
explainer = shap.Explainer(best_xgb, X)
shap_values = explainer(X)

# One score per feature: absolute SHAP values averaged along axis 0.
abs_shap = np.abs(shap_values.values)
shap_mean_importance = abs_shap.mean(axis=0)

# Persist the SHAP-based scores through the same project helper used for the
# model's own importances.
print("Saving SHAP importance scores for XGBoost...")
save_feature_importance(
    features=X.columns,
    importance_scores=shap_mean_importance,
    method="SHAP",
    model_name="XGBoost",
)

# %%
# SHAP plots: a bar-type summary first, then the default summary plot.
print("Generating SHAP plots...")
shap.summary_plot(shap_values, X, plot_type="bar")
shap.summary_plot(shap_values, X)