In [None]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
!pip install -q pandas numpy matplotlib seaborn scikit-learn imbalanced-learn xgboost lightgbm catboost shap optuna joblib


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [7]:
import pandas as pd
import numpy as np
import joblib
import shap
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from google.colab import drive

drive.mount('/content/drive')
DATA_PATH = "/content/drive/MyDrive/diabetes_data"

data = np.load(f"{DATA_PATH}/train_test_split.npz")
X_train, y_train = data["X_train"], data["y_train"]

X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

with open(f"{DATA_PATH}/feature_names.pkl", "rb") as f:
    feature_names = joblib.load(f)

model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
param_grid = {
    "n_estimators": [100, 200],
    "max_depth": [4, 6],
    "learning_rate": [0.05, 0.1]
}

grid_search = GridSearchCV(model, param_grid, cv=5, scoring='r2', n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)

y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
print(f"Test MSE: {mse:.4f}, Test R²: {r2:.4f}")

booster = best_model.get_booster()
booster.save_model(f"{DATA_PATH}/diabetes_model.xgb")

explainer = shap.Explainer(best_model)
shap_values = explainer(X_test)

plt.figure(figsize=(10, 6))
shap.summary_plot(shap_values, features=X_test, feature_names=feature_names, show=False)
plt.tight_layout()
plt.savefig(f"{DATA_PATH}/shap_summary.png")
plt.close()

waterfall_fig = shap.plots.waterfall(shap_values[0], show=False)
waterfall_fig.figure.savefig(f"{DATA_PATH}/shap_case_detail.png")
plt.close()

joblib.dump(explainer, f"{DATA_PATH}/shap_explainer.joblib")

residuals = y_test - y_pred
plt.figure(figsize=(8,5))
sns.scatterplot(x=y_pred, y=residuals)
plt.axhline(0, color='red', linestyle='--')
plt.title('Residuals vs Predicted')
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
plt.tight_layout()
plt.savefig(f"{DATA_PATH}/residuals_plot.png", dpi=300)
plt.close()

plt.figure(figsize=(8,5))
sns.scatterplot(x=y_test, y=y_pred)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
plt.title('Predicted vs Actual Values')
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.tight_layout()
plt.savefig(f"{DATA_PATH}/predicted_vs_actual_plot.png", dpi=300)
plt.close()

importances = best_model.feature_importances_
feat_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
feat_importance_df.sort_values(by='Importance', ascending=False, inplace=True)

plt.figure(figsize=(10,6))
sns.barplot(data=feat_importance_df, x='Importance', y='Feature', palette='viridis')
plt.title('Feature Importances')
plt.tight_layout()
plt.savefig(f"{DATA_PATH}/feature_importance_plot.png", dpi=300)
plt.close()

report_html = f"""
<h1>Diabetes Progression Model Report</h1>
<h2>Model: XGBoost Regressor</h2>
<ul>
  <li><b>Best Hyperparameters:</b> {grid_search.best_params_}</li>
</ul>
<h2>Metrics</h2>
<ul>
  <li><b>Mean Squared Error (MSE):</b> {mse:.4f}</li>
  <li><b>Root Mean Squared Error (RMSE):</b> {rmse:.4f}</li>
  <li><b>Mean Absolute Error (MAE):</b> {mae:.4f}</li>
  <li><b>R² Score:</b> {r2:.4f}</li>
</ul>
<h2>Important Figures</h2>
<ul>
  <li><a href="./shap_summary.png">SHAP Summary Plot</a></li>
  <li><a href="./shap_case_detail.png">SHAP Waterfall Example</a></li>
  <li><a href="./feature_importance_plot.png">Feature Importance Plot</a></li>
  <li><a href="./residuals_plot.png">Residuals vs Predicted Plot</a></li>
  <li><a href="./predicted_vs_actual_plot.png">Predicted vs Actual Plot</a></li>
</ul>
"""

with open(f"{DATA_PATH}/model_report.html", "w") as f:
    f.write(report_html)

print("Model + SHAP + Plots saved to Drive.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Best Parameters: {'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 200}
Test MSE: 0.0435, Test R²: 0.9118



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(data=feat_importance_df, x='Importance', y='Feature', palette='viridis')


Model + SHAP + Plots saved to Drive.
