In [9]:
import os
import numpy as np
import pandas as pd
from openpyxl import load_workbook
from src.Utils.K_Fold import K_Fold
from src.data_loader import load_data
from src.model.train_model import get_X_y,train_model
from src.analysis.SHAP import shap_analysis
from objective_function import objective_stacking
from src.analysis.LIME import lime_sensitivity_analysis
from src.Optimiser.HOA.hoa_optimizer import hoa_optimizer
from src.Utils.balance_dataset_smote_enc import balance_with_smote_enc

In [10]:
# Workbook that holds the input data; it is also the target of the Excel
# writes performed later in this notebook.
DATA_PATH = r"D:\ML\B(SMOTE-ENC)#K-FOLD3#M(XGBC&RFC)#O(HOA)#A(SHAP[PLOTS])\data\data.xlsx"

# Read the dataset through the project's loader.
data = load_data(DATA_PATH)

# Name of the label column. The trailing space is part of the name.
target_column = "Stress Level "

# Index of the 'Gender' column within the feature set.
categorical_features = [1]

In [11]:
# Arguments for the project's SMOTE-ENC balancing helper, gathered in one place.
# NOTE(review): presumably the helper reads `file_path`, balances the classes of
# `target_col`, and writes the result to `output_path` - confirm in
# src.Utils.balance_dataset_smote_enc.
smote_enc_args = {
    "file_path": r"D:\ML\B(SMOTE-ENC)#K-FOLD3#M(XGBC&RFC)#O(HOA)#A(SHAP[PLOTS])\data\Dataset-10MO-Yahyavi (Student Stress Level).csv",
    "target_col": "Stress Level ",
    "categorical_cols": ["Gender"],
    "output_path": "balanced_health_data.csv",
}
balanced_df = balance_with_smote_enc(**smote_enc_args)

In [12]:

# Integer codes for the Gender labels: Male=0, Female=1, Other=2.
GENDER_CODES = {"Male": 0, "Female": 1, "Other": 2}

# Encode only while the column still holds labels. Re-running this cell on an
# already-encoded (numeric) column would otherwise map every code to NaN,
# because the integer codes are not keys of GENDER_CODES.
if not pd.api.types.is_numeric_dtype(balanced_df["Gender"]):
    balanced_df["Gender"] = balanced_df["Gender"].map(GENDER_CODES)

# Round the numeric columns to one decimal place before exporting.
balanced_df = balanced_df.round(1)

# Write the balanced data into the workbook. `if_sheet_exists="replace"` swaps
# out a sheet left by a previous run inside a single open/save of the file, so
# the workbook is never saved without the sheet if the write fails.
with pd.ExcelWriter(
    DATA_PATH, engine="openpyxl", mode="a", if_sheet_exists="replace"
) as writer:
    balanced_df.to_excel(writer, sheet_name="Data after SMOTE-ENC", index=False)


    

In [13]:
# Split the balanced frame into features (X) and the "Stress Level " target (y).
X, y = get_X_y(
    balanced_df,
    target_col="Stress Level ",
)

In [14]:
# Run 5-fold cross-validation. K_Fold returns six values: the train/test split
# it selected, the per-fold scores, and a combined DataFrame.
kfold_outputs = K_Fold(X, y, n_splits=5)
X_train, X_test, y_train, y_test, fold_scores_raw, combined_df = kfold_outputs

# Keep the per-fold scores as a DataFrame for later inspection.
K_Fold_Cross_Validation_Scores = pd.DataFrame(fold_scores_raw)



🔁 Fold 1 ------------------
  ADAboost → R2: -0.0960, RMSE: 9.9310

🔁 Fold 2 ------------------
  ADAboost → R2: 0.1961, RMSE: 5.4182

🔁 Fold 3 ------------------
  ADAboost → R2: 0.1265, RMSE: 4.8620

🔁 Fold 4 ------------------
  ADAboost → R2: 0.0799, RMSE: 6.6693

🔁 Fold 5 ------------------
  ADAboost → R2: -3.0267, RMSE: 9.5069

✅ K-Fold Cross-Validation completed.
Best fold index: 1, R2: 0.1961
✅ Combined DataFrame using original target column name:


In [15]:
# Save the combined K-Fold data to the workbook. `if_sheet_exists="replace"`
# swaps out a sheet left by a previous run inside a single open/save of the
# file, so the workbook is never saved without the sheet if the write fails.
with pd.ExcelWriter(
    DATA_PATH, engine="openpyxl", mode="a", if_sheet_exists="replace"
) as writer:
    combined_df.to_excel(writer, sheet_name="DATA after K-Fold", index=False)

In [16]:

# Baseline: train on the selected split without passing tuned hyper-parameters.
singleModel_result = train_model(
    X_train,
    y_train,
    X_test,
    y_test,
)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [18]:
# Search space for the two tuned values: (n_estimators, learning_rate).
hoa_lower_bounds = [50, 0.01]
hoa_upper_bounds = [300, 1.0]
hoa_dim = 2  # number of tuned values
hoa_n_agents = 1
hoa_max_iter = 3

# Arguments are passed positionally, in the order the optimiser expects:
# objective, lower bounds, upper bounds, dim, n_agents, max_iter, then the data.
hoa_result = hoa_optimizer(
    objective_stacking,
    hoa_lower_bounds,
    hoa_upper_bounds,
    hoa_dim,
    hoa_n_agents,
    hoa_max_iter,
    X_train,
    y_train,
    X_test,
    y_test,
)
best_pos, best_RMSE, convergence = hoa_result

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


🥾 Iter 1/3 - Best RMSE: 4.17047


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


🥾 Iter 2/3 - Best RMSE: 4.17047
🥾 Iter 3/3 - Best RMSE: 4.12311


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [20]:
# Retrain on the same split, this time passing the position found by HOA.
hoa_train_args = (X_train, y_train, X_test, y_test, best_pos)
HOA_model_result = train_model(*hoa_train_args)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [21]:
# SHAP sensitivity analysis on the HOA-tuned model.
# NOTE(review): assumes train_model returns a dict with a "model" entry - confirm.
hoa_model = HOA_model_result["model"]

sensitivity_df_shap, shap_values = shap_analysis(
    model=hoa_model,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    save_path=DATA_PATH,  # same Excel workbook as the other result sheets
    sheet_name="SHAP_Sensitivity",
)



  0%|          | 0/28 [00:00<?, ?it/s]



In [22]:
# LIME sensitivity analysis on the HOA-tuned model for test sample 5.
lime_args = {
    "model": HOA_model_result["model"],
    "X_train": X_train,
    "y_train": y_train,
    "X_test": X_test,
    "y_test": y_test,
    "sample_index": 5,
    "epsilon": 0.05,
}
sensitivity_LIME = lime_sensitivity_analysis(**lime_args)





📊 LIME Sensitivity for each feature:
Heart Rate (bpm) > 90.00: 0.0324
Overall Health Score > 93.00: 0.2055
Age <= 16.00: 0.0209
1.95 < Hydration Level (liters) <= 2.40: 0.0892
36.65 < Body Temperature (°C) <= 37.00: 0.1059
4.40 < Physical Activity Level (METs) <= 5.80: 0.0413
Sleep Duration (hours) <= 5.88: 0.1630
0.00 < Gender <= 1.00: 0.0633
Blood Oxygen Level (%) > 98.93: 0.0475


In [23]:





# Report the best position and RMSE returned by the HOA search.
hoa_summary = (
    ("\nBest AdaBoost Params:", best_pos),
    ("Best RMSE:", best_RMSE),
)
for label, value in hoa_summary:
    print(label, value)


Best AdaBoost Params: [171.726028     0.58675449]
Best RMSE: 4.123105625617661
