In [17]:
import os
import numpy as np
import pandas as pd
from scipy.stats import probplot
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, BaggingRegressor, ExtraTreesRegressor
import xgboost as xgb
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
#from pandas_profiling import ProfileReport
from pathlib import Path
from sklearn.metrics import mean_squared_error
from scipy import stats
from yellowbrick.model_selection import FeatureImportances
import shap
import joblib

import optuna
import mlflow
%matplotlib inline

from **4a_regression_train.ipynb** we know: 

### ** Top 5  Models **

| Rank | Model | Mean Score | Standard Deviation |
|:---|:---|:---|:---|
| 1 | **Gradient Boosting Regression** | 3871.92 | 45.50 |
| 2 | **Random Forest Regression** | 4026.38 | 77.62 |
| 3 | **Extra Trees Regression** | 4090.38 | 61.13 |
| 4 | **XG Boost** | 4130.37 | 37.57 |
| 5 | **Bagging Regression** | 4207.16 | 109.43 |

In [18]:
train_original = pd.read_csv(r'C:\Users\user\Desktop\ML & DL projects\2 Stage loan system\data\feature_engineered\train_regression_data.csv')
test_original = pd.read_csv(r'C:\Users\user\Desktop\ML & DL projects\2 Stage loan system\data\feature_engineered\test_regression_data.csv')

In [19]:
X_train = train_original.drop(columns='Loan Sanction Amount (USD)', axis = 1)
y_train = train_original['Loan Sanction Amount (USD)']


X_test = test_original.drop(columns='Loan Sanction Amount (USD)', axis = 1)
y_test = test_original['Loan Sanction Amount (USD)']

### Fine Tuning the models

In [20]:
def objective(trial):
    # 1. Let Optuna pick which model to tune in this trial
    model_type = trial.suggest_categorical("model_type", ["GradientBoosting", "RandomForest", "ExtraTrees", "XGBoost"])
    
    with mlflow.start_run(nested=True):
        mlflow.log_param("model_type", model_type)
        
        if model_type == "GradientBoosting":
            params = {
                "n_estimators": trial.suggest_int("gb_n_estimators", 100, 1000),
                "learning_rate": trial.suggest_float("gb_lr", 0.01, 0.2, log=True),
                "max_depth": trial.suggest_int("gb_max_depth", 3, 10),
                "subsample": trial.suggest_float("gb_subsample", 0.7, 1.0),
                "random_state": 42
            }
            model = GradientBoostingRegressor(**params)

        elif model_type == "RandomForest":
            params = {
                "n_estimators": trial.suggest_int("rf_n_estimators", 100, 1000),
                "max_depth": trial.suggest_int("rf_max_depth", 10, 50),
                "min_samples_split": trial.suggest_int("rf_split", 2, 10),
                "max_features": trial.suggest_categorical("rf_feat", ["sqrt", "log2", None]),
                "random_state": 42,
                "n_jobs": -1
            }
            model = RandomForestRegressor(**params)

        elif model_type == "ExtraTrees":
            params = {
                "n_estimators": trial.suggest_int("et_n_estimators", 100, 1000),
                "max_depth": trial.suggest_int("et_max_depth", 10, 50),
                "min_samples_split": trial.suggest_int("et_split", 2, 10),
                "random_state": 42,
                "n_jobs": -1
            }
            model = ExtraTreesRegressor(**params)

        elif model_type == "XGBoost":
            params = {
                "n_estimators": trial.suggest_int("xgb_n_estimators", 100, 1000),
                "max_depth": trial.suggest_int("xgb_max_depth", 3, 10),
                "learning_rate": trial.suggest_float("xgb_lr", 0.01, 0.2, log=True),
                "lambda": trial.suggest_float("xgb_lambda", 1e-3, 10.0, log=True),
                "alpha": trial.suggest_float("xgb_alpha", 1e-3, 10.0, log=True),
                "random_state": 42,
                "tree_method": "hist"
            }
            model = xgb.XGBRegressor(**params)

        # 2. Train and Evaluate
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        
        rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))
        mae = float(mean_absolute_error(y_test, y_pred))
        r2 = float(r2_score(y_test, y_pred))

        # 3. Log results
        mlflow.log_params(params)
        mlflow.log_metrics({"rmse": rmse, "mae": mae, "r2": r2})

        return rmse

In [21]:
# 1. Set the tracking URI to your specific project folder
mlflow.set_tracking_uri(r"file:///C:\Users\user\Desktop\ML & DL projects\2 Stage loan system\mlruns")

# 2. Rename the experiment to reflect the Top 4 model competition
mlflow.set_experiment("top_4_models_optimization")

# 3. Create and run the study
# We use 'minimize' because we are returning RMSE from the objective function
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=30)  # Increased trials to give each model a fair chance

# 4. Results Interpretation
print("="*30)
print("OPTIMIZATION COMPLETE")
print("="*30)
print(f"Best Model Type: {study.best_params['model_type']}")
print(f"Best RMSE Score: {study.best_value:.4f}")
print("-" * 30)
print("Best Hyperparameters:")
for key, value in study.best_params.items():
    print(f"  {key}: {value}")

[I 2026-02-02 01:39:09,780] A new study created in memory with name: no-name-b7f2c6a6-2d6f-42f9-b43e-42d2e9838e0d
[I 2026-02-02 01:39:32,347] Trial 0 finished with value: 10757.801486288949 and parameters: {'model_type': 'GradientBoosting', 'gb_n_estimators': 810, 'gb_lr': 0.08587756561441867, 'gb_max_depth': 5, 'gb_subsample': 0.8319273719964079}. Best is trial 0 with value: 10757.801486288949.
[I 2026-02-02 01:39:32,646] Trial 1 finished with value: 10948.204509609302 and parameters: {'model_type': 'ExtraTrees', 'et_n_estimators': 153, 'et_max_depth': 10, 'et_split': 7}. Best is trial 0 with value: 10757.801486288949.
[I 2026-02-02 01:39:53,746] Trial 2 finished with value: 10483.918086285425 and parameters: {'model_type': 'GradientBoosting', 'gb_n_estimators': 926, 'gb_lr': 0.03476662207269958, 'gb_max_depth': 4, 'gb_subsample': 0.8356773422314203}. Best is trial 2 with value: 10483.918086285425.
[I 2026-02-02 01:39:54,732] Trial 3 finished with value: 10787.188709371883 and paramet

OPTIMIZATION COMPLETE
Best Model Type: GradientBoosting
Best RMSE Score: 10299.3614
------------------------------
Best Hyperparameters:
  model_type: GradientBoosting
  gb_n_estimators: 630
  gb_lr: 0.016833112938731944
  gb_max_depth: 4
  gb_subsample: 0.9916956598760839


In [22]:
# 1. Retrieve best parameters
best_params = study.best_params.copy()
model_type = best_params.pop("model_type")

# 2. Dynamic Initialization with Name Mapping
match model_type:
    case "GradientBoosting":
        # Map Optuna keys to official sklearn names
        mapping = {"gb_lr": "learning_rate", "gb_n_estimators": "n_estimators", 
                   "gb_max_depth": "max_depth", "gb_subsample": "subsample"}
        clean_params = {mapping.get(k, k): v for k, v in best_params.items()}
        best_model = GradientBoostingRegressor(**clean_params)
        
    case "RandomForest":
        mapping = {"rf_n_estimators": "n_estimators", "rf_max_depth": "max_depth", 
                   "rf_split": "min_samples_split", "rf_feat": "max_features"}
        clean_params = {mapping.get(k, k): v for k, v in best_params.items()}
        best_model = RandomForestRegressor(**clean_params, n_jobs=-1)
        
    case "ExtraTrees":
        mapping = {"et_n_estimators": "n_estimators", "et_max_depth": "max_depth", 
                   "et_split": "min_samples_split"}
        clean_params = {mapping.get(k, k): v for k, v in best_params.items()}
        best_model = ExtraTreesRegressor(**clean_params, n_jobs=-1)
        
    case "XGBoost":
        mapping = {"xgb_lr": "learning_rate", "xgb_n_estimators": "n_estimators", 
                   "xgb_max_depth": "max_depth", "xgb_lambda": "reg_lambda", "xgb_alpha": "reg_alpha"}
        clean_params = {mapping.get(k, k): v for k, v in best_params.items()}
        best_model = xgb.XGBRegressor(**clean_params)
        
    case _:
        raise ValueError(f"Unknown model type: {model_type}")

# 3. Train and 4. Evaluate (rest of your code remains the same)
best_model.fit(X_train, y_train)
# 4. Evaluate on the evaluation set
y_pred = best_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"Final Tuned Model ({model_type}) Performance:")
print(f"MAE: {mae:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"R²: {r2:.4f}")

# 5. Log the final run to MLflow
with mlflow.start_run(run_name=f"best_{model_type.lower()}_final"):
    mlflow.log_params(clean_params)
    mlflow.log_metrics({"rmse": rmse, "mae": mae, "r2": r2})
    
    # Use the appropriate flavor for logging
    if model_type == "XGBoost":
        mlflow.xgboost.log_model(best_model, artifact_path="model")
    else:
        mlflow.sklearn.log_model(best_model, artifact_path="model")




Final Tuned Model (GradientBoosting) Performance:
MAE: 5836.3319
RMSE: 10311.0137
R²: 0.9166


In [23]:
# 6. Save the model locally as a backup
joblib.dump(best_model, f"best_{model_type.lower()}_model.pkl")
print(f"Model saved locally as best_{model_type.lower()}_model.pkl")

Model saved locally as best_gradientboosting_model.pkl
