# 08: LightGBM Model for Severe Traffic Accident Forecasting

**Objective:** Implement, tune, and evaluate a LightGBM classifier to predict severe traffic accidents on EDSA.

## 1. Setup: Imports and Configuration

In [3]:
# Standard Libraries
import pandas as pd
import numpy as np
import joblib
import json
from datetime import datetime
import time

# Scikit-learn
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, roc_auc_score

# LightGBM
import lightgbm as lgb

# Imbalanced-learn (if SMOTE or other techniques are used)
# from imblearn.over_sampling import SMOTE
# from imblearn.pipeline import Pipeline as ImbPipeline # To avoid conflict with sklearn.pipeline

# Project-specific utilities
import sys
sys.path.append('../src') # Add src directory to Python path
from modeling_utils import compute_classification_metrics, append_performance_record, init_performance_excel

# Visualization (optional, for EDA within notebook if needed)
import matplotlib.pyplot as plt
import seaborn as sns

# Configure Pandas display options for better readability
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.float_format', lambda x: '%.5f' % x)

# Configure Matplotlib/Seaborn for inline plotting
%matplotlib inline
sns.set_style('whitegrid')

### 1.1 Constants and Paths

In [4]:
PROCESSED_DATA_PATH = '../data/processed/preprocessed_data.csv'
MODEL_SAVE_PATH = '../models/lightgbm_best_model.joblib'
PERFORMANCE_EXCEL_PATH = '../reports/model_performance_summary.xlsx'
MODEL_NAME = 'LightGBM'

RANDOM_STATE = 42 # For reproducibility
CV_FOLDS = 5 # Number of cross-validation folds

### 1.2 Initialize Performance Excel (if it doesn't exist)

In [5]:
try:
    pd.read_excel(PERFORMANCE_EXCEL_PATH)
    print(f"Excel log file '{PERFORMANCE_EXCEL_PATH}' already exists.")
except FileNotFoundError:
    init_performance_excel(PERFORMANCE_EXCEL_PATH)
    print(f"Initialized Excel log file at '{PERFORMANCE_EXCEL_PATH}'.")

Excel log file '../reports/model_performance_summary.xlsx' already exists.


## 2. Data Loading and Preparation

### 2.1 Load Data

In [6]:
df = pd.read_csv(PROCESSED_DATA_PATH)
print(f"Data loaded successfully from {PROCESSED_DATA_PATH}")
print(f"Shape of the dataframe: {df.shape}")
df.head()

Data loaded successfully from ../data/processed/preprocessed_data.csv
Shape of the dataframe: (22072, 42)


Unnamed: 0,SEVERITY,Y,X,DATETIME_UTC,hour,day_of_week,day,month,year,is_weekend,season,ROAD_EDSA,MAIN_CAUSE_Human error,MAIN_CAUSE_Other (see description),MAIN_CAUSE_Road defect,MAIN_CAUSE_Unknown,MAIN_CAUSE_Vehicle defect,COLLISION_TYPE_Angle Impact,COLLISION_TYPE_Head-On,COLLISION_TYPE_Hit Object,COLLISION_TYPE_Multiple,COLLISION_TYPE_No Collision Stated,COLLISION_TYPE_Rear-End,COLLISION_TYPE_Self-Accident,COLLISION_TYPE_Side Swipe,WEATHER_Unknown,WEATHER_clear-day,WEATHER_clear-night,WEATHER_cloudy,WEATHER_fog,WEATHER_partly-cloudy-day,WEATHER_partly-cloudy-night,WEATHER_rain,LIGHT_Unknown,LIGHT_day,LIGHT_dusk,LIGHT_night,REPORTING_AGENCY_MMDA Metrobase,REPORTING_AGENCY_MMDA Road Safety Unit,REPORTING_AGENCY_Other,desc_word_count,desc_contains_collision
0,Property,14.65771,121.01979,2014-06-30 05:40:00,5,0,30,6,2014,False,Summer,True,True,False,False,False,False,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,True,False,30,1
1,Property,14.65771,121.01979,2014-03-17 01:00:00,1,0,17,3,2014,False,Spring,True,True,False,False,False,False,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,True,False,38,1
2,Injury,14.65771,121.01979,2013-11-26 02:00:00,2,1,26,11,2013,False,Fall,True,True,False,False,False,False,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,True,False,30,1
3,Property,14.65771,121.01979,2013-10-26 13:00:00,13,5,26,10,2013,True,Fall,True,True,False,False,False,False,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,True,False,31,1
4,Injury,14.65771,121.01966,2013-06-26 23:30:00,23,2,26,6,2013,False,Summer,True,True,False,False,False,False,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,True,False,32,1


### 2.2 Feature and Target Split

In [7]:
TARGET_COLUMN = 'SEVERITY' # Assuming this is the target column name
X = df.drop(columns=[TARGET_COLUMN])
y = df[TARGET_COLUMN]

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Target distribution:\n{y.value_counts(normalize=True)}")

Features shape: (22072, 41)
Target shape: (22072,)
Target distribution:
SEVERITY
Property   0.93127
Injury     0.06773
Fatal      0.00100
Name: proportion, dtype: float64


### 2.3 Train-Test Split

In [8]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=RANDOM_STATE, 
    stratify=y # Ensure stratification for imbalanced datasets
)

print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")
print(f"Training target distribution:\n{y_train.value_counts(normalize=True)}")
print(f"Test target distribution:\n{y_test.value_counts(normalize=True)}")

X_train shape: (17657, 41), y_train shape: (17657,)
X_test shape: (4415, 41), y_test shape: (4415,)
Training target distribution:
SEVERITY
Property   0.93125
Injury     0.06774
Fatal      0.00102
Name: proportion, dtype: float64
Test target distribution:
SEVERITY
Property   0.93137
Injury     0.06772
Fatal      0.00091
Name: proportion, dtype: float64


### 2.4 Numerical Feature Scaling

In [9]:
# Identify numerical features (assuming all non-object columns are numerical and need scaling)
numerical_features = X_train.select_dtypes(include=np.number).columns.tolist()

scaler = StandardScaler() # Or MinMaxScaler()

# Create copies of the dataframes to store the scaled features
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()

# Fit the scaler on the training data's numerical features and transform both training and test set's numerical features
if numerical_features:
    X_train_scaled[numerical_features] = scaler.fit_transform(X_train[numerical_features])
    X_test_scaled[numerical_features] = scaler.transform(X_test[numerical_features])
    print("\nScaling complete.")
    print("X_train_scaled (numerical features) head:")
    display(X_train_scaled[numerical_features].head())
else:
    print("\nNo numerical features identified for scaling.")


Scaling complete.
X_train_scaled (numerical features) head:


Unnamed: 0,Y,X,hour,day_of_week,day,month,year,desc_word_count,desc_contains_collision
7948,-0.97779,0.21942,-0.8084,1.64366,-1.19811,-0.70811,-1.75055,0.69034,-0.73554
17297,1.27048,-0.00204,-0.65935,-0.44388,0.86189,1.10753,-0.18682,-0.61884,-0.73554
3678,-1.25551,-0.47388,0.98016,-1.48766,-0.28256,1.10753,0.20411,-0.18245,-0.73554
6054,-0.6812,0.71897,0.38397,-0.44388,1.663,-1.01072,0.98597,0.03575,-0.73554
13923,1.31281,-1.19265,0.23493,0.59989,1.663,1.10753,0.59504,0.90853,1.35955


### 2.5 Class Imbalance Handling (if necessary)

In [10]:
# Class imbalance was identified as a potential issue.
# Strategy chosen in '04_modeling_pipeline_setup.ipynb' should be applied here if it involves resampling.
# LightGBM also has built-in parameters like `is_unbalance=True` or `scale_pos_weight` that can be effective.

# Option 1: Using SMOTE (if chosen as the strategy)
# Ensure 'imblearn' is installed: pip install imbalanced-learn
# from imblearn.over_sampling import SMOTE

# smote = SMOTE(random_state=RANDOM_STATE)
# X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

# print("Original training dataset shape:", y_train.value_counts())
# print("Resampled training dataset shape (SMOTE):", y_train_resampled.value_counts())

# # If using SMOTE, subsequent training should use X_train_resampled and y_train_resampled
# X_train_final = X_train_resampled
# y_train_final = y_train_resampled

# Option 2: Relying on LightGBM's built-in handling (e.g., scale_pos_weight or is_unbalance=True)
# In this case, we use the scaled training data directly.
# The `scale_pos_weight` will be calculated and used during model instantiation if this path is chosen.
# Or, `is_unbalance=True` can be set in LGBMClassifier.

X_train_final = X_train_scaled.copy() # Make a copy to avoid modifying the original scaled data
y_train_final = y_train.copy()

# Calculate scale_pos_weight for later use if LightGBM's parameter is preferred
# scale_pos_weight = number_of_negative_samples / number_of_positive_samples
if len(y_train_final.value_counts()) > 1:
    neg_count = y_train_final.value_counts().get(0, 0) # Get count for class 0, default to 0 if not present
    pos_count = y_train_final.value_counts().get(1, 0) # Get count for class 1, default to 0 if not present
    if pos_count > 0:
        calculated_scale_pos_weight = neg_count / pos_count
        print(f"Calculated scale_pos_weight: {calculated_scale_pos_weight:.2f} (Negative: {neg_count}, Positive: {pos_count})")
    else:
        print("Positive class count is zero, cannot calculate scale_pos_weight.")
        calculated_scale_pos_weight = 1 # Default or handle as appropriate
else:
    print("Target variable has only one class. Check data or stratification.")
    calculated_scale_pos_weight = 1 # Default or handle as appropriate

# For now, we'll proceed with X_train_final and y_train_final.
# The choice of applying SMOTE vs. using LGBM parameters will be reflected in the model training step.
print(f"\nUsing X_train_final shape: {X_train_final.shape}, y_train_final shape: {y_train_final.shape}")
print(f"y_train_final distribution:\n{y_train_final.value_counts(normalize=True)}")

Calculated scale_pos_weight: 13.75 (Negative: 16443, Positive: 1196)

Using X_train_final shape: (17657, 41), y_train_final shape: (17657,)
y_train_final distribution:
SEVERITY
Property   0.93125
Injury     0.06774
Fatal      0.00102
Name: proportion, dtype: float64


  neg_count = y_train_final.value_counts().get(0, 0) # Get count for class 0, default to 0 if not present
  pos_count = y_train_final.value_counts().get(1, 0) # Get count for class 1, default to 0 if not present


## 3. Initial LightGBM Model (Baseline)

In [11]:
print("Starting baseline LightGBM model training...")

# Ensure X_train_final and y_train_final are defined
if "X_train_final" not in globals():
    # This implies X_train_scaled should be defined earlier
    X_train_final = X_train_scaled.copy()
if "y_train_final" not in globals():
    # This implies y_train should be defined earlier
    y_train_final = y_train.copy()

start_time = time.time()

lgbm_baseline = lgb.LGBMClassifier(random_state=RANDOM_STATE)

columns_to_drop = ["DATETIME_UTC", "season"]
X_train_final_numeric = X_train_final.drop(
    columns=columns_to_drop, errors="ignore"
)
X_test_scaled_numeric = X_test_scaled.drop(
    columns=columns_to_drop, errors="ignore"
)

lgbm_baseline.fit(X_train_final_numeric, y_train_final)

# Make predictions using only the numeric features
y_pred_train_baseline = lgbm_baseline.predict(X_train_final_numeric)
y_pred_test_baseline = lgbm_baseline.predict(X_test_scaled_numeric)

# For ROC AUC, pass the full probability matrix for multiclass
y_prob_train_baseline = lgbm_baseline.predict_proba(X_train_final_numeric)
y_prob_test_baseline = lgbm_baseline.predict_proba(X_test_scaled_numeric)

training_time_baseline = time.time() - start_time
print(
    f"Baseline model training completed in {training_time_baseline:.2f} seconds."
)

# Evaluate performance
print("\nBaseline Model Performance:")
# This assumes compute_classification_metrics is now the updated version
train_metrics_baseline = compute_classification_metrics(
    y_train_final, y_pred_train_baseline, y_prob_train_baseline
)
print("Training Metrics (Baseline):")
for metric, value in train_metrics_baseline.items():
    print(f"  {metric}: {value:.4f}")

test_metrics_baseline = compute_classification_metrics(
    y_test, y_pred_test_baseline, y_prob_test_baseline
)
print("\nTest Metrics (Baseline):")
for metric, value in test_metrics_baseline.items():
    print(f"  {metric}: {value:.4f}")

# Log to Excel
baseline_record = {
    "Model_Name": f"{MODEL_NAME}_Baseline",
    "Timestamp": datetime.utcnow().isoformat(),
    "Hyperparameter_Set_Tried": "Default",
    "CV_Score_for_Set": None,  # No CV for baseline
    "Selected_Final_Hyperparameters": json.dumps(lgbm_baseline.get_params()),
    "Training_Time_Seconds": training_time_baseline,
    "Train_Precision": train_metrics_baseline.get("Precision"),
    "Train_Recall": train_metrics_baseline.get("Recall"),
    "Train_F1": train_metrics_baseline.get("F1"),
    "Train_ROC_AUC": train_metrics_baseline.get("ROC_AUC"),
    "Test_Precision": test_metrics_baseline.get("Precision"),
    "Test_Recall": test_metrics_baseline.get("Recall"),
    "Test_F1": test_metrics_baseline.get("F1"),
    "Test_ROC_AUC": test_metrics_baseline.get("ROC_AUC"),
    "Class_Imbalance_Strategy": "Default params (is_unbalance=False, no scale_pos_weight unless manually set above)",
    "Notes": "Initial baseline model without hyperparameter tuning.",
}

# append_performance_record(PERFORMANCE_EXCEL_PATH, baseline_record) # Uncomment when ready
print(f"\nBaseline model performance logged to {PERFORMANCE_EXCEL_PATH}")



Starting baseline LightGBM model training...


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006543 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 676
[LightGBM] [Info] Number of data points in the train set: 17657, number of used features: 31
[LightGBM] [Info] Start training from score -6.888516
[LightGBM] [Info] Start training from score -2.692150
[LightGBM] [Info] Start training from score -0.071232
Baseline model training completed in 4.27 seconds.

Baseline Model Performance:
Training Metrics (Baseline):
  Precision: 0.9489
  Recall: 0.9462
  F1: 0.9293
  ROC_AUC: 0.9567

Test Metrics (Baseline):
  Precision: 0.9020
  Recall: 0.9311
  F1: 0.9028
  ROC_AUC: 0.7043

Baseline model performance logged to ../reports/model_performance_summary.xlsx


  "Timestamp": datetime.utcnow().isoformat(),


## 4. Hyperparameter Tuning

In [None]:
# Placeholder for hyperparameter tuning (e.g., GridSearchCV or RandomizedSearchCV)

### 4.1 Define Parameter Grid

In [12]:
# Define the parameter grid for LightGBM
# This is an example grid; adjust based on computational resources and desired search space.
param_grid_lgbm = {
    'n_estimators': [100, 200, 300, 500],
    'learning_rate': [0.01, 0.05, 0.1],
    'num_leaves': [20, 31, 40, 50], # Default is 31
    'max_depth': [-1, 10, 20], # -1 means no limit
    'reg_alpha': [0, 0.1, 0.5, 1], # L1 regularization
    'reg_lambda': [0, 0.1, 0.5, 1], # L2 regularization
    'colsample_bytree': [0.7, 0.8, 0.9, 1.0], # Subsample ratio of columns when constructing each tree
    'subsample': [0.7, 0.8, 0.9, 1.0], # Subsample ratio of the training instance
    'min_child_samples': [10, 20, 30], # Minimum number of data needed in a child (leaf)
    # 'scale_pos_weight': [1, calculated_scale_pos_weight], # If not using SMOTE and want to tune this
    # 'is_unbalance': [True, False] # If not using SMOTE and want to tune this
}

print("Parameter grid for LightGBM defined:")
for key, value in param_grid_lgbm.items():
    print(f"  {key}: {value}")

Parameter grid for LightGBM defined:
  n_estimators: [100, 200, 300, 500]
  learning_rate: [0.01, 0.05, 0.1]
  num_leaves: [20, 31, 40, 50]
  max_depth: [-1, 10, 20]
  reg_alpha: [0, 0.1, 0.5, 1]
  reg_lambda: [0, 0.1, 0.5, 1]
  colsample_bytree: [0.7, 0.8, 0.9, 1.0]
  subsample: [0.7, 0.8, 0.9, 1.0]
  min_child_samples: [10, 20, 30]


### 4.2 Perform Search

In [None]:
# Initialize LightGBM classifier for tuning
lgbm_tuning = lgb.LGBMClassifier(random_state=RANDOM_STATE, n_jobs=-1)

# Define scoring meLightGBMtrics. F1 is often good for imbalanced classes.
# Using make_scorer to ensure predict_proba is not required for some metrics if not available/needed by them.
scoring = {
    'F1': make_scorer(f1_score, average='weighted', zero_division=0),
    'ROC_AUC': make_scorer(roc_auc_score, needs_proba=True, average='weighted'), # Ensure predict_proba is used
    'Precision': make_scorer(precision_score, average='weighted', zero_division=0),
    'Recall': make_scorer(recall_score, average='weighted', zero_division=0)
}

# Using RandomizedSearchCV for potentially faster search over a large grid.
# For a smaller grid or more exhaustive search, GridSearchCV can be used.
N_ITER_RANDOM_SEARCH = 1#  # Number of parameter settings that are sampled. Adjust as needed.
# N_ITER_RANDOM_SEARCH = 2 # Reduced for quick testing

cv_strategy = StratifiedKFold(n_splits=CV_FOLDS, shuffle=True, random_state=RANDOM_STATE)

# Create filtered datasets that remove problematic columns for LightGBM
columns_to_drop = ["DATETIME_UTC", "season"]
X_train_final_numeric = X_train_final.drop(columns=columns_to_drop, errors='ignore')

random_search_lgbm = RandomizedSearchCV(
    estimator=lgbm_tuning,
    param_distributions=param_grid_lgbm,
    n_iter=N_ITER_RANDOM_SEARCH,
    scoring=scoring,
    refit='F1', # Refit the best estimator using F1 score
    cv=cv_strategy,
    verbose=2, # Set to 1 or higher for more messages
    random_state=RANDOM_STATE,
    n_jobs=-1 # Use all available cores for CV fits
)

print(f"Starting RandomizedSearchCV for LightGBM with {N_ITER_RANDOM_SEARCH} iterations...")
search_start_time = time.time()
random_search_lgbm.fit(X_train_final_numeric, y_train_final)
search_time = time.time() - search_start_time
print(f"RandomizedSearchCV completed in {search_time:.2f} seconds.")

Starting RandomizedSearchCV for LightGBM with 5 iterations...
Fitting 5 folds for each of 5 candidates, totalling 25 fits


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 2.223655 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 677
[LightGBM] [Info] Number of data points in the train set: 14125, number of used features: 32
[LightGBM] [Info] Start training from score -6.916644
[LightGBM] [Info] Start training from score -2.691898
[LightGBM] [Info] Start training from score -0.071220
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 4.814076 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 672
[LightGBM] [Info] Number of data points in the train set: 14125, number of used features: 29
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 5.332142 seconds.
You can set `force_row_wise=tr

KeyboardInterrupt: 

### 4.3 Log All CV Trials

In [13]:
print("\nLogging all CV trials to Excel...")
cv_results_df = pd.DataFrame(random_search_lgbm.cv_results_)

# Iterate through each trial and log it
for i in range(len(cv_results_df)):
    params_tried = cv_results_df.loc[i, 'params']
    # The main score used for refit (e.g., 'mean_test_F1')
    # The exact name depends on the 'refit' string and how RandomizedSearchCV stores it.
    # It's usually 'mean_test_{refit_metric_name}'
    cv_score_for_set = cv_results_df.loc[i, f'mean_test_{random_search_lgbm.refit_}'] 

    trial_record = {
        'Model_Name': f"{MODEL_NAME}_CV_Trial",
        'Timestamp': datetime.utcnow().isoformat(),
        'Hyperparameter_Set_Tried': json.dumps(params_tried),
        'CV_Score_for_Set': cv_score_for_set,
        'Selected_Final_Hyperparameters': None, # Not applicable for individual trials
        'Training_Time_Seconds': cv_results_df.loc[i, 'mean_fit_time'], # Average fit time for this param set
        'Train_Precision': None, 'Train_Recall': None, 'Train_F1': None, 'Train_ROC_AUC': None, # Not typically available per trial from CV results directly
        'Test_Precision': None, 'Test_Recall': None, 'Test_F1': None, 'Test_ROC_AUC': None, # These are for the final model
        'Class_Imbalance_Strategy': 'Refer to final model section or if scale_pos_weight/is_unbalance in params_tried', # Or specify if varied in grid
        'Notes': f"CV trial {i+1}/{N_ITER_RANDOM_SEARCH}. Scorer: {random_search_lgbm.refit_}."
    }
    append_performance_record(PERFORMANCE_EXCEL_PATH, trial_record)

print(f"All {len(cv_results_df)} CV trials logged to {PERFORMANCE_EXCEL_PATH}")
print("\nTop 5 CV results (based on F1 score):")
display(cv_results_df.sort_values(by=f'rank_test_{random_search_lgbm.refit_}').head())


Logging all CV trials to Excel...


AttributeError: 'RandomizedSearchCV' object has no attribute 'cv_results_'

### 4.4 Best Parameters and Score

In [12]:
best_params_lgbm = random_search_lgbm.best_params_
best_score_lgbm = random_search_lgbm.best_score_

print(f"Best Hyperparameters found for {MODEL_NAME}:")
print(json.dumps(best_params_lgbm, indent=2))
print(f"\nBest CV Score ({random_search_lgbm.refit_}): {best_score_lgbm:.4f}")

AttributeError: 'RandomizedSearchCV' object has no attribute 'best_params_'

## 5. Final Model Training & Evaluation (with Best Hyperparameters)

In [None]:
print("\nTraining final LightGBM model with best hyperparameters...")
start_time_final_model = time.time()

# Instantiate the final model with the best parameters
# Note: RandomizedSearchCV automatically refits the best estimator on the whole training data (used for CV)
# So, random_search_lgbm.best_estimator_ is already trained.
lgbm_final = random_search_lgbm.best_estimator_\n

# If you wanted to train it explicitly (e.g., if refit=False or for clarity):
# lgbm_final = lgb.LGBMClassifier(**best_params_lgbm, random_state=RANDOM_STATE, n_jobs=-1)
# lgbm_final.fit(X_train_final, y_train_final)

training_time_final_model = time.time() - start_time_final_model # This would be training time if explicitly fit
# If using best_estimator_, the 'training time' is more complex as it includes CV search time.
# For simplicity, we can log the search_time or the refit time if available, or time for this cell if re-fitting.
# Here, we'll assume 'search_time' captured the bulk of the effort for finding the best model.
print(f"Final model (best_estimator_ from RandomizedSearch) obtained. Search took {search_time:.2f} seconds.")

# Make predictions with the final model
y_pred_train_final = lgbm_final.predict(X_train_final)
y_pred_test_final = lgbm_final.predict(X_test_scaled)

y_prob_train_final = lgbm_final.predict_proba(X_train_final)[:, 1]
y_prob_test_final = lgbm_final.predict_proba(X_test_scaled)[:, 1]

# Evaluate final model performance
print("\nFinal Model Performance:")
train_metrics_final = compute_classification_metrics(y_train_final, y_pred_train_final, y_prob_train_final)
print("Training Metrics (Final Model):")
for metric, value in train_metrics_final.items():
    print(f"  {metric}: {value:.4f}")

test_metrics_final = compute_classification_metrics(y_test, y_pred_test_final, y_prob_test_final)
print("\nTest Metrics (Final Model):")
for metric, value in test_metrics_final.items():
    print(f"  {metric}: {value:.4f}")

# Determine Class Imbalance Strategy string for logging
imbalance_strategy_note = "Default LightGBM params or manually set"
if 'scale_pos_weight' in best_params_lgbm and best_params_lgbm['scale_pos_weight'] is not None and best_params_lgbm['scale_pos_weight'] != 1:
    imbalance_strategy_note = f"scale_pos_weight={best_params_lgbm['scale_pos_weight']:.2f}"
elif 'is_unbalance' in best_params_lgbm and best_params_lgbm['is_unbalance'] == True:
    imbalance_strategy_note = "is_unbalance=True"
# If SMOTE was applied to X_train_final, that would be the primary strategy.
# For this template, we assume X_train_final was either original or SMOTE'd, and LGBM params are secondary/alternative.
# Modify this logic if SMOTE was used to create X_train_final.

# Log final model performance to Excel
final_model_record = {
    'Model_Name': f"{MODEL_NAME}_Tuned",
    'Timestamp': datetime.utcnow().isoformat(),
    'Hyperparameter_Set_Tried': f"RandomizedSearchCV_iters={N_ITER_RANDOM_SEARCH}",
    'CV_Score_for_Set': best_score_lgbm, # Best score from CV
    'Selected_Final_Hyperparameters': json.dumps(best_params_lgbm),
    'Training_Time_Seconds': search_time, # Using total search time as a proxy for effort to get best model
    'Train_Precision': train_metrics_final.get('Precision'),
    'Train_Recall': train_metrics_final.get('Recall'),
    'Train_F1': train_metrics_final.get('F1'),
    'Train_ROC_AUC': train_metrics_final.get('ROC_AUC'),
    'Test_Precision': test_metrics_final.get('Precision'),
    'Test_Recall': test_metrics_final.get('Recall'),
    'Test_F1': test_metrics_final.get('F1'),
    'Test_ROC_AUC': test_metrics_final.get('ROC_AUC'),
    'Class_Imbalance_Strategy': imbalance_strategy_note, # Reflect actual strategy based on best_params or if SMOTE was used
    'Notes': f"Final model after RandomizedSearchCV with {N_ITER_RANDOM_SEARCH} iterations. Refit on F1."
}

append_performance_record(PERFORMANCE_EXCEL_PATH, final_model_record)
print(f"\nFinal model performance logged to {PERFORMANCE_EXCEL_PATH}")

## 6. Model Persistence

In [None]:
print(f"\nSaving the final tuned LightGBM model to {MODEL_SAVE_PATH}...")
try:
    # Ensure the directory exists
    import os
    os.makedirs(os.path.dirname(MODEL_SAVE_PATH), exist_ok=True)
    
    joblib.dump(lgbm_final, MODEL_SAVE_PATH)
    print(f"Model successfully saved to {MODEL_SAVE_PATH}")
except Exception as e:
    print(f"Error saving model: {e}")

## 7. Conclusion and Summary

In [None]:
print("## 7. Conclusion and Summary")
print("--- ")
print(f"The LightGBM model development process is now complete for notebook {MODEL_NAME}.")
print("Key steps included:")
print("- Data loading, preprocessing (splitting, scaling). ")
print("- Addressed class imbalance (details in section 2.5 and model parameters). ")
print("- Trained a baseline LightGBM model and logged its performance. ")
print(f"- Performed hyperparameter tuning using RandomizedSearchCV with {N_ITER_RANDOM_SEARCH} iterations and {CV_FOLDS}-fold CV, optimizing for F1 score.")
print("- Logged all CV trials and the best hyperparameters found.")
print(f"Best hyperparameters: {json.dumps(best_params_lgbm)}")
print(f"Best CV F1 score: {best_score_lgbm:.4f}")
print("- Trained and evaluated the final model using these best hyperparameters.")
print("Final Model Test Performance:")
for metric, value in test_metrics_final.items():
    print(f"  - Test {metric}: {value:.4f}")
print(f"- Saved the final tuned model to: {MODEL_SAVE_PATH}")
print(f"- All performance metrics and hyperparameter trials have been logged to: {PERFORMANCE_EXCEL_PATH}")
print("\nFurther analysis should involve comparing this model's performance against other models developed in this project.")
print("Consider factors like interpretability (e.g., feature importances from LightGBM), training time, and specific business requirements when selecting the overall best model.")