In [1]:
import os
import sys 
import pickle 
import joblib
import pandas as pd 
import numpy as np 

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import f1_score, roc_auc_score, classification_report

# Make the project's source directory importable so `config` resolves
# (the path is relative to the notebook's working directory).
src_dir = '../src'
if src_dir not in sys.path:
    sys.path.append(src_dir)

import config

# Create the model output directory up front; a no-op if it already exists.
os.makedirs(config.MODEL_PATH, exist_ok=True)

In [2]:
# Load the pre-processed train/test splits written to config.PROCESSED_DATA_PATH.
print("Loading processed data...")

X_train = pd.read_csv(os.path.join(config.PROCESSED_DATA_PATH, 'X_train.csv'))
X_test = pd.read_csv(os.path.join(config.PROCESSED_DATA_PATH, 'X_test.csv'))

# squeeze("columns") turns a single-column DataFrame into a Series. Unlike a
# bare .squeeze(), it never collapses a one-row file down to a scalar.
y_train = pd.read_csv(os.path.join(config.PROCESSED_DATA_PATH, 'y_train.csv')).squeeze("columns")
y_test = pd.read_csv(os.path.join(config.PROCESSED_DATA_PATH, 'y_test.csv')).squeeze("columns")

# Fail fast if features and targets are misaligned, rather than during model fitting.
assert len(X_train) == len(y_train), (
    f"X_train has {len(X_train)} rows but y_train has {len(y_train)}"
)
assert len(X_test) == len(y_test), (
    f"X_test has {len(X_test)} rows but y_test has {len(y_test)}"
)

print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")


Loading processed data...
X_train shape: (40000, 22)
y_train shape: (40000,)


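### Model Training & Hyperparameter Tuning Strategy

Four candidate classifiers are tuned with a cross-validated grid search and compared on macro-averaged F1:

1.  **Logistic Regression:** A simple, interpretable linear model that serves as a strong baseline.
2.  **Random Forest:** A powerful ensemble of decision trees, robust to non-linear relationships.
3.  **LightGBM:** A gradient boosting framework known for its high performance and efficiency.
4.  **XGBoost:** A second gradient boosting library, included for comparison against LightGBM.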


In [3]:
# Candidate estimators keyed by display name. Every model is seeded with
# config.RANDOM_STATE; the keys must match those in `param_grids` below.
models = dict(
    LogisticRegression=LogisticRegression(max_iter=1000, random_state=config.RANDOM_STATE),
    RandomForest=RandomForestClassifier(random_state=config.RANDOM_STATE),
    XGBoost=XGBClassifier(eval_metric='logloss', random_state=config.RANDOM_STATE),
    LightGBM=LGBMClassifier(random_state=config.RANDOM_STATE),
)

# Hyperparameter grids, one per estimator above.
# Candidates per model: LogisticRegression 3, the three tree models 2*2*2 = 8 each.
param_grids = dict(
    LogisticRegression=dict(
        C=[0.1, 1.0, 10.0],
    ),
    RandomForest=dict(
        n_estimators=[100, 200],
        max_depth=[10, 20],
        min_samples_split=[2, 5],
    ),
    XGBoost=dict(
        n_estimators=[100, 200],
        max_depth=[5, 10],
        learning_rate=[0.05, 0.1],
    ),
    LightGBM=dict(
        n_estimators=[100, 200],
        learning_rate=[0.05, 0.1],
        num_leaves=[31, 50],
    ),
)

In [None]:
# One seeded, shuffled 5-fold stratified splitter shared by every search.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=config.RANDOM_STATE)

# Search settings that do not change from model to model.
search_settings = dict(scoring='f1_macro', cv=skf, n_jobs=-1, verbose=1)

# One summary record per tuned model; turned into a DataFrame in a later cell.
results = []

for model_name, model in models.items():
    print(f"Tuning {model_name}")

    # Exhaustive grid search over this model's grid, scored by macro F1.
    grid_search = GridSearchCV(model, param_grids[model_name], **search_settings)
    grid_search.fit(X_train, y_train)

    best_score = grid_search.best_score_
    results.append({
        'Model': model_name,
        'Best Score (f1_macro)': best_score,
        'Best Params': grid_search.best_params_,
    })

    print(f"Completed tuning for {model_name}. Best F1 Macro: {best_score:.4f}\n")

In [None]:
# Rank the tuned models from best to worst mean CV macro-F1.
score_column = 'Best Score (f1_macro)'
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by=score_column, ascending=False)

# Persist the comparison table next to the model artifacts.
comparison_path = os.path.join(config.MODEL_PATH, 'f1_macro.csv')
results_df.to_csv(comparison_path, index=False)

print("--- Model Comparison --- ")
display(results_df)

In [None]:
# Retrain LightGBM on the full training set using its best grid-search parameters.

# Look the LightGBM row up explicitly so a missing entry fails with a clear
# message instead of an opaque IndexError from .iloc[0].
lgbm_rows = results_df.loc[results_df['Model'] == 'LightGBM']
if lgbm_rows.empty:
    raise ValueError(
        "No tuning results for 'LightGBM' found in results_df; "
        "re-run the tuning and comparison cells first."
    )

best_params = lgbm_rows['Best Params'].iloc[0]
print(f"Best parameters for LightGBM: {best_params}")

# LightGBM is selected by name, not by rank. Surface it when another model
# achieved a strictly higher CV score so the choice is a conscious one.
score_col = 'Best Score (f1_macro)'
lgbm_cv_score = lgbm_rows[score_col].iloc[0]
best_cv_score = results_df[score_col].max()
if lgbm_cv_score < best_cv_score:
    top_model = results_df.loc[results_df[score_col].idxmax(), 'Model']
    print(
        f"Note: {top_model} had the highest CV f1_macro ({best_cv_score:.4f}); "
        f"LightGBM scored {lgbm_cv_score:.4f}."
    )

final_model = LGBMClassifier(**best_params, random_state=config.RANDOM_STATE)

final_model.fit(X_train, y_train)

In [None]:
# Serialize the fitted final model into the models directory with joblib.
artifact_name = 'final_lgbm_model.joblib'
model_path = os.path.join(config.MODEL_PATH, artifact_name)
joblib.dump(final_model, model_path)

print(f"Final model saved successfully to: {model_path}")

In [None]:
# Show what has been written to the models directory.
print("Files in the models directory:")
# os.listdir returns entries in arbitrary order; sort so the cell output is
# reproducible across runs and platforms.
sorted(os.listdir(config.MODEL_PATH))