1️⃣ Setup and Library Imports

In [91]:
# Install required libraries if not already installed
!pip install wandb
!pip install python-dotenv
!pip install optuna wandb lightgbm python-dotenv



In [99]:
# Import necessary libraries
import pandas as pd
import numpy as np
import os
import optuna
import joblib  # For saving trained models
import wandb  # For experiment tracking
from dotenv import load_dotenv

# Import visualization library
import matplotlib.pyplot as plt

# Import Scikit-learn utilities
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier, GradientBoostingClassifier # Import RandomForestClassifier and GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_absolute_error, r2_score, accuracy_score
from sklearn.decomposition import PCA

# Import preprocessing utilities
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import (
    MinMaxScaler, OneHotEncoder, OrdinalEncoder, StandardScaler, PolynomialFeatures
)

# Import LightGBM model
from lightgbm import LGBMClassifier

# Set a fixed random state for reproducibility
random_state = 42

# Improve Matplotlib font rendering
plt.rc("font", family="serif", size=12)

2️⃣ Google Drive Integration (For Google Colab Users)

In [93]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


3️⃣ Load Dataset

In [94]:
# Define dataset location (modify path if necessary)
DATASET_LOC = "/content/drive/MyDrive/EAFIT/SEGUNDO SEMESTRE/APRENDIZAJE AUTOMÁTICO/COMPETITION 1"
prepared_data_file = "train_file.csv"

# Load dataset into a Pandas DataFrame
df = pd.read_csv(os.path.join(DATASET_LOC, prepared_data_file))

# Display first few rows
df.head()


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001800,1,1,1,1,0,2510.0,1983.0,140.0,180.0,1.0,2,0
1,LP002911,1,1,1,0,0,2787.0,1917.0,146.0,360.0,0.0,0,0
2,LP002364,1,1,0,0,0,14880.0,0.0,96.0,360.0,1.0,1,1
3,LP001924,1,0,0,0,0,3158.0,3053.0,89.0,360.0,1.0,0,1
4,LP002788,1,1,0,1,0,2454.0,2333.0,181.0,360.0,0.0,2,0


4️⃣ Integrate with Weights & Biases (W&B)

In [95]:
# Load environment variables from .env file
load_dotenv('../api_keys.env')

# Get the API key from environment variable
api_key = os.getenv('WANDB_API_KEY')

# Log in to wandb using the API key
wandb.login()


True

6️⃣ Data Preprocessing

In [96]:
# Identify numerical and categorical columns
num_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History']
cat_cols = ['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area']
target = 'Loan_Status'

# Define preprocessing for numerical features (imputation + scaling)
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Define preprocessing for categorical features (imputation + one-hot encoding)
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine transformations into a ColumnTransformer
preprocessor = ColumnTransformer(transformers=[
    ('num', num_transformer, X_train.select_dtypes(include=['int64', 'float64']).columns),
    ('cat', cat_transformer, X_train.select_dtypes(include=['object']).columns)
    ],
    remainder = 'passthrough'
)

In [97]:
# Initialize W&B
load_dotenv('../api_keys.env')
wandb.login()
wandb.init(project="ML_Optuna_Tuning", name="Hyperparameter_Optimization")

# Define objective functions for Optuna
def optimize_logistic(trial):
    params = {
        "C": trial.suggest_loguniform("C", 0.01, 10),
        "solver": trial.suggest_categorical("solver", ["lbfgs", "liblinear"]),
    }
    model = Pipeline([
        ("preprocessor", preprocessor),
        ("classifier", LogisticRegression(**params, random_state=random_state))
    ])
    model.fit(X_train, y_train)
    y_pred_train = model.predict(X_train)
    return accuracy_score(y_train, y_pred_train)

def optimize_random_forest(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 300),
        "max_depth": trial.suggest_int("max_depth", 5, 20),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
    }
    model = Pipeline([
        ("preprocessor", preprocessor),
        ("classifier", RandomForestClassifier(**params, random_state=random_state))
    ])
    model.fit(X_train, y_train)
    y_pred_train = model.predict(X_train)
    return accuracy_score(y_train, y_pred_train)

def optimize_gradient_boosting(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 300),
        "learning_rate": trial.suggest_loguniform("learning_rate", 0.01, 0.5),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
    }
    model = Pipeline([
        ("preprocessor", preprocessor),
        ("classifier", GradientBoostingClassifier(**params, random_state=random_state))
    ])
    model.fit(X_train, y_train)
    y_pred_train = model.predict(X_train)
    return accuracy_score(y_train, y_pred_train)

def optimize_lightgbm(trial):
    params = {
        "num_leaves": trial.suggest_int("num_leaves", 20, 100),
        "learning_rate": trial.suggest_loguniform("learning_rate", 0.01, 0.5),
        "n_estimators": trial.suggest_int("n_estimators", 50, 300),
    }
    model = Pipeline([
        ("preprocessor", preprocessor),
        ("classifier", LGBMClassifier(**params, random_state=random_state))
    ])
    model.fit(X_train, y_train)
    y_pred_train = model.predict(X_train)
    return accuracy_score(y_train, y_pred_train)


In [100]:
# Optimize Logistic Regression
study_logistic = optuna.create_study(direction="maximize")
study_logistic.optimize(optimize_logistic, n_trials=20)

# Optimize Random Forest
study_rf = optuna.create_study(direction="maximize")
study_rf.optimize(optimize_random_forest, n_trials=20)

# Optimize Gradient Boosting
study_gb = optuna.create_study(direction="maximize")
study_gb.optimize(optimize_gradient_boosting, n_trials=20)

# Optimize LightGBM
study_lgbm = optuna.create_study(direction="maximize")
study_lgbm.optimize(optimize_lightgbm, n_trials=20)


[I 2025-02-17 23:49:41,925] A new study created in memory with name: no-name-8319b4f4-beca-4384-a278-20c934724f04
  "C": trial.suggest_loguniform("C", 0.01, 10),
[I 2025-02-17 23:49:42,067] Trial 0 finished with value: 0.7989821882951654 and parameters: {'C': 0.15185118478201015, 'solver': 'liblinear'}. Best is trial 0 with value: 0.7989821882951654.
  "C": trial.suggest_loguniform("C", 0.01, 10),
[I 2025-02-17 23:49:42,234] Trial 1 finished with value: 0.8575063613231552 and parameters: {'C': 1.7650000826846421, 'solver': 'liblinear'}. Best is trial 1 with value: 0.8575063613231552.
  "C": trial.suggest_loguniform("C", 0.01, 10),
[I 2025-02-17 23:49:42,291] Trial 2 finished with value: 0.7989821882951654 and parameters: {'C': 0.32872748583428996, 'solver': 'lbfgs'}. Best is trial 1 with value: 0.8575063613231552.
  "C": trial.suggest_loguniform("C", 0.01, 10),
[I 2025-02-17 23:49:42,334] Trial 3 finished with value: 0.806615776081425 and parameters: {'C': 0.6204436499680256, 'solver':

[LightGBM] [Info] Number of positive: 262, number of negative: 131
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000137 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 318
[LightGBM] [Info] Number of data points in the train set: 393, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.666667 -> initscore=0.693147
[LightGBM] [Info] Start training from score 0.693147
[LightGBM] [Info] Number of positive: 262, number of negative: 131
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000091 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 318
[LightGBM] [Info] Number of data points in the train set: 393, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.666667 -> initscore=0.693147
[LightGBM] [Info] S

[I 2025-02-17 23:50:06,886] Trial 1 finished with value: 0.8549618320610687 and parameters: {'num_leaves': 93, 'learning_rate': 0.010151813039657598, 'n_estimators': 253}. Best is trial 1 with value: 0.8549618320610687.
  "learning_rate": trial.suggest_loguniform("learning_rate", 0.01, 0.5),
[I 2025-02-17 23:50:07,026] Trial 2 finished with value: 0.8422391857506362 and parameters: {'num_leaves': 90, 'learning_rate': 0.036491052729906825, 'n_estimators': 63}. Best is trial 1 with value: 0.8549618320610687.
  "learning_rate": trial.suggest_loguniform("learning_rate", 0.01, 0.5),


[LightGBM] [Info] Number of positive: 262, number of negative: 131
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000109 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 318
[LightGBM] [Info] Number of data points in the train set: 393, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.666667 -> initscore=0.693147
[LightGBM] [Info] Start training from score 0.693147
[LightGBM] [Info] Number of positive: 262, number of negative: 131
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000129 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 318
[LightGBM] [Info] Number of data points in the train set: 393, number of used features: 11
[LightGBM] [Info] [binary:BoostFro

[I 2025-02-17 23:50:07,176] Trial 3 finished with value: 1.0 and parameters: {'num_leaves': 71, 'learning_rate': 0.3141951100253282, 'n_estimators': 79}. Best is trial 3 with value: 1.0.
  "learning_rate": trial.suggest_loguniform("learning_rate", 0.01, 0.5),


[LightGBM] [Info] Number of positive: 262, number of negative: 131
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000098 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 318
[LightGBM] [Info] Number of data points in the train set: 393, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.666667 -> initscore=0.693147
[LightGBM] [Info] Start training from score 0.693147

[I 2025-02-17 23:50:07,462] Trial 4 finished with value: 1.0 and parameters: {'num_leaves': 33, 'learning_rate': 0.15665058949826557, 'n_estimators': 193}. Best is trial 3 with value: 1.0.



[LightGBM] [Info] Number of positive: 262, number of negative: 131
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000087 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 318
[LightGBM] [Info] Number of data points in the train set: 393, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.666667 -> initscore=0.693147
[LightGBM] [Info] Start training from score 0.693147


  "learning_rate": trial.suggest_loguniform("learning_rate", 0.01, 0.5),
[I 2025-02-17 23:50:07,688] Trial 5 finished with value: 0.9669211195928753 and parameters: {'num_leaves': 44, 'learning_rate': 0.043654907931658944, 'n_estimators': 159}. Best is trial 3 with value: 1.0.


[LightGBM] [Info] Number of positive: 262, number of negative: 131
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000119 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 318
[LightGBM] [Info] Number of data points in the train set: 393, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.666667 -> initscore=0.693147
[LightGBM] [Info] Start training from score 0.693147


  "learning_rate": trial.suggest_loguniform("learning_rate", 0.01, 0.5),
[I 2025-02-17 23:50:07,858] Trial 6 finished with value: 1.0 and parameters: {'num_leaves': 51, 'learning_rate': 0.26771981589476646, 'n_estimators': 108}. Best is trial 3 with value: 1.0.
  "learning_rate": trial.suggest_loguniform("learning_rate", 0.01, 0.5),


[LightGBM] [Info] Number of positive: 262, number of negative: 131
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000112 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 318
[LightGBM] [Info] Number of data points in the train set: 393, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.666667 -> initscore=0.693147
[LightGBM] [Info] Start training from score 0.693147

[I 2025-02-17 23:50:08,031] Trial 7 finished with value: 1.0 and parameters: {'num_leaves': 95, 'learning_rate': 0.4084032768594505, 'n_estimators': 86}. Best is trial 3 with value: 1.0.
  "learning_rate": trial.suggest_loguniform("learning_rate", 0.01, 0.5),



[LightGBM] [Info] Number of positive: 262, number of negative: 131
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000140 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 318
[LightGBM] [Info] Number of data points in the train set: 393, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.666667 -> initscore=0.693147
[LightGBM] [Info] Start training from score 0.693147

[I 2025-02-17 23:50:08,275] Trial 8 finished with value: 0.8473282442748091 and parameters: {'num_leaves': 73, 'learning_rate': 0.018338001015465018, 'n_estimators': 131}. Best is trial 3 with value: 1.0.



[LightGBM] [Info] Number of positive: 262, number of negative: 131
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000114 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 318
[LightGBM] [Info] Number of data points in the train set: 393, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.666667 -> initscore=0.693147
[LightGBM] [Info] Start training from score 0.693147


  "learning_rate": trial.suggest_loguniform("learning_rate", 0.01, 0.5),
[I 2025-02-17 23:50:08,428] Trial 9 finished with value: 0.9745547073791349 and parameters: {'num_leaves': 65, 'learning_rate': 0.12825800332989495, 'n_estimators': 61}. Best is trial 3 with value: 1.0.
  "learning_rate": trial.suggest_loguniform("learning_rate", 0.01, 0.5),


[LightGBM] [Info] Number of positive: 262, number of negative: 131
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000118 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 318
[LightGBM] [Info] Number of data points in the train set: 393, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.666667 -> initscore=0.693147
[LightGBM] [Info] Start training from score 0.693147


[I 2025-02-17 23:50:08,895] Trial 10 finished with value: 1.0 and parameters: {'num_leaves': 20, 'learning_rate': 0.48896178847242316, 'n_estimators': 294}. Best is trial 3 with value: 1.0.




  "learning_rate": trial.suggest_loguniform("learning_rate", 0.01, 0.5),


[LightGBM] [Info] Number of positive: 262, number of negative: 131
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000122 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 318
[LightGBM] [Info] Number of data points in the train set: 393, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.666667 -> initscore=0.693147
[LightGBM] [Info] Start training from score 0.693147


[I 2025-02-17 23:50:09,396] Trial 11 finished with value: 1.0 and parameters: {'num_leaves': 22, 'learning_rate': 0.1451031500625328, 'n_estimators': 205}. Best is trial 3 with value: 1.0.
  "learning_rate": trial.suggest_loguniform("learning_rate", 0.01, 0.5),


[LightGBM] [Info] Number of positive: 262, number of negative: 131
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000124 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 318
[LightGBM] [Info] Number of data points in the train set: 393, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.666667 -> initscore=0.693147
[LightGBM] [Info] Start training from score 0.693147


[I 2025-02-17 23:50:09,819] Trial 12 finished with value: 1.0 and parameters: {'num_leaves': 79, 'learning_rate': 0.17093085352083465, 'n_estimators': 206}. Best is trial 3 with value: 1.0.




  "learning_rate": trial.suggest_loguniform("learning_rate", 0.01, 0.5),


[LightGBM] [Info] Number of positive: 262, number of negative: 131
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000085 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 318
[LightGBM] [Info] Number of data points in the train set: 393, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.666667 -> initscore=0.693147
[LightGBM] [Info] Start training from score 0.693147

[I 2025-02-17 23:50:10,219] Trial 13 finished with value: 0.9974554707379135 and parameters: {'num_leaves': 57, 'learning_rate': 0.08178443125220872, 'n_estimators': 176}. Best is trial 3 with value: 1.0.



[LightGBM] [Info] Number of positive: 262, number of negative: 131
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000107 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 318
[LightGBM] [Info] Number of data points in the train set: 393, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.666667 -> initscore=0.693147
[LightGBM] [Info] Start training from score 0.693147


  "learning_rate": trial.suggest_loguniform("learning_rate", 0.01, 0.5),




[I 2025-02-17 23:50:10,712] Trial 14 finished with value: 1.0 and parameters: {'num_leaves': 36, 'learning_rate': 0.24653172848865357, 'n_estimators': 237}. Best is trial 3 with value: 1.0.
  "learning_rate": trial.suggest_loguniform("learning_rate", 0.01, 0.5),


[LightGBM] [Info] Number of positive: 262, number of negative: 131
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000126 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 318
[LightGBM] [Info] Number of data points in the train set: 393, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.666667 -> initscore=0.693147
[LightGBM] [Info] Start training from score 0.693147


[I 2025-02-17 23:50:10,983] Trial 15 finished with value: 0.9949109414758269 and parameters: {'num_leaves': 69, 'learning_rate': 0.08896144092003286, 'n_estimators': 134}. Best is trial 3 with value: 1.0.
  "learning_rate": trial.suggest_loguniform("learning_rate", 0.01, 0.5),


[LightGBM] [Info] Number of positive: 262, number of negative: 131
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000123 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 318
[LightGBM] [Info] Number of data points in the train set: 393, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.666667 -> initscore=0.693147
[LightGBM] [Info] Start training from score 0.693147


[I 2025-02-17 23:50:11,354] Trial 16 finished with value: 1.0 and parameters: {'num_leaves': 31, 'learning_rate': 0.3119302221282322, 'n_estimators': 176}. Best is trial 3 with value: 1.0.


[LightGBM] [Info] Number of positive: 262, number of negative: 131
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000145 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 318
[LightGBM] [Info] Number of data points in the train set: 393, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.666667 -> initscore=0.693147
[LightGBM] [Info] Start training from score 0.693147


  "learning_rate": trial.suggest_loguniform("learning_rate", 0.01, 0.5),




[I 2025-02-17 23:50:11,782] Trial 17 finished with value: 1.0 and parameters: {'num_leaves': 81, 'learning_rate': 0.18102992439050586, 'n_estimators': 216}. Best is trial 3 with value: 1.0.


[LightGBM] [Info] Number of positive: 262, number of negative: 131
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000131 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 318
[LightGBM] [Info] Number of data points in the train set: 393, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.666667 -> initscore=0.693147
[LightGBM] [Info] Start training from score 0.693147

  "learning_rate": trial.suggest_loguniform("learning_rate", 0.01, 0.5),
[I 2025-02-17 23:50:12,061] Trial 18 finished with value: 0.9923664122137404 and parameters: {'num_leaves': 29, 'learning_rate': 0.10658969405651991, 'n_estimators': 108}. Best is trial 3 with value: 1.0.





  "learning_rate": trial.suggest_loguniform("learning_rate", 0.01, 0.5),


[LightGBM] [Info] Number of positive: 262, number of negative: 131
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000125 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 318
[LightGBM] [Info] Number of data points in the train set: 393, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.666667 -> initscore=0.693147
[LightGBM] [Info] Start training from score 0.693147


[I 2025-02-17 23:50:12,563] Trial 19 finished with value: 1.0 and parameters: {'num_leaves': 55, 'learning_rate': 0.0661320171959236, 'n_estimators': 271}. Best is trial 3 with value: 1.0.


In [101]:
# Get best parameters
best_params = {
    "LogisticRegression": study_logistic.best_params,
    "RandomForest": study_rf.best_params,
    "GradientBoosting": study_gb.best_params,
    "LightGBM": study_lgbm.best_params,
}

# Train best models
models = {
    "LogisticRegression": Pipeline([
        ("preprocessor", preprocessor),
        ("classifier", LogisticRegression(**best_params["LogisticRegression"], random_state=random_state))
    ]),
    "RandomForest": Pipeline([
        ("preprocessor", preprocessor),
        ("classifier", RandomForestClassifier(**best_params["RandomForest"], random_state=random_state))
    ]),
    "GradientBoosting": Pipeline([
        ("preprocessor", preprocessor),
        ("classifier", GradientBoostingClassifier(**best_params["GradientBoosting"], random_state=random_state))
    ]),
    "LightGBM": Pipeline([
        ("preprocessor", preprocessor),
        ("classifier", LGBMClassifier(**best_params["LightGBM"], random_state=random_state))
    ])
}

# Train models and evaluate
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred_train = model.predict(X_train)
    train_acc = accuracy_score(y_train, y_pred_train)

    print(f"📌 {name} Best Params: {best_params[name]}")
    print(f"Train Accuracy: {train_acc:.4f}")

    # Log results to W&B
    wandb.log({f"{name} Best Params": best_params[name], f"{name} Train Accuracy": train_acc})

wandb.finish()

📌 LogisticRegression Best Params: {'C': 9.242578028277334, 'solver': 'lbfgs'}
Train Accuracy: 1.0000
📌 RandomForest Best Params: {'n_estimators': 194, 'max_depth': 20, 'min_samples_split': 2}
Train Accuracy: 0.8753
📌 GradientBoosting Best Params: {'n_estimators': 125, 'learning_rate': 0.442962950558036, 'max_depth': 5}
Train Accuracy: 1.0000




[LightGBM] [Info] Number of positive: 262, number of negative: 131
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000127 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 318
[LightGBM] [Info] Number of data points in the train set: 393, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.666667 -> initscore=0.693147
[LightGBM] [Info] Start training from score 0.693147
📌 LightGBM Best Params: {'num_leaves': 71, 'learning_rate': 0.3141951100253282, 'n_estimators': 79}
Train Accuracy: 1.0000




0,1
GradientBoosting Train Accuracy,▁
LightGBM Train Accuracy,▁
LogisticRegression Train Accuracy,▁
RandomForest Train Accuracy,▁

0,1
GradientBoosting Train Accuracy,1.0
LightGBM Train Accuracy,1.0
LogisticRegression Train Accuracy,1.0
RandomForest Train Accuracy,0.87532


8️⃣ Train Logistic Regression Model

In [57]:
# Define and train Logistic Regression model
logistic_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=random_state))
])
# Train the model
logistic_model.fit(X_train, y_train)


In [58]:
# start a new wandb run to track this script
config=rf.best_params_
config['method'] = 'LogisticRegression'
wandb.init(
    # set the wandb project where this run will be logged
    project="COMP 1",
    name = "LogisticRegression",
    # track hyperparameters and run metadata
    config=config
)
best_index = rf.best_index_
best_results = rf.cv_results_
for metric in scorers.keys():
    wandb.log({
        f'best_{metric}_mean': best_results[f'mean_test_{metric}'][best_index],
        f'best_{metric}_std': best_results[f'std_test_{metric}'][best_index],
    })
wandb.finish()

0,1
best_mae_mean,▁
best_mae_std,▁
best_r2_mean,▁
best_r2_std,▁

0,1
best_mae_mean,0.33029
best_mae_std,0.02673
best_r2_mean,0.27733
best_r2_std,0.09217


9️⃣ Train LightGBM Model

In [59]:
# Define and train LightGBM model
lgbm_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LGBMClassifier(random_state=random_state))
])

# Train the model
lgbm_model.fit(X_train, y_train)



[LightGBM] [Info] Number of positive: 262, number of negative: 131
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000192 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 327
[LightGBM] [Info] Number of data points in the train set: 393, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.666667 -> initscore=0.693147
[LightGBM] [Info] Start training from score 0.693147


In [60]:
# start a new wandb run to track this script
config=rf.best_params_
config['method'] = 'LGBMClassifier'
wandb.init(
    # set the wandb project where this run will be logged
    project="COMP 1",
    name = "LGBMClassifier",
    # track hyperparameters and run metadata
    config=config
)
best_index = rf.best_index_
best_results = rf.cv_results_
for metric in scorers.keys():
    wandb.log({
        f'best_{metric}_mean': best_results[f'mean_test_{metric}'][best_index],
        f'best_{metric}_std': best_results[f'std_test_{metric}'][best_index],
    })
wandb.finish()

0,1
best_mae_mean,▁
best_mae_std,▁
best_r2_mean,▁
best_r2_std,▁

0,1
best_mae_mean,0.33029
best_mae_std,0.02673
best_r2_mean,0.27733
best_r2_std,0.09217


🔟 Train Gradient Boosting Model

In [61]:
# Define and train Gradient Boosting model
gb_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor(random_state=random_state))
])

# Train the model
gb_model.fit(X_train, y_train)

In [79]:
X_test_prep = pd.DataFrame(X_test_prep, columns=X_train.columns)

# start a new wandb run to track this script
config=gb.best_params_
config['method'] = 'GradientBoostingRegressor'
wandb.init(
    # set the wandb project where this run will be logged
    project="COMP 1",
    name = "GradientBoostingRegressor",
    # track hyperparameters and run metadata
    config=config
)
best_index = gb.best_index_
best_results = gb.cv_results_
for metric in scorers.keys():
    wandb.log({
        'r2_test': gb_model.score(X_test_prep, y_test), # Now X_test_prep is a DataFrame
        'mae_test': mean_absolute_error(y_test, gb_model.predict(X_test_prep)),
    })
wandb.finish()

0,1
mae_test,▁▁
r2_test,▁▁

0,1
mae_test,0.57138
r2_test,-1.14319


1️⃣1️⃣ Train Random Forest Model

In [80]:
# Define and train Random Forest model
rf_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=random_state))
])

# Train the model
rf_model.fit(X_train, y_train)


In [81]:
# start a new wandb run to track this script
config=gb.best_params_
config['method'] = 'RandomForestRegressor'
wandb.init(
    # set the wandb project where this run will be logged
    project="COMP 1",
    name = "RandomForestRegressor",
    # track hyperparameters and run metadata
    config=config
)
best_index = gb.best_index_
best_results = gb.cv_results_
for metric in scorers.keys():
    wandb.log({
        'r2_test': gb_model.score(X_test_prep, y_test),
        'mae_test': mean_absolute_error(y_test, gb_model.predict(X_test_prep)),
    })
wandb.finish()

0,1
mae_test,▁▁
r2_test,▁▁

0,1
mae_test,0.57138
r2_test,-1.14319
