In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import klib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
import lightgbm as lgb


In [2]:
# Load the dataset
data = pd.read_csv(r"C:\Users\paulo\OneDrive\Documents\kaggle_competition_2_datasets\train.csv", index_col='id')

# Separate features and target variable
X = data.drop('Response', axis=1)
y = data['Response']

# Determine sample size (10% of the dataset)
sample_size = 0.1

# Stratified sampling
X_sample, _, y_sample, _ = train_test_split(X, y, train_size=sample_size, stratify=y, random_state=42)

# Combine sampled features and target variable
data_sampled = pd.concat([X_sample, y_sample], axis=1)


In [None]:
# Transform binary variables
data_sampled['Gender'] = data_sampled['Gender'].map({'Male': 1, 'Female': 0})
data_sampled['Vehicle_Damage'] = data_sampled['Vehicle_Damage'].map({'Yes': 1, 'No': 0})

# Ordinal Encoding for Vehicle_Age
vehicle_age_mapping = {'< 1 Year': 0, '1-2 Year': 1, '> 2 Years': 2}
data_sampled['Vehicle_Age'] = data_sampled['Vehicle_Age'].map(vehicle_age_mapping)

# Drop Driving_License due to limited variability
data_sampled = data_sampled.drop(['Driving_License'], axis=1)

# Convert specific columns to categorical
data_sampled[['Gender', 'Vehicle_Damage', 'Vehicle_Age', 'Response']] = data_sampled[['Gender', 'Vehicle_Damage', 'Vehicle_Age', 'Response']].astype('category')

def remove_outliers_iqr(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    filtered_df = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)].copy()
    return filtered_df

data_sampled = remove_outliers_iqr(data_sampled, 'Annual_Premium')

# Min-Max Scaling for Age
data_sampled['Age_MinMax'] = (data_sampled['Age'] - data_sampled['Age'].min()) / (data_sampled['Age'].max() - data_sampled['Age'].min())

# Min-Max Scaling for Vintage
data_sampled['Vintage_MinMax'] = (data_sampled['Vintage'] - data_sampled['Vintage'].min()) / (data_sampled['Vintage'].max() - data_sampled['Vintage'].min())

# Ensure correct columns are used for the model
data_sampled = data_sampled.drop(columns=['Age', 'Vintage'])

# Separate numerical features
numerical_features = ['Annual_Premium', 'Age_MinMax', 'Vintage_MinMax']

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit and transform the numerical features
data_sampled[numerical_features] = scaler.fit_transform(data_sampled[numerical_features])

data_sampled = klib.data_cleaning(data_sampled)

# Separate features and target variable
X_sampled = data_sampled.drop('response', axis=1)
y_sampled = data_sampled['response'].cat.codes  # Convert categorical target to numeric codes


Shape of cleaned data: (911974, 10) - Remaining NAs: 0


Dropped rows: 1
     of which 1 duplicates. (Rows (first 150 shown): [3685020])

Dropped columns: 0
     of which 0 single valued.     Columns: []
Dropped missing values: 0
Reduced memory by at least: 62.69 MB (-74.25%)



In [None]:
# Split the data
X_train, X_val, y_train, y_val = train_test_split(X_sampled, y_sampled, test_size=0.2, random_state=42, stratify=y_sampled)


In [None]:
import optuna
from datetime import datetime
import os
import logging
import lightgbm as lgb
from sklearn.metrics import roc_auc_score

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Objective function for Optuna
def objective(trial):
    param = {
        'objective': 'binary',
        'metric': 'auc',
        'lambda_l1': trial.suggest_float('lambda_l1', 0.01, 1.0, log=True),  # Based on reg_alpha
        'lambda_l2': trial.suggest_float('lambda_l2', 0.1, 0.5, log=True),  # Based on reg_lambda
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.7, 0.8, 0.9, 1.0]),  # Existing values
        'subsample': trial.suggest_categorical('subsample', [0.7, 0.8, 1.0]),  # Existing values
        'learning_rate': trial.suggest_categorical('learning_rate', [0.012, 0.013, 0.014]),  # Focused range
        'n_estimators': trial.suggest_int('n_estimators', 4500, 20000),  # Based on n_estimators
        'max_depth': trial.suggest_categorical('max_depth', [12, 14, 16]),  # Similar to existing values
        'num_leaves': trial.suggest_int('num_leaves', 31, 1024),  # Add num_leaves parameter
        'random_state': 42,
        'min_child_samples': trial.suggest_int('min_child_samples', 40, 50),  # Similar to existing values
        'max_bin': 1024  
    }

    model = lgb.LGBMClassifier(**param)
    
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        eval_metric='auc',
    )
    
    y_val_pred = model.predict_proba(X_val)[:, 1]
    auc = roc_auc_score(y_val, y_val_pred)
    return auc

logger.info("Starting hyperparameter optimization with Optuna.")
# Generate a unique filename for each run
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
sqlite_file_path = os.path.join(r'C:\Users\paulo\OneDrive\Documents\Binary-Classification-of-Insurance-Cross-Selling\models_base', f'optuna_study_{timestamp}.db')

study = optuna.create_study(storage=f'sqlite:///{sqlite_file_path}', study_name=f'my_study_{timestamp}', direction='maximize')
study.optimize(objective, n_trials=25)

logger.info(f"Best trial parameters: {study.best_trial.params}")
logger.info(f"Best trial AUC: {study.best_trial.value}")


INFO:__main__:Starting hyperparameter optimization with Optuna.
[I 2024-07-18 00:42:04,184] A new study created in RDB with name: my_study_20240718_004203
  'lambda_l1': trial.suggest_loguniform('lambda_l1', 0.01, 1.0),  # Based on reg_alpha
  'lambda_l2': trial.suggest_loguniform('lambda_l2', 0.1, 0.5),  # Based on reg_lambda


[LightGBM] [Info] Number of positive: 88243, number of negative: 641336
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013747 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1546
[LightGBM] [Info] Number of data points in the train set: 729579, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.120951 -> initscore=-1.983459
[LightGBM] [Info] Start training from score -1.983459


[I 2024-07-18 00:43:28,598] Trial 0 finished with value: 0.8851531719951085 and parameters: {'lambda_l1': 0.5253788610520673, 'lambda_l2': 0.15374274547064645, 'colsample_bytree': 0.7, 'subsample': 0.7, 'learning_rate': 0.013, 'n_estimators': 4802, 'max_depth': 16, 'min_child_samples': 48, 'max_bin': 1028}. Best is trial 0 with value: 0.8851531719951085.
  'lambda_l1': trial.suggest_loguniform('lambda_l1', 0.01, 1.0),  # Based on reg_alpha
  'lambda_l2': trial.suggest_loguniform('lambda_l2', 0.1, 0.5),  # Based on reg_lambda


[LightGBM] [Info] Number of positive: 88243, number of negative: 641336
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.019537 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1546
[LightGBM] [Info] Number of data points in the train set: 729579, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.120951 -> initscore=-1.983459
[LightGBM] [Info] Start training from score -1.983459


[W 2024-07-18 00:43:31,909] Trial 1 failed with parameters: {'lambda_l1': 0.14192185240591346, 'lambda_l2': 0.19161403470196509, 'colsample_bytree': 0.9, 'subsample': 1.0, 'learning_rate': 0.013, 'n_estimators': 4641, 'max_depth': 16, 'min_child_samples': 45, 'max_bin': 1028} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "c:\Users\paulo\anaconda3\envs\pytorch_env\Lib\site-packages\optuna\study\_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\paulo\AppData\Local\Temp\ipykernel_50704\1564379230.py", line 29, in objective
    model.fit(
  File "c:\Users\paulo\anaconda3\envs\pytorch_env\Lib\site-packages\lightgbm\sklearn.py", line 1187, in fit
    super().fit(
  File "c:\Users\paulo\anaconda3\envs\pytorch_env\Lib\site-packages\lightgbm\sklearn.py", line 885, in fit
    self._Booster = train(
                    ^^^^^^
  File "c:\Users\paulo\anaconda3\envs\pytorch_env\

KeyboardInterrupt: 

In [None]:
# LightGBM parameters
params = study.best_trial

# Initialize the LightGBM model
model = lgb.LGBMClassifier(**params, verbose=1)

# Train the model
model.fit(X_train, y_train)


[LightGBM] [Info] Number of positive: 88243, number of negative: 641336
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004775 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1544
[LightGBM] [Info] Number of data points in the train set: 729579, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.120951 -> initscore=-1.983459
[LightGBM] [Info] Start training from score -1.983459


In [None]:
# Make predictions
y_train_pred_proba = model.predict_proba(X_train)[:, 1]
y_val_pred_proba = model.predict_proba(X_val)[:, 1]

# Calculate ROC AUC scores
roc_auc_train = roc_auc_score(y_train, y_train_pred_proba)
roc_auc_val = roc_auc_score(y_val, y_val_pred_proba)

# Print ROC AUC scores
print(f'Training ROC AUC Score: {roc_auc_train}')
print(f'Validation ROC AUC Score: {roc_auc_val}')


Training ROC AUC Score: 0.8904216971404152
Validation ROC AUC Score: 0.8846986856632532
