In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.cluster import KMeans

In [2]:
# Read the competition train and test CSV files, using each file's 'id' column as the index.
# NOTE(review): both paths are absolute and machine-specific.
train_df, test_df = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in (
        r"C:\Users\paulo\OneDrive\Documents\kaggle_competition_2_datasets\train.csv",
        r"C:\Users\paulo\OneDrive\Documents\kaggle_competition_2_datasets\test.csv",
    )
)

In [3]:
# Encode the two-valued text columns as 0/1 integers.
# Labels missing from a mapping become NaN (Series.map semantics).
binary_encodings = {
    'Gender': {'Male': 1, 'Female': 0},
    'Vehicle_Damage': {'Yes': 1, 'No': 0},
}
for binary_col, encoding in binary_encodings.items():
    train_df[binary_col] = train_df[binary_col].map(encoding)

# Driving_License is removed because it shows limited variability.
train_df = train_df.drop(columns=['Driving_License'])

In [4]:
# Handle continuous variables
continuous_numeric = ['Age', 'Vintage', 'Annual_Premium']

# Fences for Annual_Premium: 1.5 * IQR below the first quartile and above the third.
Q1 = train_df['Annual_Premium'].quantile(0.25)
Q3 = train_df['Annual_Premium'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Keep only the rows whose premium lies within the fences (both bounds inclusive).
# An 'Outlier_Annual_Premium' flag column used to be created here and dropped again
# right after the filter without ever being read, so it is no longer materialised.
# .copy() gives an independent frame so later column assignments do not operate on
# a slice of the unfiltered data.
train_df = train_df[train_df['Annual_Premium'].between(lower_bound, upper_bound)].copy()

In [5]:
# Group rare categories in categorical variables
def group_rare_categories(df, column, threshold=0.01):
    """Collapse infrequent categories of ``column`` into a single 'Other' label.

    A category is rare when its share of the non-null values in ``column`` is
    strictly below ``threshold``. Missing values are not counted and are left
    as they are.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding ``column``. It is not modified.
    column : hashable
        Name of the column whose rare categories are grouped.
    threshold : float, default 0.01
        Relative-frequency cut-off; categories with a frequency below it are
        replaced by the string 'Other'.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which the rare values of ``column`` are 'Other'.
    """
    category_freq = df[column].value_counts(normalize=True)
    rare_categories = category_freq[category_freq < threshold].index

    # Vectorised membership test instead of a per-row Python lambda.
    is_rare = df[column].isin(rare_categories)

    # Work on a copy so the caller's frame is never changed behind its back.
    result = df.copy()
    if is_rare.any():
        # Cast to object first so the 'Other' string can sit next to numeric codes.
        result[column] = df[column].astype(object).mask(is_rare, 'Other')
    return result

# Nominal columns whose rare levels are grouped ('Vehicle_Age' is intentionally not in this list).
categorical = ['Region_Code', 'Policy_Sales_Channel']
for nominal_col in categorical:
    train_df = group_rare_categories(train_df, nominal_col, threshold=0.01)

In [6]:
# Vehicle_Age is ordered, so each label is replaced by its rank (0 = newest bucket).
vehicle_age_mapping = {
    label: rank
    for rank, label in enumerate(['< 1 Year', '1-2 Year', '> 2 Years'])
}
train_df['Vehicle_Age'] = train_df['Vehicle_Age'].map(vehicle_age_mapping)

In [7]:
# Expand the remaining nominal columns into indicator columns; the first level of
# each column is dropped (drop_first=True).
train_df = pd.get_dummies(
    data=train_df,
    columns=categorical,
    drop_first=True,
)

# Show the resulting column index for a quick visual check.
print("Columns after one-hot encoding:", train_df.columns)

Columns after one-hot encoding: Index(['Gender', 'Age', 'Previously_Insured', 'Vehicle_Age', 'Vehicle_Damage',
       'Annual_Premium', 'Vintage', 'Response', 'Region_Code_3.0',
       'Region_Code_6.0', 'Region_Code_8.0', 'Region_Code_10.0',
       'Region_Code_11.0', 'Region_Code_14.0', 'Region_Code_15.0',
       'Region_Code_18.0', 'Region_Code_21.0', 'Region_Code_28.0',
       'Region_Code_29.0', 'Region_Code_30.0', 'Region_Code_33.0',
       'Region_Code_35.0', 'Region_Code_36.0', 'Region_Code_37.0',
       'Region_Code_41.0', 'Region_Code_45.0', 'Region_Code_46.0',
       'Region_Code_47.0', 'Region_Code_50.0', 'Region_Code_Other',
       'Policy_Sales_Channel_122.0', 'Policy_Sales_Channel_124.0',
       'Policy_Sales_Channel_152.0', 'Policy_Sales_Channel_154.0',
       'Policy_Sales_Channel_160.0', 'Policy_Sales_Channel_Other'],
      dtype='object')


In [8]:
# Feature engineering
def feature_engineering(df):
    """Add interaction, squared and ratio features to ``df`` and return it.

    The frame is modified in place: seven new columns are appended, all derived
    from 'Age', 'Vehicle_Age', 'Previously_Insured', 'Vehicle_Damage' and
    'Annual_Premium'. The same frame object is returned.
    """
    age = df['Age']
    vehicle_age = df['Vehicle_Age']
    previously_insured = df['Previously_Insured']
    vehicle_damage = df['Vehicle_Damage']

    # Every derived series depends only on the source columns above, so all of
    # them can be computed before any is attached to the frame.
    derived = {
        'Age_Vehicle_Age': age * vehicle_age,
        'Age_Previously_Insured': age * previously_insured,
        'Vehicle_Age_Damage': vehicle_age * vehicle_damage,
        'Previously_Insured_Damage': previously_insured * vehicle_damage,
        'Age_squared': age ** 2,
        'Vehicle_Age_squared': vehicle_age ** 2,
        # +1 in the denominator, as in the original formula
        'Annual_Premium_per_Age': df['Annual_Premium'] / (age + 1),
    }
    for feature_name, values in derived.items():
        df[feature_name] = values
    return df

# Apply feature engineering: adds the interaction, squared and ratio columns
# defined in feature_engineering to the training frame.
train_df = feature_engineering(train_df)

In [9]:
# Engineered columns that are standardised together with the original continuous ones.
engineered_numeric = [
    'Age_Vehicle_Age', 'Age_Previously_Insured', 'Vehicle_Age_Damage',
    'Previously_Insured_Damage', 'Age_squared', 'Vehicle_Age_squared',
    'Annual_Premium_per_Age'
]

# Append only the names that are not already listed. The previous unconditional
# `continuous_numeric = continuous_numeric + [...]` added a second copy of every
# name each time this cell was re-run, producing duplicate column selectors.
continuous_numeric = continuous_numeric + [
    name for name in engineered_numeric if name not in continuous_numeric
]

# Standardize the continuous variables.
# NOTE(review): the scaler is fitted on every row of train_df; if a validation
# split is taken from this frame afterwards, its statistics include those rows.
scaler = StandardScaler()
train_df[continuous_numeric] = scaler.fit_transform(train_df[continuous_numeric])

In [10]:
# Cluster the rows on the standardised continuous columns and keep the cluster id
# as an additional feature.
optimal_clusters = 4
kmeans = KMeans(random_state=42, n_clusters=optimal_clusters)
kmeans.fit(train_df[continuous_numeric])
clusters = kmeans.labels_  # label of each training row, as assigned during fit
train_df['Cluster'] = clusters

In [11]:
# 'Response' is the target; every other column is used as a feature.
y = train_df['Response']
X = train_df.drop(columns=['Response'])

In [12]:
# Hold out 20% of the rows for validation; stratifying on y keeps the class
# proportions the same in both parts, and the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [13]:
import os
import datetime as datetime
import optuna
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from optuna.samplers import TPESampler
from optuna.pruners import HyperbandPruner

# Define the objective function for Optuna
def objective(trial):
    """Optuna objective: validation ROC AUC of LightGBM for a sampled tree count.

    Only 'n_estimators' is searched (integer in [14500, 17000]); every other
    hyperparameter is fixed. The model is fitted on the notebook-level
    X_train / y_train and scored on X_val / y_val.

    NOTE(review): no intermediate value is reported through ``trial``, so a
    pruner attached to the study presumably never has anything to act on; confirm.

    Returns
    -------
    float
        ROC AUC of the positive-class probabilities on the validation split.
    """
    params = {
        # the single tuned hyperparameter
        'n_estimators': trial.suggest_int('n_estimators', 14500, 17000),
        # fixed settings
        'num_leaves': 14,
        'min_child_samples': 44,
        'learning_rate': 0.013082848414054271,
        'max_bin': 1024,
        'colsample_bytree': 0.7020907928739494,
        'reg_alpha': 2.8809013344332164,
        'reg_lambda': 0.501392057176914,
    }

    model = lgb.LGBMClassifier(verbose=-1, **params)
    model.fit(X_train, y_train)

    # Column 1 of predict_proba holds the probability of the positive class.
    positive_class_proba = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, positive_class_proba)

# Generate a unique filename for each run
timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
study_dir = r'C:\Users\paulo\OneDrive\Documents\Binary-Classification-of-Insurance-Cross-Selling\best_model'

# SQLite creates the database file but not its parent folders, so make sure the
# target directory exists before the storage URL is opened.
os.makedirs(study_dir, exist_ok=True)
sqlite_file_path = os.path.join(study_dir, f'optuna_study_{timestamp}.db')

# Create the Optuna study with the TPESampler and HyperbandPruner
sampler = optuna.samplers.TPESampler(seed=42)  # Use a fixed seed for reproducibility
pruner = optuna.pruners.HyperbandPruner()

study = optuna.create_study(
    storage=f'sqlite:///{sqlite_file_path}',
    study_name=f'my_study_{timestamp}',
    direction='maximize',
    sampler=sampler,
    pruner=pruner
)
study.optimize(objective, n_trials=5)  # number of trials to run in this session

# Print the best number of estimators
print(f'Best n_estimators: {study.best_params["n_estimators"]}')
print(f'Best validation ROC AUC Score: {study.best_value}')


[I 2024-07-17 18:21:07,669] A new study created in RDB with name: my_study_20240717_182107
[I 2024-07-17 18:53:58,958] Trial 0 finished with value: 0.8877450685274885 and parameters: {'n_estimators': 15436}. Best is trial 0 with value: 0.8877450685274885.
[W 2024-07-17 19:17:33,934] Trial 1 failed with parameters: {'n_estimators': 16877} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "c:\Users\paulo\anaconda3\envs\pytorch_env\Lib\site-packages\optuna\study\_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\paulo\AppData\Local\Temp\ipykernel_34372\2981652374.py", line 31, in objective
    model.fit(X_train, y_train)
  File "c:\Users\paulo\anaconda3\envs\pytorch_env\Lib\site-packages\lightgbm\sklearn.py", line 1187, in fit
    super().fit(
  File "c:\Users\paulo\anaconda3\envs\pytorch_env\Lib\site-packages\lightgbm\sklearn.py", line 885, in fit
    self._Booster = trai

KeyboardInterrupt: 

In [None]:

# Refit LightGBM with the tree count chosen by the Optuna study; every other
# hyperparameter is fixed.
best_n_estimators = study.best_params["n_estimators"]
final_params = dict(
    n_estimators=best_n_estimators,
    num_leaves=14,
    min_child_samples=44,
    learning_rate=0.013082848414054271,
    max_bin=1024,
    colsample_bytree=0.7020907928739494,
    reg_alpha=2.8809013344332164,
    reg_lambda=0.501392057176914,
    n_jobs=-1,
)

final_model = lgb.LGBMClassifier(verbose=1, **final_params)
final_model.fit(X_train, y_train)

# Score the training split (column 1 = positive-class probability).
y_train_pred_proba = final_model.predict_proba(X_train)[:, 1]
roc_auc_train = roc_auc_score(y_train, y_train_pred_proba)

# Score the validation split the same way.
y_val_pred_proba = final_model.predict_proba(X_val)[:, 1]
roc_auc_val = roc_auc_score(y_val, y_val_pred_proba)

# Report both scores; a large gap between them would indicate overfitting.
print(f'Training ROC AUC Score: {roc_auc_train}')
print(f'Validation ROC AUC Score: {roc_auc_val}')


[LightGBM] [Info] Number of positive: 882941, number of negative: 6419079
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.080253 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2719
[LightGBM] [Info] Number of data points in the train set: 7302020, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.120917 -> initscore=-1.983772
[LightGBM] [Info] Start training from score -1.983772
Training ROC AUC Score: 0.8889644160828131
Validation ROC AUC Score: 0.8877006285390077


In [None]:
# # LightGBM parameters
# params = {
#     'n_estimators': 5000,
#     'num_leaves': 14,
#     'min_child_samples': 44,
#     'learning_rate': 0.013082848414054271,
#     'max_bin': 1024,  # log_max_bin of 10 corresponds to 2^10 = 1024
#     'colsample_bytree': 0.7020907928739494,
#     'reg_alpha': 2.8809013344332164,
#     'reg_lambda': 0.501392057176914,
#     'n_jobs': -1,
# }

# # Initialize the LightGBM model
# model = lgb.LGBMClassifier(**params, verbose=1)

# # Train the model
# model.fit(X_train, y_train)


[LightGBM] [Info] Number of positive: 882941, number of negative: 6419079
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.072334 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2719
[LightGBM] [Info] Number of data points in the train set: 7302020, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.120917 -> initscore=-1.983772
[LightGBM] [Info] Start training from score -1.983772


KeyboardInterrupt: 

In [None]:
# # Make predictions
# y_train_pred_proba = model.predict_proba(X_train)[:, 1]
# y_val_pred_proba = model.predict_proba(X_val)[:, 1]

# # Calculate ROC AUC scores
# roc_auc_train = roc_auc_score(y_train, y_train_pred_proba)
# roc_auc_val = roc_auc_score(y_val, y_val_pred_proba)

# # Print ROC AUC scores
# print(f'Training ROC AUC Score: {roc_auc_train}')
# print(f'Validation ROC AUC Score: {roc_auc_val}')

Training ROC AUC Score: 0.8851918290980479
Validation ROC AUC Score: 0.885032790155052
