In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import klib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
from sklearn.cluster import KMeans
import optuna
from datetime import datetime
import os
import logging
import lightgbm as lgb
from sklearn.metrics import roc_auc_score

In [None]:
# Resolve the data folder from the KAGGLE_DATA_DIR environment variable so the
# notebook is not tied to one machine; the original local folder stays the default.
DATA_DIR = os.environ.get(
    "KAGGLE_DATA_DIR",
    r"C:\Users\paulo\OneDrive\Documents\kaggle_competition_2_datasets",
)
# The 'id' column becomes the row index instead of a feature column.
train_df = pd.read_csv(os.path.join(DATA_DIR, "train.csv"), index_col='id')

In [None]:
# Separate features and target variable
X = train_df.drop('Response', axis=1)
y = train_df['Response']

# Fraction of the dataset kept for this run (40%)
sample_size = 0.4

# Stratified sampling keeps the Response class ratio of the full data; the fixed
# random_state makes the drawn subset identical on every Restart & Run All.
X_sample, _, y_sample, _ = train_test_split(
    X, y, train_size=sample_size, stratify=y, random_state=42
)

# Combine sampled features and target variable
train_df = pd.concat([X_sample, y_sample], axis=1)

In [None]:
# Preprocess data
def preprocess_data(df):
    """Encode the categorical columns and rescale Age/Vintage, modifying ``df`` in place.

    Steps, in order:
      * ``Gender``: 'Male' -> 1, 'Female' -> 0.
      * ``Vehicle_Damage``: 'Yes' -> 1, 'No' -> 0.
      * ``Vehicle_Age``: '< 1 Year' -> 0, '1-2 Year' -> 1, '> 2 Years' -> 2.
      * ``Driving_License`` is dropped.
      * ``Age`` and ``Vintage`` are min-max scaled to [0, 1] using the minimum and
        maximum of the frame that is passed in.

    Values that are not keys of a mapping become NaN (``Series.map`` behaviour).

    Args:
        df: DataFrame holding the columns listed above. It is mutated.

    Returns:
        The same ``df`` object, after modification.
    """
    # Transform binary variables
    df['Gender'] = df['Gender'].map({'Male': 1, 'Female': 0})
    df['Vehicle_Damage'] = df['Vehicle_Damage'].map({'Yes': 1, 'No': 0})

    # Ordinal Encoding for Vehicle_Age
    vehicle_age_mapping = {'< 1 Year': 0, '1-2 Year': 1, '> 2 Years': 2}
    df['Vehicle_Age'] = df['Vehicle_Age'].map(vehicle_age_mapping)

    # Drop Driving_License due to limited variability
    df.drop(['Driving_License'], axis=1, inplace=True)

    # Min-Max Scaling for Age and Vintage
    for col in ('Age', 'Vintage'):
        col_min, col_max = df[col].min(), df[col].max()
        value_range = col_max - col_min
        if value_range == 0:
            # A constant column would otherwise be 0/0 -> NaN for every row.
            df[col] = 0.0
        else:
            df[col] = (df[col] - col_min) / value_range

    return df

# preprocess_data edits the frame in place and returns it; rebinding `train_df`
# keeps one name for the data instead of introducing a `train_data` alias that is
# later reused for the LightGBM Dataset.
train_df = preprocess_data(train_df)

In [None]:
# Store the three encoded columns with an integer dtype
encoded_cols = ['Gender', 'Vehicle_Age', 'Vehicle_Damage']
train_df[encoded_cols] = train_df[encoded_cols].astype('int')

In [None]:
# Remove outliers from Annual_Premium in training data only
def remove_outliers_iqr(df, column, factor=1.5):
    """Return the rows of ``df`` whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR`` (both inclusive),
    where Q1/Q3 are the 25th/75th percentiles of ``df[column]``.

    Args:
        df: Input DataFrame; it is not modified.
        column: Name of the numeric column to filter on.
        factor: Fence multiplier applied to the IQR. Defaults to 1.5.

    Returns:
        A new, independent DataFrame with the original index labels of the kept
        rows. It is an explicit copy so that later in-place edits of the result
        (e.g. ``.loc`` assignments) do not raise SettingWithCopyWarning.
    """
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr

    # between() is inclusive on both ends by default, matching >= / <=.
    keep = df[column].between(lower_bound, upper_bound)
    return df.loc[keep].copy()

# Keep only the rows whose Annual_Premium falls inside the IQR fences
train_df = train_df.pipe(remove_outliers_iqr, 'Annual_Premium')

In [None]:
def group_rare_categories(df, column, threshold=0.01, rare_label=88):
    """Replace infrequent values of ``column`` with a single bucket value, in place.

    A value is considered rare when its share of the rows in ``df`` is strictly
    below ``threshold``.

    Args:
        df: DataFrame to modify. It is mutated.
        column: Name of the column whose rare values are merged.
        threshold: Relative-frequency cut-off (0.01 means 1% of rows).
        rare_label: Value written in place of every rare value. Defaults to 88.
            NOTE(review): if ``rare_label`` is also a genuine value of ``column``,
            the rare rows become indistinguishable from it — pick a value that
            does not occur in the data when that matters.

    Returns:
        The same ``df`` object, after modification.
    """
    category_freq = df[column].value_counts(normalize=True)
    rare_categories = category_freq[category_freq < threshold].index

    # Use .loc to avoid SettingWithCopyWarning
    df.loc[df[column].isin(rare_categories), column] = rare_label

    return df

# Merge values that make up less than 1% of the rows in each listed column
categorical = ['Region_Code', 'Policy_Sales_Channel']
for rare_col in categorical:
    train_df = group_rare_categories(train_df, column=rare_col, threshold=0.01)


In [None]:
def feature_engineering(df):
    """Return a copy of ``df`` extended with interaction, squared and ratio columns.

    The input frame is left untouched. Seven columns are appended, in this order:
    ``Age_Vehicle_Age``, ``Age_Previously_Insured``, ``Vehicle_Age_Damage``,
    ``Previously_Insured_Damage``, ``Age_squared``, ``Vehicle_Age_squared`` and
    ``Annual_Premium_per_Age`` (premium divided by ``Age + 1``).
    """
    age = df['Age']
    vehicle_age = df['Vehicle_Age']
    previously_insured = df['Previously_Insured']
    vehicle_damage = df['Vehicle_Damage']

    # assign() works on a copy, so the caller's frame is never modified.
    return df.assign(
        Age_Vehicle_Age=age * vehicle_age,
        Age_Previously_Insured=age * previously_insured,
        Vehicle_Age_Damage=vehicle_age * vehicle_damage,
        Previously_Insured_Damage=previously_insured * vehicle_damage,
        Age_squared=age ** 2,
        Vehicle_Age_squared=vehicle_age ** 2,
        Annual_Premium_per_Age=df['Annual_Premium'] / (age + 1),
    )

# Append the engineered columns (feature_engineering returns a new frame)
train_df = train_df.pipe(feature_engineering)

In [None]:
# Apply KMeans clustering
optimal_clusters = 4
# Fit on feature columns only: clustering on a frame that still contains 'Response'
# leaks the target into the 'Cluster' feature the classifier is trained on.
# 'Cluster' itself is excluded as well so re-running this cell gives the same result.
cluster_features = train_df.drop(columns=['Response', 'Cluster'], errors='ignore')
train_df['Cluster'] = KMeans(n_clusters=optimal_clusters, random_state=42).fit_predict(cluster_features)

In [None]:
def optimize_dtypes(df):
    """Downcast numeric columns of ``df`` to smaller dtypes, modifying it in place.

    * Columns whose dtype name contains ``'int'`` are cast to the first of
      int8 / int16 / int32 / int64 whose range holds the column's min and max.
    * Columns whose dtype name contains ``'float'`` are cast to float32 when the
      min and max fit its range, otherwise to float64. float16 is deliberately not
      used: a range check alone says nothing about precision, and float16 keeps
      only about 3 significant decimal digits, which corrupts premium-sized values.
    * Object columns and every other dtype are left as they are.

    Columns for which no candidate range matches (for example an empty column,
    whose min/max are NaN) are left unchanged.

    Args:
        df: DataFrame to optimise. It is mutated.

    Returns:
        The same ``df`` object, after modification.
    """
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float32, np.float64)

    for col in df.columns:
        col_type = df[col].dtype
        if col_type == object:
            continue

        type_name = str(col_type)
        if 'int' in type_name:
            candidates, limits_of = int_candidates, np.iinfo
        elif 'float' in type_name:
            candidates, limits_of = float_candidates, np.finfo
        else:
            continue

        min_val, max_val = df[col].min(), df[col].max()
        for candidate in candidates:
            limits = limits_of(candidate)
            if min_val >= limits.min and max_val <= limits.max:
                df[col] = df[col].astype(candidate)
                break

    return df

# Shrink numeric column dtypes to cut the frame's memory footprint
train_df = train_df.pipe(optimize_dtypes)

In [None]:
# Print a per-column summary (dtype, non-null count) and the memory usage of train_df
train_df.info()

In [None]:
# Split the prepared frame into the feature matrix and the target column
target_col = 'Response'
y = train_df[target_col]
X = train_df.drop(columns=[target_col])

In [None]:
scaler = StandardScaler()
# Store the standardized features under their own name. Assigning the returned
# NumPy array to `train_df` would replace the prepared DataFrame, so re-running
# the feature/target separation cell would fail.
# NOTE(review): the train/validation split uses the unscaled `X`, so `X_scaled`
# is currently not fed to the model. If a scale-sensitive model is added, fit the
# scaler on the training split only to avoid leaking validation statistics.
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

In [None]:
# Hold out 20% of the rows for validation, stratified on the target
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# LightGBM training parameters for the binary Response classifier.
param = {
    'objective': 'binary',
    'metric': 'auc',
    'lambda_l1': 0.02,
    'lambda_l2': 0.1,
    'colsample_bytree': 0.75,
    # NOTE(review): LightGBM documents that row subsampling only takes effect when
    # the bagging frequency (`subsample_freq` / `bagging_freq`) is non-zero; none is
    # set here, so this value is presumably ignored — confirm before relying on it.
    'subsample': 0.85,
    'learning_rate':  0.09,
    'max_depth': 32,
    'num_leaves': 900,
    'min_child_samples': 50,
    'min_child_weight': 9.0,
    'min_split_gain': 0.09,
    'max_bin': 420,
    'scale_pos_weight': 10,
    # Stop when the validation AUC has not improved for 100 rounds.
    'early_stopping_round': 100,
    'n_jobs': -1,
    }

# Upper bound on boosting rounds; early stopping normally ends training sooner.
num_boost_round = 20000

# Create LightGBM dataset
train_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

# Train model
bst = lgb.train(
    param,
    train_data,
    valid_sets=[valid_data],
    num_boost_round=num_boost_round,
    )

# Make predictions once per split, using the best iteration found by early stopping
y_train_pred_proba = bst.predict(X_train, num_iteration=bst.best_iteration)
y_val_pred_proba = bst.predict(X_val, num_iteration=bst.best_iteration)

# Calculate ROC AUC scores
roc_auc_train = roc_auc_score(y_train, y_train_pred_proba)
roc_auc_val = roc_auc_score(y_val, y_val_pred_proba)

# Earlier names for the validation predictions and score, kept as aliases so the
# expensive validation prediction is not computed a second time.
y_val_pred = y_val_pred_proba
auc = roc_auc_val

# Print ROC AUC scores
print(f'Training ROC AUC Score: {roc_auc_train}')
print(f'Validation ROC AUC Score: {roc_auc_val}')

In [None]:
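# NOTE: Optuna hyperparameter search, currently disabled. Uncomment every line of this cell to run the study.
# import optuna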
# import lightgbm as lgb
# from sklearn.metrics import roc_auc_score
# from sklearn.model_selection import train_test_split
# from datetime import datetime
# import os

# def objective(trial):
#     param = {
#         'objective': 'binary',
#         'metric': 'auc',
#         'lambda_l1': trial.suggest_float('reg_alpha', 0.01, 0.1, log=True),  
#         'lambda_l2': trial.suggest_float('reg_lambda', 0.1, 0.3, log=True),  
#         'colsample_bytree': trial.suggest_float('colsample_bytree', 0.75, 0.85), 
#         'subsample': trial.suggest_float('subsample', 0.85, 1.0), 
#         'learning_rate': trial.suggest_float('learning_rate', 0.05, 0.1),  
#         'max_depth': trial.suggest_int('max_depth', 16, 32), 
#         'num_leaves': trial.suggest_int('num_leaves', 700, 1500), 
#         'min_child_samples': trial.suggest_int('min_child_samples', 20, 80),  
#         'min_child_weight': trial.suggest_float('min_child_weight', 4.0, 10.0),  
#         'min_split_gain': trial.suggest_float('min_split_gain', 0.05, 0.1),  
#         'max_bin': trial.suggest_int('max_bin', 400, 500),
#         'verbose': -1,  
#         'scale_pos_weight': trial.suggest_float('scale_pos_weight', 9, 12),
#         'early_stopping_round': 100,
#     }

#     num_boost_round = trial.suggest_int('num_boost_round', 10000, 20000)

#     # Create LightGBM dataset
#     train_data = lgb.Dataset(X_train, label=y_train)
#     valid_data = lgb.Dataset(X_val, label=y_val, reference=train_data)
    
#     # Train model
#     bst = lgb.train(
#         param,
#         train_data, 
#         num_boost_round=num_boost_round,
#         valid_sets=[valid_data],
#     )
    
#     # Predict and evaluate
#     y_val_pred = bst.predict(X_val, num_iteration=bst.best_iteration)
#     auc = roc_auc_score(y_val, y_val_pred)
#     return auc

# # Generate a unique filename for each run
# timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
# sqlite_file_path = os.path.join(r'C:\Users\paulo\OneDrive\Documents\Binary-Classification-of-Insurance-Cross-Selling', f'optuna_study_{timestamp}.db')

# # Create a study and optimize
# study = optuna.create_study(storage=f'sqlite:///{sqlite_file_path}', study_name=f'my_study_{timestamp}', direction='maximize')
# study.optimize(objective, n_trials=10)

# # Get the best trial
# best_trial = study.best_trial
# print(f'Best trial score: {best_trial.value}')
# print(f'Best trial params: {best_trial.params}')

