In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import klib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import optuna
from datetime import datetime
import os
from sklearn.model_selection import StratifiedKFold


In [2]:
# Load the datasets
# The folder holding train.csv / test.csv can be overridden with the
# KAGGLE_DATA_DIR environment variable so the notebook is not tied to a single
# machine; when the variable is unset the original location is used.
DATA_DIR = os.environ.get(
    "KAGGLE_DATA_DIR",
    r"C:\Users\paulo\OneDrive\Documents\kaggle_competition_2_datasets",
)
# Both files are indexed by their 'id' column; the test ids are reused later
# when the submission frame is assembled.
train_df = pd.read_csv(os.path.join(DATA_DIR, "train.csv"), index_col='id')
test_df = pd.read_csv(os.path.join(DATA_DIR, "test.csv"), index_col='id')

In [3]:
# Preprocess data
def preprocess_data(df):
    """Encode the categorical columns as integers and drop ``Driving_License``.

    The input frame is not modified; a new frame is returned, so the raw data
    stays available to the caller (same convention as ``feature_engineering``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns ``Gender``, ``Vehicle_Damage``,
        ``Vehicle_Age`` and ``Driving_License``.

    Returns
    -------
    pandas.DataFrame
        New frame without ``Driving_License`` in which ``Gender`` and
        ``Vehicle_Damage`` are 0/1 and ``Vehicle_Age`` is 0/1/2. Values that
        are not keys of the mappings below become NaN (``Series.map``
        behaviour).

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    # Transform binary variables
    binary_mappings = {
        'Gender': {'Male': 1, 'Female': 0},
        'Vehicle_Damage': {'Yes': 1, 'No': 0},
    }
    # Ordinal Encoding for Vehicle_Age
    vehicle_age_mapping = {'< 1 Year': 0, '1-2 Year': 1, '> 2 Years': 2}

    # Drop Driving_License due to limited variability. drop() hands back a new
    # frame, so the column assignments below never touch the caller's object.
    encoded = df.drop(columns=['Driving_License'])

    for column, mapping in binary_mappings.items():
        encoded[column] = df[column].map(mapping)
    encoded['Vehicle_Age'] = df['Vehicle_Age'].map(vehicle_age_mapping)

    return encoded

# Rebind both splits to their encoded versions. The same function (and thus
# the same value mappings) is applied to train and test, so the encoded
# columns mean the same thing in both frames.
train_df = preprocess_data(train_df)
test_df = preprocess_data(test_df)

In [4]:
# Cast two groups of columns to the default integer dtype, first on the
# training split and then on the test split, one group at a time.
_int_column_groups = (
    ['Gender', 'Vehicle_Age', 'Vehicle_Damage'],
    ['Region_Code', 'Annual_Premium', 'Policy_Sales_Channel'],
)
for _frame in (train_df, test_df):
    for _group in _int_column_groups:
        _frame[_group] = _frame[_group].astype('int')
# Drop the loop helpers so no extra reference to the frames lingers in the
# notebook namespace.
del _frame, _group, _int_column_groups

In [5]:
# Remove outliers from Annual_Premium in training data only
def remove_outliers_iqr(df, column, multiplier=1.5):
    """Return the rows of ``df`` whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - multiplier * IQR`` and ``Q3 + multiplier * IQR``
    (both inclusive), where Q1/Q3 are the 25th/75th percentiles of ``column``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    column : str
        Name of the numeric column the fences are computed on.
    multiplier : float, default 1.5
        Width of the fences in units of the inter-quartile range.

    Returns
    -------
    pandas.DataFrame
        The rows within the fences, with their original index labels. Rows
        whose value is NaN are not kept, because NaN compares false against
        both fences.

    Raises
    ------
    ValueError
        If ``multiplier`` is negative.
    KeyError
        If ``column`` is not present in ``df``.
    """
    if multiplier < 0:
        raise ValueError("multiplier must be non-negative")

    values = df[column]
    first_quartile, third_quartile = values.quantile([0.25, 0.75])
    spread = third_quartile - first_quartile
    lower_bound = first_quartile - multiplier * spread
    upper_bound = third_quartile + multiplier * spread

    # between() is inclusive on both ends by default.
    return df[values.between(lower_bound, upper_bound)]

# Only the training split is filtered; the test split keeps all of its rows,
# and its full index is used later as the 'id' column of the submission.
train_df = remove_outliers_iqr(train_df, 'Annual_Premium')

In [6]:
def _insured_combo_key(frame, column):
    """Return, per row, the string ``str(Previously_Insured) + str(column)``."""
    return frame['Previously_Insured'].astype(str) + frame[column].astype(str)


def feature_engineering(df, reference=None):
    """Add integer-coded ``Previously_Insured`` x feature interaction columns.

    For each of ``Annual_Premium``, ``Vehicle_Age``, ``Vehicle_Damage`` and
    ``Vintage`` a column ``Previously_Insured_<feature>`` is added. Its value
    is an integer code identifying the combination of ``Previously_Insured``
    and that feature.

    Codes are numbered by order of first appearance. Numbering each frame on
    its own therefore gives the same combination different codes in different
    frames, which makes the feature meaningless for a model trained on one
    frame and applied to another. Pass ``reference`` to reuse the numbering
    of another frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; it is copied, not modified.
    reference : pandas.DataFrame, optional
        Frame whose combinations define the numbering (typically the training
        split). It only needs ``Previously_Insured`` and the four feature
        columns. When omitted, ``df`` itself defines the numbering, which
        yields the same codes as ``pd.factorize`` on the combined key.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the four int64 code columns appended. Combinations
        that do not occur in ``reference`` are coded ``-1``.
    """
    df = df.copy()
    for column in ('Annual_Premium', 'Vehicle_Age', 'Vehicle_Damage', 'Vintage'):
        key = _insured_combo_key(df, column)
        ref_key = key if reference is None else _insured_combo_key(reference, column)
        # unique() keeps first-appearance order, matching pd.factorize.
        categories = ref_key.dropna().unique()
        codes = pd.Categorical(key, categories=categories).codes
        df[f'Previously_Insured_{column}'] = codes.astype(np.int64)

    return df

train_df = feature_engineering(train_df)
# Encode the test split with the numbering learned on the training split so
# that a given code denotes the same combination in both frames.
test_df = feature_engineering(test_df, reference=train_df)

In [7]:
def _narrowest_int_dtype(series):
    """Return the smallest signed int dtype holding ``series``, never a wider one; None if none fits."""
    low, high = series.min(), series.max()
    for candidate in (np.int8, np.int16, np.int32, np.int64):
        if np.dtype(candidate).itemsize > series.dtype.itemsize:
            break  # never trade the column for a wider type
        limits = np.iinfo(candidate)
        # For an empty column min/max are NaN and both comparisons are False.
        if low >= limits.min and high <= limits.max:
            return candidate
    return None


def _narrowest_float_dtype(series):
    """Return float16 if the values survive it unchanged, else float32 if in range, else None."""
    low, high = series.min(), series.max()
    for candidate in (np.float16, np.float32):
        if np.dtype(candidate).itemsize > series.dtype.itemsize:
            break
        limits = np.finfo(candidate)
        if not (low >= limits.min and high <= limits.max):
            continue
        if candidate is np.float16:
            # Fitting the float16 *range* is not enough: half precision keeps
            # only ~3 significant digits (2049.0 would become 2048.0), so it
            # is used only when a round trip reproduces every value.
            round_trip = series.astype(candidate).astype(series.dtype)
            if not round_trip.equals(series):
                continue
        return candidate
    return None


def optimize_dtypes(df):
    """Return a copy of ``df`` whose numeric columns use narrower dtypes where possible.

    Integer columns (signed or unsigned NumPy dtypes) are converted to the
    smallest signed integer type that holds their min and max. Float columns
    become float16 only if that is lossless, otherwise float32 when their
    range fits. A column is never converted to a wider type, and non-numeric
    or extension-dtype columns are returned unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns, index and values.
    """
    targets = {}
    for col in df.columns:
        series = df[col]
        col_type = series.dtype
        if not isinstance(col_type, np.dtype):
            continue  # pandas extension dtypes are left alone

        if np.issubdtype(col_type, np.integer):
            target = _narrowest_int_dtype(series)
        elif np.issubdtype(col_type, np.floating):
            target = _narrowest_float_dtype(series)
        else:
            continue

        if target is not None and np.dtype(target) != col_type:
            targets[col] = target

    return df.astype(targets) if targets else df.copy()

# Shrink the numeric column widths of both splits. Each frame is inspected on
# its own min/max, so the same column may end up with a different dtype in
# train and in test.
train_df = optimize_dtypes(train_df)
test_df = optimize_dtypes(test_df)


In [8]:
# Split the training frame into the label column and everything else.
y = train_df['Response']
X = train_df.drop(columns=['Response'])

In [14]:
# LightGBM training parameters, passed unchanged to lgb.train below.
# NOTE(review): the long decimal values look like the output of a
# hyper-parameter search (optuna is imported), but the search is not shown in
# this notebook - confirm their origin before changing them.
# NOTE(review): LightGBM documents that row subsampling only takes effect when
# a bagging frequency (`subsample_freq` / `bagging_freq`) is non-zero; none is
# set here, so `subsample` may currently be inert - confirm.
param = dict(
    # task definition
    objective='binary',
    metric='auc',
    # regularisation
    reg_alpha=0.03432385172267505,
    reg_lambda=0.2998279059616829,
    # sampling
    colsample_bytree=0.790292183596673,
    subsample=0.9046878168822107,
    # boosting / tree shape
    learning_rate=0.05035039561309864,
    max_depth=29,
    num_leaves=1474,
    min_child_samples=75,
    min_child_weight=7.661448090878849,
    min_split_gain=0.09978597066868167,
    max_bin=499,
    # class weighting and parallelism
    scale_pos_weight=9.870717062897523,
    n_jobs=8,
)

In [15]:
# Fit the final booster on all training rows (no validation set is supplied).
# The scaler is kept in `scaler` because the test features are transformed
# with the same fitted statistics further down.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

train_data = lgb.Dataset(data=X_scaled, label=y)
bst = lgb.train(params=param, train_set=train_data, num_boost_round=1000)

[LightGBM] [Info] Number of positive: 1103676, number of negative: 8023849
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.079596 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2027
[LightGBM] [Info] Number of data points in the train set: 9127525, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.120917 -> initscore=-1.983772
[LightGBM] [Info] Start training from score -1.983772


In [16]:
# Score the test split with the scaler fitted on the training features and
# the trained booster, then write one row per test id.
test_df_scaled = scaler.transform(test_df)
test_predictions = bst.predict(test_df_scaled)

submission = pd.DataFrame({'id': test_df.index}).assign(Response=test_predictions)
submission.to_csv("submission.csv", index=False)

print("Submission file created successfully.")

Submission file created successfully.
