In [2]:
import os

import klib
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [3]:
# Location of the training CSV. The default is the original author's local
# file; set the TRAIN_CSV environment variable to point at another copy so the
# notebook can run on a different machine without editing this cell.
TRAIN_CSV = os.environ.get(
    "TRAIN_CSV",
    r"C:\Users\paulo\OneDrive\Documents\kaggle_competition_2_datasets\train.csv",
)

# The 'id' column becomes the row index rather than a regular column.
data = pd.read_csv(TRAIN_CSV, index_col='id')

In [4]:
# Encode the two-valued string columns as 0/1 codes.
# Any value missing from a mapping becomes NaN (behaviour of Series.map).
binary_encodings = {
    'Gender': {'Male': 1, 'Female': 0},
    'Vehicle_Damage': {'Yes': 1, 'No': 0},
}
for binary_column, encoding in binary_encodings.items():
    data[binary_column] = data[binary_column].map(encoding)

# Vehicle_Age is ordered, so map it to increasing integer ranks
vehicle_age_mapping = {'< 1 Year': 0, '1-2 Year': 1, '> 2 Years': 2}
data['Vehicle_Age'] = data['Vehicle_Age'].map(vehicle_age_mapping)

# Driving_License is discarded (limited variability)
data = data.drop(columns=['Driving_License'])

# Store the encoded columns and the target with the pandas 'category' dtype
categorical_columns = ['Gender', 'Vehicle_Damage', 'Vehicle_Age', 'Response']
data[categorical_columns] = data[categorical_columns].astype('category')


In [5]:
def remove_outliers_iqr(df, column, multiplier=1.5):
    """Return a copy of ``df`` without the IQR outliers of ``column``.

    A row is kept when its value in ``column`` lies inside the inclusive range
    ``[Q1 - multiplier * IQR, Q3 + multiplier * IQR]``, where Q1 and Q3 are the
    25th and 75th percentiles of ``df[column]`` and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    column : label
        Name of the numeric column whose distribution defines the fences.
    multiplier : float, default 1.5
        How many IQRs beyond the quartiles a value may lie before it is
        treated as an outlier. The default reproduces the usual 1.5 * IQR rule.

    Returns
    -------
    pandas.DataFrame
        A new frame (a copy) holding only the rows inside the fences. Rows
        whose value in ``column`` is missing are dropped as well, because NaN
        never satisfies the range check.

    Raises
    ------
    ValueError
        If ``multiplier`` is negative.
    """
    if multiplier < 0:
        raise ValueError("multiplier must be non-negative")

    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - multiplier * iqr
    upper_bound = q3 + multiplier * iqr

    # between() is inclusive on both ends by default, matching >= / <=
    inside_fences = values.between(lower_bound, upper_bound)
    return df[inside_fences].copy()

# Keep only the rows whose Annual_Premium lies within the IQR-based bounds
data = remove_outliers_iqr(df=data, column='Annual_Premium')


In [6]:
# Rescale Age and Vintage to the [0, 1] range: (x - min) / (max - min).
# The results are stored as Age_MinMax and Vintage_MinMax.
for source_column in ('Age', 'Vintage'):
    column_min = data[source_column].min()
    column_max = data[source_column].max()
    data[f'{source_column}_MinMax'] = (data[source_column] - column_min) / (column_max - column_min)

# The unscaled originals are no longer needed by the model
data = data.drop(columns=['Age', 'Vintage'])

In [7]:
# Columns to standardise
numerical_features = ['Annual_Premium', 'Age_MinMax', 'Vintage_MinMax']

# Learn the scaling statistics from the numerical columns, then apply them.
# NOTE(review): the scaler is fitted on every row before the train/validation
# split, so validation rows contribute to the scaling statistics.
scaler = StandardScaler()
scaler.fit(data[numerical_features])
data[numerical_features] = scaler.transform(data[numerical_features])

In [8]:
# Run klib's automated cleaning pass over the frame and keep the result.
# The captured log for this run reports 98 duplicate rows dropped and no
# remaining NAs.
# NOTE(review): later cells refer to the target as 'response' (lower case),
# so this step presumably also normalises column names — confirm against the
# klib documentation.
data = klib.data_cleaning(data)

Shape of cleaned data: (9127427, 10) - Remaining NAs: 0


Dropped rows: 98
     of which 98 duplicates. (Rows (first 150 shown): [433434, 839101, 1006519, 1365283, 1978892, 2006024, 2110307, 2302603, 2756753, 3112366, 3263068, 3583603, 3658204, 3757094, 4051657, 4069226, 4132546, 4183892, 4301485, 4486942, 4674845, 4693332, 4794307, 4925570, 5439925, 5490635, 5518941, 5920900, 5956056, 6161230, 6216481, 6303698, 6338755, 6460662, 6763566, 6791498, 6899806, 7271540, 7364821, 7494791, 7497775, 7502808, 7507806, 7623896, 7628432, 7639654, 7713039, 7727986, 7728618, 7741262, 7824114, 7826399, 7926908, 7984162, 8090139, 8311705, 8502791, 8541787, 8829656, 8886924, 9088295, 9350643, 9550005, 9616290, 9840964, 9860117, 9929138, 9954342, 9969977, 10119491, 10141505, 10339073, 10371360, 10384902, 10385979, 10582789, 10582882, 10638022, 10653241, 10679632, 10738253, 10741384, 10964704, 10969520, 10993665, 11083728, 11091415, 11097600, 11125885, 11186296, 11205897, 11276012, 11311033, 11331601, 1

In [9]:
# Target: integer category codes of the 'response' column
target_column = 'response'
y = data[target_column].cat.codes

# Features: every remaining column
X = data.drop(columns=[target_column])

In [10]:
# Hold out 20% of the rows for validation; stratifying on y keeps the class
# proportions the same in both parts, and the fixed seed makes the split
# repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [11]:
# Hyperparameters passed to the LightGBM classifier
params = dict(
    n_estimators=5000,
    num_leaves=14,
    min_child_samples=44,
    learning_rate=0.013082848414054271,
    max_bin=1024,
    colsample_bytree=0.7020907928739494,
    reg_alpha=2.8809013344332164,
    reg_lambda=0.501392057176914,
    n_jobs=-1,
)

# Build the classifier (verbose=1 turns on LightGBM's info logging)
model = lgb.LGBMClassifier(verbose=1, **params)

# Fit on the training split only
model.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 882934, number of negative: 6419007
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.042921 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1547
[LightGBM] [Info] Number of data points in the train set: 7301941, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.120918 -> initscore=-1.983768
[LightGBM] [Info] Start training from score -1.983768


In [12]:
# Predicted probability of class index 1 for the training and validation splits
y_train_pred_proba, y_val_pred_proba = (
    model.predict_proba(features)[:, 1] for features in (X_train, X_val)
)

# ROC AUC on each split
roc_auc_train = roc_auc_score(y_train, y_train_pred_proba)
roc_auc_val = roc_auc_score(y_val, y_val_pred_proba)

# Report both scores
print(f'Training ROC AUC Score: {roc_auc_train}')
print(f'Validation ROC AUC Score: {roc_auc_val}')

Training ROC AUC Score: 0.8872325863953647
Validation ROC AUC Score: 0.8868501314820557
