In [1]:
import os
from pathlib import Path

import klib
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


In [2]:
# Directory holding the competition files. A hard-coded, user-specific absolute
# path only works on one machine, so the location can be overridden through the
# KAGGLE_DATA_DIR environment variable; the default is the original location,
# which keeps the notebook working unchanged where it was written.
DATA_DIR = Path(
    os.environ.get(
        "KAGGLE_DATA_DIR",
        r"C:\Users\paulo\OneDrive\Documents\kaggle_competition_2_datasets",
    )
)
TRAIN_CSV = DATA_DIR / "train.csv"

# Load the dataset, using the 'id' column as the row index
data = pd.read_csv(TRAIN_CSV, index_col='id')

# Separate features and target variable
X = data.drop('Response', axis=1)
y = data['Response']

# Fraction of the rows to keep (10% of the dataset)
sample_size = 0.1

# Stratified sampling: stratify=y keeps the class proportions of 'Response' in
# the sample; the complementary 90% returned by the split is discarded.
X_sample, _, y_sample, _ = train_test_split(X, y, train_size=sample_size, stratify=y, random_state=42)

# Combine sampled features and target variable back into a single frame
data_sampled = pd.concat([X_sample, y_sample], axis=1)


In [3]:
# Encode the two-valued string columns as 0/1 via explicit lookup tables
binary_encodings = {
    'Gender': {'Male': 1, 'Female': 0},
    'Vehicle_Damage': {'Yes': 1, 'No': 0},
}
for binary_column, lookup in binary_encodings.items():
    data_sampled[binary_column] = data_sampled[binary_column].map(lookup)

# Vehicle_Age is ordered, so it gets increasing integer ranks
vehicle_age_mapping = {'< 1 Year': 0, '1-2 Year': 1, '> 2 Years': 2}
data_sampled['Vehicle_Age'] = data_sampled['Vehicle_Age'].map(vehicle_age_mapping)

# Driving_License is removed (limited variability, per the original analysis)
data_sampled = data_sampled.drop(columns=['Driving_License'])

# Store the encoded columns and the target with pandas' 'category' dtype
categorical_columns = ['Gender', 'Vehicle_Damage', 'Vehicle_Age', 'Response']
data_sampled[categorical_columns] = data_sampled[categorical_columns].astype('category')

def remove_outliers_iqr(df, column, factor=1.5):
    """Return a copy of ``df`` without the rows that are IQR outliers in ``column``.

    A row is kept when its value lies inside the inclusive interval
    ``[Q1 - factor * IQR, Q3 + factor * IQR]``, where Q1 and Q3 are the 25th
    and 75th percentiles of ``df[column]`` and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    column : label
        Name of the numeric column the fences are computed on.
    factor : float, default 1.5
        Multiplier applied to the IQR to position the fences. The default
        reproduces the classic 1.5 * IQR rule.

    Returns
    -------
    pandas.DataFrame
        An independent copy containing only the rows inside the fences, with
        their original index labels. Rows whose value in ``column`` is missing
        are dropped as well, because NaN never satisfies the range check.

    Raises
    ------
    ValueError
        If ``factor`` is negative.
    KeyError
        If ``column`` is not a column of ``df``.
    """
    if factor < 0:
        raise ValueError(f"factor must be non-negative, got {factor!r}")

    values = df[column]
    q1, q3 = values.quantile([0.25, 0.75])
    iqr = q3 - q1

    # between() is inclusive on both ends and evaluates to False for NaN.
    within_fences = values.between(q1 - factor * iqr, q3 + factor * iqr)

    # .copy() so later column assignments on the result never touch ``df``.
    return df.loc[within_fences].copy()

# Discard rows whose Annual_Premium falls outside the IQR fences
data_sampled = remove_outliers_iqr(data_sampled, 'Annual_Premium')


def _rescale_to_unit_range(series):
    """Min-max scale ``series``: subtract its minimum and divide by (max - min)."""
    low, high = series.min(), series.max()
    return (series - low) / (high - low)


# Add min-max scaled versions of Age and Vintage, then retire the raw columns
# so that only the scaled variants reach the model.
data_sampled = data_sampled.assign(
    Age_MinMax=_rescale_to_unit_range(data_sampled['Age']),
    Vintage_MinMax=_rescale_to_unit_range(data_sampled['Vintage']),
).drop(columns=['Age', 'Vintage'])

# Numerical columns that get standardised
numerical_features = ['Annual_Premium', 'Age_MinMax', 'Vintage_MinMax']

# NOTE(review): the scaler is fitted on the whole sample, i.e. before the
# train/validation split performed in a later cell, so validation rows
# contribute to the scaling statistics - confirm this is acceptable.
scaler = StandardScaler()
standardized_values = scaler.fit_transform(data_sampled[numerical_features])
data_sampled[numerical_features] = standardized_values

# klib's generic cleaning pass; from here on the target column is addressed by
# the lowercase name 'response'.
data_sampled = klib.data_cleaning(data_sampled)

# Target as integer category codes, features as everything except the target
y_sampled = data_sampled['response'].cat.codes
X_sampled = data_sampled.drop(columns='response')


Shape of cleaned data: (911974, 10) - Remaining NAs: 0


Dropped rows: 1
     of which 1 duplicates. (Rows (first 150 shown): [3685020])

Dropped columns: 0
     of which 0 single valued.     Columns: []
Dropped missing values: 0
Reduced memory by at least: 62.69 MB (-74.25%)



In [4]:
# Hold out 20% of the sampled rows for validation; stratifying on the target
# keeps the class ratio the same in both parts, and the fixed seed makes the
# partition repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X_sampled,
    y_sampled,
    test_size=0.2,
    stratify=y_sampled,
    random_state=42,
)


In [5]:
# Hyperparameters handed to the LightGBM classifier
params = dict(
    n_estimators=5000,
    num_leaves=14,
    min_child_samples=44,
    learning_rate=0.013082848414054271,
    max_bin=1024,
    colsample_bytree=0.7020907928739494,
    reg_alpha=2.8809013344332164,
    reg_lambda=0.501392057176914,
    n_jobs=-1,
)

# Build the classifier; verbose=1 is passed on top of the shared params
model = lgb.LGBMClassifier(verbose=1, **params)

# Fit on the training split (the fitted estimator is the cell's output)
model.fit(X_train, y_train)


[LightGBM] [Info] Number of positive: 88243, number of negative: 641336
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004775 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1544
[LightGBM] [Info] Number of data points in the train set: 729579, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.120951 -> initscore=-1.983459
[LightGBM] [Info] Start training from score -1.983459


In [6]:
# Predicted probability of the positive class (column 1) for each split
y_train_pred_proba, y_val_pred_proba = (
    model.predict_proba(split_features)[:, 1]
    for split_features in (X_train, X_val)
)

# Area under the ROC curve on the training and validation splits
roc_auc_train = roc_auc_score(y_train, y_train_pred_proba)
roc_auc_val = roc_auc_score(y_val, y_val_pred_proba)

# Report both scores, one per line
print(
    f'Training ROC AUC Score: {roc_auc_train}',
    f'Validation ROC AUC Score: {roc_auc_val}',
    sep='\n',
)


Training ROC AUC Score: 0.8904216971404152
Validation ROC AUC Score: 0.8846986856632532
