### Objective: To predict which customers respond positively to an automobile insurance offer

#### Step 1: Exploratory Data Analysis

In [None]:
# Mount Google Drive so the CSV files stored under MyDrive can be read
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Importing Libraries
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

from sklearn.model_selection import train_test_split

In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score

In [None]:
import lightgbm as lgb
from lightgbm import early_stopping

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [None]:
# Load the labelled training data from the mounted Drive folder and preview it
train_csv_path = "/content/drive/MyDrive/Kaggle - Insurance Classification - Data/train.csv"
train_df = pd.read_csv(train_csv_path)
train_df.head(5)

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,0,Male,21,1,35.0,0,1-2 Year,Yes,65101.0,124.0,187,0
1,1,Male,43,1,28.0,0,> 2 Years,Yes,58911.0,26.0,288,1
2,2,Female,25,1,14.0,1,< 1 Year,No,38043.0,152.0,254,0
3,3,Female,35,1,1.0,0,1-2 Year,Yes,2630.0,156.0,76,0
4,4,Female,36,1,15.0,1,1-2 Year,No,31951.0,152.0,294,0


In [None]:
# Load the test data (same folder as the training file) and preview it
test_csv_path = "/content/drive/MyDrive/Kaggle - Insurance Classification - Data/test.csv"
test_df = pd.read_csv(test_csv_path)
test_df.head(5)

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage
0,11504798,Female,20,1,47.0,0,< 1 Year,No,2630.0,160.0,228
1,11504799,Male,47,1,28.0,0,1-2 Year,Yes,37483.0,124.0,123
2,11504800,Male,47,1,43.0,0,1-2 Year,Yes,2630.0,26.0,271
3,11504801,Female,22,1,47.0,1,< 1 Year,No,24502.0,152.0,115
4,11504802,Male,51,1,19.0,0,1-2 Year,No,34115.0,124.0,148


In [None]:
# Collect the categorical columns of the training frame.
# Start with every object-dtype column ...
categorical_columns = list(train_df.select_dtypes(include=['object']).columns)

# ... then add any remaining column with fewer than `threshold` distinct values.
threshold = 20  # cardinality cutoff; adjust to the dataset
categorical_columns += [
    name
    for name in train_df.columns
    if name not in categorical_columns and train_df[name].nunique() < threshold
]

print("Categorical Columns:", categorical_columns)

Categorical Columns: ['Gender', 'Vehicle_Age', 'Vehicle_Damage', 'Driving_License', 'Previously_Insured', 'Response']


In [None]:
# 'Response' is the prediction target, not a feature: drop it from the categorical column list
categorical_columns = [name for name in categorical_columns if name != 'Response']
print("Categorical Columns:", categorical_columns)

Categorical Columns: ['Gender', 'Vehicle_Age', 'Vehicle_Damage', 'Driving_License', 'Previously_Insured']


In [None]:
# Store the 'Response' target as an integer column
response_as_int = train_df["Response"].astype(int)
train_df["Response"] = response_as_int

In [None]:
# Cast every categorical feature to pandas 'category' dtype in both frames
for frame in (train_df, test_df):
    for name in categorical_columns:
        frame[name] = frame[name].astype('category')

In [None]:
# Keep the test ids aside, then remove the 'id' column from both frames.
# NOTE: the name `id` shadows the Python builtin; it is kept as-is because the
# submission cell reads it.
id = test_df['id']
test_df = test_df.drop(columns=['id'])
train_df = train_df.drop(columns=['id'])

dtype('int64')

In [None]:
# Hold out 20% of the labelled rows for validation (fixed random_state for a repeatable split)
features = train_df.drop(columns=['Response'])
target = train_df['Response']
X_train, X_val, y_train, y_val = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Prepare LightGBM datasets
# free_raw_data=False keeps the raw frames attached to each Dataset, so the training
# cell can be re-run without LightGBM raising
# "Cannot set categorical feature after freed raw data".
train_data = lgb.Dataset(
    X_train,
    label=y_train,
    categorical_feature=categorical_columns,
    free_raw_data=False,
)
# The validation Dataset references the training Dataset, as the LightGBM
# documentation asks for validation data.
valid_data = lgb.Dataset(
    X_val,
    label=y_val,
    categorical_feature=categorical_columns,
    reference=train_data,
    free_raw_data=False,
)

In [None]:
# LightGBM training parameters: binary objective evaluated with log loss,
# gradient-boosted trees with 50 leaves, learning rate 0.05, feature_fraction 0.9
params = dict(
    objective='binary',
    metric='binary_logloss',
    boosting_type='gbdt',
    num_leaves=50,
    learning_rate=0.05,
    feature_fraction=0.9,
)

In [None]:
# Train the booster for at most 1000 rounds, evaluating on both the training and
# validation datasets; the early-stopping callback is configured with stopping_rounds=10.
stop_when_stalled = early_stopping(stopping_rounds=10)
bst = lgb.train(
    params=params,
    train_set=train_data,
    num_boost_round=1000,
    valid_sets=[train_data, valid_data],
    callbacks=[stop_when_stalled],
)

LightGBMError: Cannot set categorical feature after freed raw data, set free_raw_data=False when construct Dataset to avoid this.

In [None]:
# Score the held-out validation features with the trained booster
val_predictions = bst.predict(data=X_val)

In [None]:
# Evaluate the model on the validation split.
# Accuracy needs hard 0/1 labels, so the scores are cut at 0.5; AUC-ROC uses the raw scores.
above_cutoff = val_predictions > 0.5
val_pred_labels = above_cutoff.astype(int)

accuracy = accuracy_score(y_true=y_val, y_pred=val_pred_labels)
roc_auc = roc_auc_score(y_true=y_val, y_score=val_predictions)

print(f'Validation Accuracy: {accuracy}')
print(f'Validation AUC-ROC: {roc_auc}')

Validation Accuracy: 0.8805416000278145
Validation AUC-ROC: 0.879647307976395


In [None]:
# Extract feature importance from the trained booster
# (feature_importance() is called with its defaults; LightGBM documents the default
# importance type as 'split').
importance = bst.feature_importance()
feature_names = bst.feature_name()

# One row per feature, in the booster's feature order
important_features = pd.DataFrame({'Feature': feature_names, 'Importance': importance})

# Rank the features from most to least important and display the ranked table.
# The cell previously ended with the unsorted `important_features`, so the sort was never shown.
feature_importance = important_features.sort_values(by='Importance', ascending=False)
feature_importance

Unnamed: 0,Feature,Importance
9,Vintage,8026
7,Annual_Premium,5521
1,Age,4079
8,Policy_Sales_Channel,3949
3,Region_Code,3871
5,Vehicle_Age,1221
6,Vehicle_Damage,795
4,Previously_Insured,405
0,Gender,314
2,Driving_License,79


In [None]:
# Score the test features with the trained booster
test_predictions = bst.predict(data=test_df)

In [None]:
# Sanity check: number of test predictions produced
test_predictions.shape[0]

7669866

In [None]:
# Build the submission table (test ids next to their predicted scores) and write it to CSV
submission = id.to_frame(name='id').assign(Response=test_predictions)
submission.to_csv('submission_exp2a.csv', index=False)