In [18]:
# https://www.kaggle.com/datasets/anmolkumar/health-insurance-cross-sell-prediction/data
# https://datascience.stackexchange.com/questions/78146/does-label-encoding-an-entire-dataset-cause-data-leakage

In [19]:
# https://www.youtube.com/watch?v=WLwjvWq0GWA

In [20]:
import warnings
warnings.simplefilter("ignore")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
import pickle

In [21]:
df = pd.read_csv("dataset.csv")

In [22]:
df.head()

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,1,Male,44,1,28.0,0,> 2 Years,Yes,40454.0,26.0,217,1
1,2,Male,76,1,3.0,0,1-2 Year,No,33536.0,26.0,183,0
2,3,Male,47,1,28.0,0,> 2 Years,Yes,38294.0,26.0,27,1
3,4,Male,21,1,11.0,1,< 1 Year,No,28619.0,152.0,203,0
4,5,Female,29,1,41.0,1,< 1 Year,No,27496.0,152.0,39,0


In [23]:
df['Response'].value_counts()

Response
0    334399
1     46710
Name: count, dtype: int64

In [24]:
df['Region_Code'] = df['Region_Code'].astype(str)
df['Policy_Sales_Channel'] = df['Policy_Sales_Channel'].astype(str)

In [25]:
id_col = ['id']
target_col = 'Response'
cat_features = ['Gender', 'Region_Code', 'Previously_Insured', 'Vehicle_Age', 'Vehicle_Damage', 'Policy_Sales_Channel']
num_features = ['Age', 'Annual_Premium', 'Vintage']

In [26]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', num_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', drop='first'), cat_features)
    ],
    remainder='drop'
)

In [27]:
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=42))
])

In [28]:
logistic_params = {
    'classifier__C': [0.01, 0.1, 1, 10, 100],
    'classifier__solver': ['liblinear', 'lbfgs'],
    'classifier__max_iter': [100, 200, 300],
    'classifier__penalty': ['l2'],
    'classifier__class_weight': ['balanced', None]
}

In [29]:
X = df.drop(columns=['Response', 'id'])
y = df['Response']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [30]:
logistic_random_search = RandomizedSearchCV(pipeline, param_distributions=logistic_params,
                                            n_iter=10, cv=5, random_state=42, scoring='recall', n_jobs=-1)

In [31]:
logistic_random_search.fit(X_train, y_train)

In [32]:
best_logistic_model = logistic_random_search.best_estimator_

In [33]:
y_pred = best_logistic_model.predict(X_test)

In [34]:
print(f"Best Score (Recall): {logistic_random_search.best_score_}")
print(f"Accuracy on Test Set: {accuracy_score(y_test, y_pred)}")
print(f"Precision on Test Set: {precision_score(y_test, y_pred)}")
print(f"Recall on Test Set: {recall_score(y_test, y_pred)}")
print(f"ROC AUC on Test Set: {roc_auc_score(y_test, y_pred)}")

Best Score (Recall): 0.952779267691397
Accuracy on Test Set: 0.6771142189918921
Precision on Test Set: 0.27285318559556787
Recall on Test Set: 0.9515908852252442
ROC AUC on Test Set: 0.7947582456531475


In [35]:
with open('./BE/MODEL/model_pipeline.pickle', 'wb') as file:
    pickle.dump(best_logistic_model, file)