In [36]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import joblib
import streamlit as st

In [22]:
# Load the insurance dataset from the CSV file (path is relative to the
# notebook's working directory).
insurance_df = pd.read_csv(
    filepath_or_buffer='personalized_insurance_dataset.csv',
)

In [23]:
# Preview the first five rows as the cell's rich output.
insurance_df.head(n=5)

Unnamed: 0,customer_id,age,income,occupation,marital_status,children,risk_score,customer_preferences,insurance_type,insurance_premium,policy_recommendation,purchase_probability
0,1,29,74993,Engineer,Divorced,2,0.96,comprehensive coverage,home,1766.7,full coverage,0.88
1,2,60,106970,Teacher,Single,2,0.17,budget-friendly,vehicle,2155.98,basic coverage,0.16
2,3,43,56649,Freelancer,Divorced,0,0.95,budget-friendly,life,1838.71,standard coverage,0.51
3,4,45,57859,Engineer,Divorced,0,0.17,budget-friendly,life,1049.69,basic coverage,0.16
4,5,45,124862,Lawyer,Married,0,0.84,budget-friendly,home,3408.67,standard coverage,0.71


In [24]:
# Step 2: Encode categorical variables
# Every categorical column gets its OWN LabelEncoder, stored in `encoders`
# under the column name. Re-fitting one shared encoder (as was done before)
# discards the label -> integer mapping of every column but the last, which
# makes it impossible to encode new inputs consistently or to decode values
# later, e.g.:
#     encoders['policy_recommendation'].inverse_transform(predictions)
# NOTE: this cell overwrites the columns of `insurance_df` in place, so run it
# once on freshly loaded data; running it again would fit the encoders on the
# already-encoded integers.
categorical_columns = [
    'occupation',
    'marital_status',
    'customer_preferences',
    'insurance_type',
    'policy_recommendation',
]

encoders = {}
for column in categorical_columns:
    encoders[column] = LabelEncoder()
    insurance_df[column] = encoders[column].fit_transform(insurance_df[column])

# Backward compatibility: `le` is still the encoder fitted on
# 'policy_recommendation', exactly as it was after the original cell ran.
le = encoders['policy_recommendation']

In [26]:
# Step 3: Define features and target variable
# The model inputs are the customer attributes listed below; the label to
# predict is the 'policy_recommendation' column.
feature_columns = [
    'age',
    'income',
    'occupation',
    'marital_status',
    'children',
    'risk_score',
    'customer_preferences',
]
X = insurance_df[feature_columns]
y = insurance_df['policy_recommendation']

In [27]:
# Step 4: Split the data into training and testing sets
# 20% of the rows are held out for testing; the fixed random_state makes the
# shuffle (and therefore the split) repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [30]:
# Step 5: Initialize the models
# Candidate classifiers keyed by a human-readable display name.
# The tree-based estimators are randomized internally, so they are seeded with
# the same value as the train/test split; without a fixed random_state the
# reported accuracies (and which model wins) can differ between runs.
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Support Vector Machine': SVC(),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42)
}

In [32]:
# Step 6: Train and evaluate each model
# Fit every candidate on the training split, score it on the held-out split,
# and keep the estimator with the highest accuracy.
#   best_model       -- fitted estimator with the highest test accuracy
#   best_accuracy    -- that estimator's accuracy
#   model_accuracies -- accuracy of every candidate, keyed by display name
# NOTE: the same test split is used both to choose the winner and to report
# its score, so `best_accuracy` is an optimistic estimate.
best_model = None
best_accuracy = 0
model_accuracies = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    model_accuracies[name] = accuracy

    # `best_model is None` guarantees that a model is selected even when every
    # candidate scores 0.0 (previously best_model stayed None in that case and
    # the following cells failed). The strict `>` keeps the first model on ties.
    if best_model is None or accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = model

In [33]:
# Report the winning estimator's class name together with its test accuracy.
winner_name = type(best_model).__name__
print(f"\nBest Model: {winner_name} with accuracy {best_accuracy:.4f}")


Best Model: DecisionTreeClassifier with accuracy 1.0000


In [35]:
# Persist the selected estimator to disk with joblib. The call is the cell's
# last expression, so the list of written file names is shown as the output.
joblib.dump(
    value=best_model,
    filename='best_insurance_model.pkl',
)

['best_insurance_model.pkl']