In [2]:
import numpy as np
import pandas as pd
import random

In [3]:
# Set seed for reproducibility
# Two independent generators are used in this notebook, so both are seeded:
# NumPy's global RNG (the np.random.* calls that build the dataset) and the
# standard-library `random` module (used by generate_insurance_type).
np.random.seed(42)
random.seed(42)

In [13]:
# Helper function to generate insurance type
def generate_insurance_type(risk_score):
    """Pick a random insurance type from the pool matching the risk tier.

    Tiers (strict comparisons):
      * risk_score > 0.8        -> 'life', 'health' or 'home'
      * 0.5 < risk_score <= 0.8 -> 'vehicle' or 'health'
      * anything else           -> 'life' or 'vehicle'

    Exactly one draw is taken from the standard-library ``random`` module's
    global generator, so the result is only reproducible after ``random.seed``.

    Args:
        risk_score: Numeric risk score of the customer.

    Returns:
        One of the insurance-type strings from the selected pool.
    """
    if risk_score > 0.8:
        candidates = ['life', 'health', 'home']
    elif risk_score > 0.5:
        candidates = ['vehicle', 'health']
    else:
        candidates = ['life', 'vehicle']
    # Single draw, regardless of tier, keeps RNG consumption predictable.
    return random.choice(candidates)

In [14]:
# Helper function to generate policy recommendation based on preferences and risk score
def generate_policy_recommendation(preference, risk_score):
    """Map a customer's stated preference and risk score to a coverage level.

    The preference is matched by substring, with 'comprehensive' checked
    before 'budget':
      * contains 'comprehensive': 'full coverage' when risk_score > 0.6,
        otherwise 'standard coverage'
      * contains 'budget': 'basic coverage' when risk_score <= 0.6,
        otherwise 'standard coverage'
      * anything else: 'standard coverage'

    Args:
        preference: Preference label (string) to search for keywords.
        risk_score: Numeric risk score of the customer.

    Returns:
        'full coverage', 'standard coverage' or 'basic coverage'.
    """
    if 'comprehensive' in preference:
        if risk_score > 0.6:
            return 'full coverage'
        return 'standard coverage'

    if 'budget' in preference:
        if risk_score <= 0.6:
            return 'basic coverage'
        return 'standard coverage'

    return 'standard coverage'

In [15]:
# Number of synthetic customer records to generate
n_records = 10_000

In [16]:
# Generating dataset
# Every column is sampled from NumPy's global RNG. The values are drawn in the
# order the keys appear below, so reordering the keys changes the seeded output.
data = {
    'customer_id': np.arange(1, n_records + 1),  # sequential IDs 1..n_records
    'age': np.random.randint(18, 70, size=n_records),  # randint's upper bound is exclusive: 18-69
    'income': np.random.randint(30000, 150000, size=n_records),  # 30000-149999
    # Categorical columns: each value is picked from the fixed list given (no weights supplied).
    'occupation': np.random.choice(['Engineer', 'Teacher', 'Doctor', 'Lawyer', 'Business', 'Freelancer'], size=n_records),
    'marital_status': np.random.choice(['Single', 'Married', 'Divorced'], size=n_records),
    'children': np.random.randint(0, 4, size=n_records),  # 0-3
    'risk_score': np.round(np.random.rand(n_records), 2),  # random draw rounded to 2 decimals
    # These labels are matched by substring in generate_policy_recommendation.
    'customer_preferences': np.random.choice(['budget-friendly', 'comprehensive coverage', 'balanced'], size=n_records)
}

In [17]:
# Calculating insurance type based on risk score
# Each call consumes one draw from the stdlib `random` generator, so the result
# depends on random.seed and on the row order of data['risk_score'].
data['insurance_type'] = [generate_insurance_type(score) for score in data['risk_score']]

In [18]:
# Calculating estimated premium: a random base amount drawn uniformly between
# 200 and 2000, scaled by (1 + risk_score) and rounded to 2 decimals.
# NOTE: income does not enter this formula; only risk_score does.
data['insurance_premium'] = np.round(np.random.uniform(200, 2000, size=n_records) * (1 + data['risk_score']), 2)

In [19]:
# Calculating policy recommendation based on preferences and risk score
# Row-wise and deterministic: pairs each customer's preference with their risk
# score and applies generate_policy_recommendation (no randomness involved).
data['policy_recommendation'] = [generate_policy_recommendation(preference, score) for preference, score in zip(data['customer_preferences'], data['risk_score'])]

In [20]:
# Generating purchase probability: risk_score multiplied by a random factor
# drawn uniformly between 0.5 and 1.0, rounded to 2 decimals.
# NOTE: customer preferences do not enter this formula; only risk_score does.
data['purchase_probability'] = np.round(data['risk_score'] * np.random.uniform(0.5, 1.0, size=n_records), 2)

In [21]:
# Create DataFrame: one column per key of `data`, one row per customer
insurance_df = pd.DataFrame(data)

In [22]:
# Save the dataset to a CSV file in the current working directory.
# index=False keeps the DataFrame's row index out of the file.
insurance_df.to_csv('personalized_insurance_dataset.csv', index=False)

In [4]:
# Load the dataset from the CSV file (same file name as written by the
# generation cells of this notebook) into the working frame used for modelling.
df = pd.read_csv('personalized_insurance_dataset.csv')

In [5]:
# Display first few records (head() defaults to the first 5 rows)
df.head()

Unnamed: 0,customer_id,age,income,occupation,marital_status,children,risk_score,customer_preferences,insurance_type,insurance_premium,policy_recommendation,purchase_probability
0,1,29,74993,Engineer,Divorced,2,0.96,comprehensive coverage,home,1766.7,full coverage,0.88
1,2,60,106970,Teacher,Single,2,0.17,budget-friendly,vehicle,2155.98,basic coverage,0.16
2,3,43,56649,Freelancer,Divorced,0,0.95,budget-friendly,life,1838.71,standard coverage,0.51
3,4,45,57859,Engineer,Divorced,0,0.17,budget-friendly,life,1049.69,basic coverage,0.16
4,5,45,124862,Lawyer,Married,0,0.84,budget-friendly,home,3408.67,standard coverage,0.71


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

In [7]:
# Encode categorical variables for model input
# One independent LabelEncoder per categorical column, so each column's
# label <-> integer mapping can be fitted and inverted on its own.
le_occupation, le_marital_status, le_preferences, le_insurance_type = (
    LabelEncoder() for _ in range(4)
)

In [8]:
# Fit each encoder on its column and store the integer codes in a new
# `*_encoded` column; the original string columns are left in place.
# NOTE(review): scikit-learn documents LabelEncoder as intended for target
# labels (y). Using it on input features gives nominal categories an arbitrary
# integer ordering; consider OrdinalEncoder/OneHotEncoder for the features.
df['occupation_encoded'] = le_occupation.fit_transform(df['occupation'])
df['marital_status_encoded'] = le_marital_status.fit_transform(df['marital_status'])
df['preferences_encoded'] = le_preferences.fit_transform(df['customer_preferences'])
# Target encoder: also used later to decode predictions via inverse_transform.
df['insurance_type_encoded'] = le_insurance_type.fit_transform(df['insurance_type'])

In [9]:
# Features and target variable
# NOTE(review): assuming df is the synthetic file generated earlier in this
# notebook, insurance_type was produced by generate_insurance_type, which picks
# at random within a risk_score tier. By construction only risk_score carries
# signal about the target, and the achievable accuracy is limited by that
# random choice (2 or 3 equally likely labels per tier).
X = df[['age', 'income', 'occupation_encoded', 'marital_status_encoded', 'children', 'risk_score', 'preferences_encoded']]
y = df['insurance_type_encoded']

In [10]:
# Hold out 30% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [11]:
# Baseline model: a 100-tree random forest with a fixed seed, fitted on the
# training split only.
rf_model = RandomForestClassifier(n_estimators = 100, random_state = 42)
rf_model.fit(X_train, y_train)

In [12]:
# Predict the encoded insurance-type label for every held-out row
y_pred = rf_model.predict(X_test)

In [13]:
# Share of held-out rows whose predicted label equals the true label,
# reported as a percentage with two decimals.
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

Model Accuracy: 46.77%


In [14]:
# Importances reported by the fitted forest, labelled with the feature names
# from X and sorted from most to least important.
feature_importance = pd.DataFrame(rf_model.feature_importances_, index = X.columns, columns = ['importance']).sort_values('importance', ascending = False)

In [15]:
# Show the sorted importance table
feature_importance

Unnamed: 0,importance
risk_score,0.393395
income,0.221602
age,0.173846
occupation_encoded,0.074517
children,0.053872
marital_status_encoded,0.042495
preferences_encoded,0.040273


In [33]:
import joblib

In [34]:
# Persist the fitted model to disk.
# NOTE: only the model is saved here; the fitted LabelEncoders used to encode
# inputs and decode predictions are not written by this cell.
joblib.dump(rf_model, 'insurance_product.pkl')

['insurance_product.pkl']

In [35]:
# Reload the model from disk, rebinding rf_model to the deserialized copy.
# joblib.load is pickle-based: only load files from a trusted source.
rf_model = joblib.load('insurance_product.pkl')

In [36]:
# Single-row frame describing a new customer. Column names and order match the
# training features in X; categorical values are converted with the same
# fitted encoders used for the training data.
new_customer = pd.DataFrame({
    'age': [56],
    'income': [90000],
    # transform() returns an array; [0] extracts the single encoded value
    'occupation_encoded': [le_occupation.transform(['Freelancer'])[0]],
    'marital_status_encoded': [le_marital_status.transform(['Married'])[0]],
    'children': [1],
    'risk_score': [0.43],
    'preferences_encoded': [le_preferences.transform(['comprehensive coverage'])[0]]
})

In [37]:
# Predict the encoded insurance type for the new customer (array with one entry)
predicted_policy_encoded = rf_model.predict(new_customer)

In [38]:
# Map the integer prediction back to its insurance-type string using the
# encoder that was fitted on the target column.
predicted_policy = le_insurance_type.inverse_transform(predicted_policy_encoded)

In [39]:
# Show the decoded prediction
predicted_policy

array(['vehicle'], dtype=object)

In [17]:
# Hyperparameter search space for the random forest; every combination of the
# values below is one candidate.
param_grid = dict(
    # Number of trees in the forest
    n_estimators=[100, 200, 300],
    # Maximum depth of each tree (None leaves depth unrestricted)
    max_depth=[10, 20, 30, None],
    # Minimum number of samples required to split an internal node
    min_samples_split=[2, 5, 10],
    # Minimum number of samples required at a leaf node
    min_samples_leaf=[1, 2, 4],
    # Whether trees are built on bootstrap samples
    bootstrap=[True, False],
)

In [18]:
# Initialize the RandomForestClassifier used as the base estimator for tuning.
# NOTE: this rebinds rf_model, so the name no longer refers to the previously
# fitted/loaded model but to a fresh, unfitted estimator.
rf_model = RandomForestClassifier(random_state=42)

In [19]:
# Exhaustive search over param_grid, scoring each candidate by accuracy under
# 5-fold cross-validation; n_jobs=-1 runs fits in parallel on all cores and
# verbose=2 enables progress logging.
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
)

In [None]:
# Fit the grid search to the training data
# Expensive cell: the grid has 3 * 4 * 3 * 3 * 2 = 216 candidates, each fitted
# on 5 folds, i.e. 1080 forest fits.
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits


In [None]:
# Best hyperparameter combination found and its mean cross-validated score
# (accuracy, per the scoring argument given to GridSearchCV).
best_params = grid_search.best_params_
best_score = grid_search.best_score_

In [None]:
# Report the winning hyperparameters and the cross-validated accuracy as a
# percentage with two decimals.
print(f"Best Hyperparameters: {best_params}")
print(f"Best Grid Search Accuracy: {best_score * 100:.2f}%")