In [52]:
import pickle

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

In [53]:
# Load the raw insurance records from the CSV next to the notebook.
df = pd.read_csv(filepath_or_buffer='insurance.csv')

In [54]:
# Peek at five randomly chosen raw rows.
df.sample(n=5)

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category
80,56,95.8,1.67,50.0,False,Jalandhar,unemployed,High
51,45,101.9,1.62,28.95,True,Jaipur,private_job,High
97,52,60.8,1.8,44.86,False,Hyderabad,freelancer,Low
26,33,79.0,1.61,23.61,False,Jaipur,freelancer,Medium
6,19,80.1,1.68,3.59,True,Hyderabad,student,Medium


In [55]:
# Work on an independent (deep) copy so the raw frame `df` stays untouched
# while engineered columns are added.
df_feat = df.copy(deep=True)

In [56]:
# Feature 1 : BMI = weight / height**2
# Both operands are read from df_feat so the result never depends on index
# alignment with the separate raw frame `df`.
# NOTE(review): sampled heights are around 1.6-1.8, so height is presumably in
# metres and weight in kilograms — confirm against the data source.
df_feat['bmi'] = df_feat['weight'] / (df_feat['height'] ** 2)

In [57]:
# Feature 2 : Age Group
def age_group(age):
    """Return the age bucket label for ``age``.

    Buckets use exclusive upper bounds: below 25 is 'young', below 45 is
    'adult', below 60 is 'middle_aged'; anything else is 'senior'.
    """
    # Walk the thresholds in ascending order and stop at the first match.
    for upper_bound, label in ((25, 'young'), (45, 'adult'), (60, 'middle_aged')):
        if age < upper_bound:
            return label
    return 'senior'

In [58]:
# Label every row with its age bucket.
df_feat['age_group'] = df_feat['age'].apply(func=age_group)

In [59]:
# Feature 3: Lifestyle risk
def lifestyle_risk(row):
    """Classify one record as 'high', 'medium' or 'low' lifestyle risk.

    ``row`` must provide 'smoker' and 'bmi'. A smoker is 'high' when bmi is
    above 30 and 'medium' otherwise; a non-smoker is 'medium' when bmi is
    above 27 and 'low' otherwise.
    """
    is_smoker, bmi = row['smoker'], row['bmi']
    if is_smoker:
        return 'high' if bmi > 30 else 'medium'
    return 'medium' if bmi > 27 else 'low'
    

In [60]:
# Row-wise application: lifestyle_risk needs both 'smoker' and 'bmi' per record.
df_feat['lifestyle_risk'] = df_feat.apply(lifestyle_risk, axis='columns')

In [61]:
# City names grouped by tier. city_tier() tests membership in these two lists
# (exact, case-sensitive string match) and maps any city found in neither list
# to tier 3.
tier_1_cities = ["Mumbai", "Delhi", "Bangalore", "Chennai", "Kolkata", "Hyderabad", "Pune"]
tier_2_cities = [
    "Jaipur", "Chandigarh", "Indore", "Lucknow", "Patna", "Ranchi", "Visakhapatnam", "Coimbatore",
    "Bhopal", "Nagpur", "Vadodara", "Surat", "Rajkot", "Jodhpur", "Raipur", "Amritsar", "Varanasi",
    "Agra", "Dehradun", "Mysore", "Jabalpur", "Guwahati", "Thiruvananthapuram", "Ludhiana", "Nashik",
    "Allahabad", "Udaipur", "Aurangabad", "Hubli", "Belgaum", "Salem", "Vijayawada", "Tiruchirappalli",
    "Bhavnagar", "Gwalior", "Dhanbad", "Bareilly", "Aligarh", "Gaya", "Kozhikode", "Warangal",
    "Kolhapur", "Bilaspur", "Jalandhar", "Noida", "Guntur", "Asansol", "Siliguri"
]

In [62]:
# Feature 4 : city tier
def city_tier(city):
    """Return 1 or 2 when ``city`` is listed in tier_1_cities / tier_2_cities,
    otherwise 3.

    Tier 1 is checked first, so it wins if a city appears in both lists.
    """
    if city in tier_1_cities:
        return 1
    return 2 if city in tier_2_cities else 3


In [63]:
# Map every city name to its tier number (1, 2 or 3).
df_feat['city_tier'] = df_feat['city'].apply(func=city_tier)

In [64]:
# Preview five random rows of the engineered features next to the target.
# The column selection alone defines this view, so no separate drop() of the
# raw columns is needed; df_feat itself is left unchanged.
df_feat[['income_lpa', 'occupation', 'bmi', 'age_group', 'lifestyle_risk', 'city_tier', 'insurance_premium_category']].sample(5)
     

Unnamed: 0,income_lpa,occupation,bmi,age_group,lifestyle_risk,city_tier,insurance_premium_category
61,24.05,unemployed,36.281179,adult,high,2,High
40,40.19,unemployed,24.349609,adult,medium,1,Medium
71,20.25,unemployed,16.513537,adult,low,2,Low
4,3.94,retired,24.296875,senior,medium,2,High
76,1.12,retired,44.044444,senior,medium,2,High


In [73]:
# Select feature and target 
X = df_feat[['bmi', 'age_group', 'lifestyle_risk', 'city_tier', 'income_lpa', 'occupation']]
y = df['insurance_premium_category']

In [80]:
# Display the feature matrix (pandas truncates long frames in the rendered output).
X

Unnamed: 0,bmi,age_group,lifestyle_risk,city_tier,income_lpa,occupation
0,49.227482,senior,medium,2,2.92000,retired
1,30.189017,adult,medium,1,34.28000,freelancer
2,21.118382,adult,low,2,36.64000,freelancer
3,45.535900,young,high,1,3.34000,student
4,24.296875,senior,medium,2,3.94000,retired
...,...,...,...,...,...,...
95,21.420747,adult,low,2,19.64000,business_owner
96,47.984483,adult,medium,1,34.01000,private_job
97,18.765432,middle_aged,low,1,44.86000,freelancer
98,30.521676,adult,medium,1,28.30000,business_owner


In [75]:
# Display the target series (insurance_premium_category labels).
y

0       High
1        Low
2        Low
3     Medium
4       High
       ...  
95       Low
96       Low
97       Low
98       Low
99       Low
Name: insurance_premium_category, Length: 100, dtype: object

In [76]:
# define categorical and numerical feature 
categorical_feature = ['age_group', 'lifestyle_risk', 'occupation', 'city_tier']
numerical_feature = ['bmi', 'income_lpa']

In [77]:
# Create column transformer for OHE
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), categorical_feature),
        ('num', 'passthrough', numerical_feature)
    ]
)

In [78]:
# Create a pipeline with preprocessing and random forest classifier
pipeline = Pipeline(steps=[
    ('preprocessr', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

In [79]:
# Split data Train model 
X_train,X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
pipeline.fit(X_train, y_train)

In [81]:
# NOTE(review): this looks like the pasted repr of the fitted pipeline rather
# than working code. If executed, it builds a separate, unfitted Pipeline that
# is only displayed and never assigned to a name.
Pipeline(steps=[('preprocessor',ColumnTransformer(transformers=[('cat', OneHotEncoder(),
                                                  ['age_group','lifestyle_risk','occupation', 'city_tier']),
                                                 ('num', 'passthrough',
                                                  ['bmi', 'income_lpa'])])),
                ('classifier', RandomForestClassifier(random_state=42))])

In [82]:
# Predict and evaluate
y_pred = pipeline.predict(X_test)
accuracy_score(y_test, y_pred)

0.9

In [83]:
# Peek at five random held-out rows (handy as example inputs for the saved model).
X_test.sample(n=5)

Unnamed: 0,bmi,age_group,lifestyle_risk,city_tier,income_lpa,occupation
81,31.866055,adult,high,2,22.19,freelancer
10,22.949982,adult,medium,1,32.78,business_owner
82,17.874812,adult,low,1,12.96,unemployed
39,35.643424,middle_aged,high,1,11.99,unemployed
36,21.713266,senior,low,1,0.53,retired


In [84]:

# Persist the fitted pipeline (preprocessing + classifier) so it can be loaded
# for inference without retraining. `pickle` is imported in the notebook's
# import cell at the top.
# NOTE: unpickling executes arbitrary code — only load model.pkl from a
# trusted source, ideally with the same scikit-learn version used to save it.
pickle_model_path = "model.pkl"
with open(pickle_model_path, "wb") as f:
    pickle.dump(pipeline, f)
