In [1]:
import pickle

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

In [3]:
# Load the raw insurance records from the CSV in the working directory.
df = pd.read_csv("./insurance.csv")

In [5]:
# Peek at five randomly chosen rows of the raw data.
df.sample(n=5)

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category
11,25,77.2,1.56,10.899387,True,Pune,government_job,Low
85,33,51.4,1.86,34.66,False,Chennai,private_job,Low
22,57,106.4,1.83,30.0,False,Chandigarh,government_job,Low
36,61,58.4,1.64,0.53,False,Hyderabad,retired,Medium
55,47,75.7,1.73,24.93,False,Delhi,unemployed,Low


In [9]:
# Report the (rows, columns) shape of the loaded frame.
rows_and_columns = df.shape
print("Dataset Size:", rows_and_columns)

Dataset Size: (100, 8)


In [10]:
# List the distinct occupation labels present in the data.
occupations = df["occupation"]
occupations.unique()

array(['retired', 'freelancer', 'student', 'government_job',
       'business_owner', 'unemployed', 'private_job'], dtype=object)

Feature Engineering

In [12]:
# Engineer features on a copy so the new columns are not added to `df`.
df_features = df.copy()
# Print the shape, the column index and a five-row random sample, in that order.
for summary in (df_features.shape, df_features.columns, df_features.sample(5)):
    print(summary)

(100, 8)
Index(['age', 'weight', 'height', 'income_lpa', 'smoker', 'city', 'occupation',
       'insurance_premium_category'],
      dtype='object')
    age  weight  height  income_lpa  smoker        city      occupation  \
57   72    76.8    1.69        1.36    True   Jalandhar         retired   
61   32   102.4    1.68       24.05    True  Chandigarh      unemployed   
5    53    62.9    1.66       50.00   False        Kota      freelancer   
42   23    69.9    1.79        2.60    True      Mysore         student   
20   34    58.2    1.85       30.65    True        Gaya  business_owner   

   insurance_premium_category  
57                       High  
61                       High  
5                      Medium  
42                     Medium  
20                     Medium  


In [13]:
# Feature 1: BMI = weight / height squared
# (presumably weight is in kg and height in metres -- confirm against the data source)
height_squared = df['height'].pow(2)
df_features['bmi'] = df['weight'].div(height_squared)
print(df_features['bmi'].sample(5))

62    21.738481
57    26.889815
27    35.159702
75    20.577355
11    31.722551
Name: bmi, dtype: float64


In [14]:
# Feature 2: Age Group
def age_group(age):
    """Map a numeric age to a named age bracket.

    Brackets use exclusive upper bounds:
    ``age < 25`` -> "young", ``age < 45`` -> "adult",
    ``age < 60`` -> "middle_aged", anything else -> "senior".

    Args:
        age: A value comparable with numbers using ``<``.

    Returns:
        One of "young", "adult", "middle_aged" or "senior".
    """
    # Checked in ascending order, so the first bound that exceeds `age` wins.
    brackets = ((25, "young"), (45, "adult"), (60, "middle_aged"))
    for upper_bound, label in brackets:
        if age < upper_bound:
            return label
    return "senior"

In [16]:
# Bucket every age into its named bracket and store it as a new column.
df_features["age_group"] = df["age"].map(age_group)
print(df_features['age_group'].sample(5))

2      adult
12     adult
33    senior
95     adult
60     adult
Name: age_group, dtype: object


In [17]:
# Feature 3: Lifestyle Risk
def lifestyle_risk(row):
    """Classify a record's lifestyle risk from its smoking flag and BMI.

    Args:
        row: A mapping-like object providing ``row["smoker"]`` (truthy when the
            person smokes) and ``row["bmi"]`` (a number).

    Returns:
        "high" for a smoker with BMI above 30; "medium" for any other smoker
        or for a non-smoker with BMI above 27; otherwise "low".
    """
    is_smoker = row["smoker"]
    bmi = row["bmi"]
    if is_smoker:
        # Smokers are never "low": BMI only decides between high and medium.
        return "high" if bmi > 30 else "medium"
    return "medium" if bmi > 27 else "low"

In [19]:
# Label each row's lifestyle risk; row-wise apply is needed because the rule
# reads two columns (smoker and bmi) at once.
risk_labels = df_features.apply(lifestyle_risk, axis="columns")
df_features["lifestyle_risk"] = risk_labels
print(df_features['lifestyle_risk'].sample(5))

60      high
59    medium
47    medium
48      high
49    medium
Name: lifestyle_risk, dtype: object


In [20]:
# City names grouped by tier. A city found in neither list is treated as
# tier 3 by city_tier().
tier_1_cities = [
    "Mumbai", "Delhi", "Bangalore", "Chennai",
    "Kolkata", "Hyderabad", "Pune",
]
tier_2_cities = [
    "Jaipur", "Chandigarh", "Indore", "Lucknow", "Patna", "Ranchi",
    "Visakhapatnam", "Coimbatore", "Bhopal", "Nagpur", "Vadodara", "Surat",
    "Rajkot", "Jodhpur", "Raipur", "Amritsar", "Varanasi", "Agra",
    "Dehradun", "Mysore", "Jabalpur", "Guwahati", "Thiruvananthapuram", "Ludhiana",
    "Nashik", "Allahabad", "Udaipur", "Aurangabad", "Hubli", "Belgaum",
    "Salem", "Vijayawada", "Tiruchirappalli", "Bhavnagar", "Gwalior", "Dhanbad",
    "Bareilly", "Aligarh", "Gaya", "Kozhikode", "Warangal", "Kolhapur",
    "Bilaspur", "Jalandhar", "Noida", "Guntur", "Asansol", "Siliguri",
]

In [21]:
# Feature 4: City Tier
def city_tier(city):
    """Return the tier number (1, 2 or 3) for a city name.

    The name is matched exactly against ``tier_1_cities`` and then
    ``tier_2_cities``; a city found in neither list is tier 3.

    Args:
        city: City name to look up.

    Returns:
        1, 2 or 3.
    """
    for tier, members in ((1, tier_1_cities), (2, tier_2_cities)):
        if city in members:
            return tier
    return 3

In [24]:
# Translate each city name into its numeric tier and store it as a new column.
df_features["city_tier"] = df["city"].map(city_tier)
print(df_features['city_tier'].sample(5))

27    2
48    1
39    1
29    2
38    2
Name: city_tier, dtype: int64


In [25]:
# Spot-check five random rows with all engineered columns attached.
df_features.sample(n=5)

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category,bmi,age_group,lifestyle_risk,city_tier
32,47,113.7,1.9,50.0,False,Jalandhar,private_job,Medium,31.495845,middle_aged,medium,2
27,58,111.4,1.78,34.33,False,Lucknow,private_job,Medium,35.159702,middle_aged,medium,2
68,20,80.3,1.87,0.68,False,Lucknow,student,Low,22.963196,young,low,2
26,33,79.0,1.61,23.61,False,Jaipur,freelancer,Medium,30.477219,adult,medium,2
62,34,72.8,1.83,35.67,False,Chennai,business_owner,Low,21.738481,adult,low,1


In [26]:
# Preview the engineered features together with the target column.
# Selecting columns by name already leaves out the raw inputs
# (age, weight, height, smoker, city), so no separate drop() is needed.
model_columns = [
    'income_lpa', 'occupation', 'bmi', 'age_group',
    'lifestyle_risk', 'city_tier', 'insurance_premium_category',
]
df_features[model_columns].sample(5)

Unnamed: 0,income_lpa,occupation,bmi,age_group,lifestyle_risk,city_tier,insurance_premium_category
10,32.78,business_owner,22.949982,adult,medium,1,Medium
89,4.0,student,30.458274,young,medium,1,Low
4,3.94,retired,24.296875,senior,medium,2,High
3,3.34,student,45.5359,young,high,1,Medium
24,18.6,private_job,19.669038,middle_aged,low,2,Medium


In [32]:
# Split the engineered frame into the model inputs (X) and the label (Y).
feature_columns = [
    "bmi",
    "age_group",
    "lifestyle_risk",
    "city_tier",
    "income_lpa",
    "occupation",
]
target_column = "insurance_premium_category"
X = df_features[feature_columns]
Y = df_features[target_column]

In [33]:
# Display the feature matrix that is later passed to train_test_split.
X

Unnamed: 0,bmi,age_group,lifestyle_risk,city_tier,income_lpa,occupation
0,49.227482,senior,medium,2,2.92000,retired
1,30.189017,adult,medium,1,34.28000,freelancer
2,21.118382,adult,low,2,36.64000,freelancer
3,45.535900,young,high,1,3.34000,student
4,24.296875,senior,medium,2,3.94000,retired
...,...,...,...,...,...,...
95,21.420747,adult,low,2,19.64000,business_owner
96,47.984483,adult,medium,1,34.01000,private_job
97,18.765432,middle_aged,low,1,44.86000,freelancer
98,30.521676,adult,medium,1,28.30000,business_owner


In [35]:
# Display the target labels (insurance_premium_category) that pair with X.
Y

0       High
1        Low
2        Low
3     Medium
4       High
       ...  
95       Low
96       Low
97       Low
98       Low
99       Low
Name: insurance_premium_category, Length: 100, dtype: object

In [36]:
# Column groups for preprocessing: the numeric ones, then the categorical ones.
# city_tier holds integers (1/2/3) but is listed with the categorical columns.
numeric_features = ["bmi", "income_lpa"]
categorical_features = [
    "age_group",
    "lifestyle_risk",
    "occupation",
    "city_tier",
]

In [37]:
# Create column transformer: one-hot encode the categorical columns and pass
# the numeric columns through unchanged.
# handle_unknown="ignore" makes a category that was absent from the training
# split (e.g. a new occupation at prediction time) encode as all zeros instead
# of raising an error inside the fitted pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
        ("num", "passthrough", numeric_features),
    ]
)

In [38]:
# Chain the preprocessing step and a random-forest classifier into a single
# estimator; the fixed random_state pins the forest's random seed.
classifier = RandomForestClassifier(random_state=42)
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", classifier),
    ]
)

In [42]:
# Hold out 20% of the rows for evaluation (fixed seed), then fit the
# pipeline on the remaining rows.
split = train_test_split(X, Y, test_size=0.2, random_state=1)
X_train, X_test, Y_train, Y_test = split
pipeline.fit(X_train, Y_train)

In [45]:
# Score the fitted pipeline on the held-out rows; the cell's value is the
# fraction of test labels predicted correctly.
Y_pred = pipeline.predict(X_test)
accuracy_score(y_true=Y_test, y_pred=Y_pred)

0.9

In [46]:
# Inspect five random rows from the held-out feature set.
X_test.sample(n=5)

Unnamed: 0,bmi,age_group,lifestyle_risk,city_tier,income_lpa,occupation
39,35.643424,middle_aged,high,1,11.99,unemployed
93,23.199416,young,low,2,1.28,student
69,21.942857,middle_aged,low,2,6.034487,government_job
78,27.932798,middle_aged,medium,2,14.74,freelancer
10,22.949982,adult,medium,1,32.78,business_owner


In [47]:
# Save the trained pipeline using pickle (`pickle` is imported in the
# notebook's top import cell).
# The whole pipeline is serialized, so the preprocessing step travels with
# the classifier.
# NOTE: unpickling executes arbitrary code -- only load model.pkl from a
# trusted source.
pickle_model_path = "model.pkl"
with open(pickle_model_path, "wb") as f:
    pickle.dump(pipeline, f)