In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

In [2]:
df = pd.read_csv('/content/insurance.csv')

In [3]:
df.sample(5)

Unnamed: 0,age,weight_kg,height_cm,income_lpa,smoker,city,occupation,insurance_premium_category
138,67,52.4,152,55.1,No,Lucknow,Chef,High
33,47,69.5,188,41.9,No,Ahmedabad,Farmer,Medium
77,65,58.0,178,39.3,Yes,Surat,Government Employee,Very High
61,58,82.8,190,52.3,Yes,Kolkata,Police,Very High
101,64,56.7,159,38.8,Yes,Pune,Journalist,Very High


In [17]:
df['city'].unique()

array(['Delhi', 'Patna', 'Hyderabad', 'Mumbai', 'Surat', 'Lucknow',
       'Pune', 'Chennai', 'Kolkata', 'Jaipur', 'Bhopal', 'Bengaluru',
       'Kanpur', 'Ahmedabad', 'Nagpur', 'Indore'], dtype=object)

In [4]:
df['occupation'].unique()

array(['Government Employee', 'Designer', 'Sales Executive',
       'Construction Worker', 'Teacher', 'Journalist', 'Business Owner',
       'IT Professional', 'Accountant', 'Police', 'Farmer', 'Chef'],
      dtype=object)

In [5]:
df_feat = df.copy()

In [12]:
# Feature 1: BMI
df_feat['bmi'] = df_feat['weight_kg']/(df_feat['height_cm']**2)

In [13]:
# Feature 2: Age Group
def age_group(age):
  if age<25:
    return "young"
  elif age<45:
    return "adult"
  elif age<60:
    return "middle_aged"
  return "senior"

In [22]:
df_feat['age_group']=df_feat['age'].apply(age_group)

In [23]:
# Feature 3: Lifestyle Risk
def lifestyle_risk(row):
  if row["smoker"] and row["bmi"] > 30:
    return "high"
  elif row["smoker"] or row["bmi"] > 27:
    return "medium"
  else:
    return "low"

In [24]:
df_feat["lifestyle_risk"] = df_feat.apply(lifestyle_risk, axis=1)

In [25]:
tier_1_cities = ["Mumbai", "Delhi", "Bangalore", "Chennai", "Kolkata", "Hyderabad", "Pune"]
tier_2_cities = [
    "Jaipur", "Chandigarh", "Indore", "Lucknow", "Patna", "Ranchi", "Visakhapatnam", "Coimbatore",
    "Bhopal", "Nagpur", "Vadodara", "Surat", "Rajkot", "Jodhpur", "Raipur", "Amritsar", "Varanasi",
    "Agra", "Dehradun", "Mysore", "Jabalpur", "Guwahati", "Thiruvananthapuram", "Ludhiana", "Nashik",
    "Allahabad", "Udaipur", "Aurangabad", "Hubli", "Belgaum", "Salem", "Vijayawada", "Tiruchirappalli",
    "Bhavnagar", "Gwalior", "Dhanbad", "Bareilly", "Aligarh", "Gaya", "Kozhikode", "Warangal",
    "Kolhapur", "Bilaspur", "Jalandhar", "Noida", "Guntur", "Asansol", "Siliguri"
]

In [26]:
# Feature 4: City Tier
def city_tier(city):
  if city in tier_1_cities:
    return 1
  elif city in tier_2_cities:
    return 2
  else:
    return 3

In [27]:
df_feat['city_tier'] = df_feat["city"].apply(city_tier)

In [28]:
df_feat.drop(columns=['age', 'weight_kg', 'height_cm', 'smoker', 'city' ])[['income_lpa', 'occupation', 'bmi', 'age_group', 'lifestyle_risk', 'city_tier', 'insurance_premium_category' ]].sample(5)

Unnamed: 0,income_lpa,occupation,bmi,age_group,lifestyle_risk,city_tier,insurance_premium_category
15,44.5,IT Professional,0.002562,middle_aged,medium,2,Very High
95,25.4,Journalist,0.00325,middle_aged,medium,1,Very High
8,56.5,Designer,0.003225,middle_aged,medium,1,High
20,44.3,Sales Executive,0.002025,adult,medium,3,High
65,13.9,Construction Worker,0.002482,middle_aged,medium,1,Very High


In [29]:
# Select features and target
X = df_feat[['bmi', 'age_group', 'lifestyle_risk', 'city_tier', 'income_lpa', 'occupation']]
y = df_feat[['insurance_premium_category']]

In [30]:
X

Unnamed: 0,bmi,age_group,lifestyle_risk,city_tier,income_lpa,occupation
0,0.002755,middle_aged,medium,1,5.4,Government Employee
1,0.003103,adult,medium,2,13.2,Designer
2,0.002924,middle_aged,medium,1,31.7,Sales Executive
3,0.002194,young,medium,1,43.2,Sales Executive
4,0.001730,young,medium,2,44.9,Sales Executive
...,...,...,...,...,...,...
201,0.003181,adult,medium,2,39.0,Designer
202,0.002882,senior,medium,2,45.8,Government Employee
203,0.002034,young,medium,1,16.7,Farmer
204,0.003035,adult,medium,1,31.4,Journalist


In [31]:
y

Unnamed: 0,insurance_premium_category
0,Medium
1,Medium
2,High
3,Medium
4,Low
...,...
201,Medium
202,Medium
203,Low
204,High


In [32]:
# Define categorical and numeric features
categorical_features = ["age_group", "lifestyle_risk", "occupation", "city_tier"]
numeric_features = ["bmi", "income_lpa"]

In [33]:
# Create column transformer for OHE(One Hot Encodeing)
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(), categorical_features),
        ("num", "passthrough", numeric_features)
    ]
)

In [34]:
# Create a pipeline with preprocessing and random forest classifier
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", RandomForestClassifier(random_state=42))
    ]
)

In [37]:
# Split data and train model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
pipeline.fit(X_train, y_train)

  return fit_method(estimator, *args, **kwargs)


In [38]:
# Predict and evaluate
y_pred = pipeline.predict(X_test)
accuracy_score(y_test, y_pred)

0.5714285714285714

In [39]:
X_test.sample(5)

Unnamed: 0,bmi,age_group,lifestyle_risk,city_tier,income_lpa,occupation
186,0.002553,middle_aged,medium,2,30.3,Farmer
180,0.002121,young,medium,2,39.8,Farmer
189,0.002293,adult,medium,1,41.9,Construction Worker
145,0.002648,adult,medium,1,16.5,Chef
174,0.001827,young,medium,2,22.8,Construction Worker


In [40]:
import pickle

# Save the trained pipeline using pickle
pickle_model_path = "model.pkl"

with open(pickle_model_path, "wb") as f:
  pickle.dump(pipeline, f)