In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn .preprocessing import OneHotEncoder
from sklearn .pipeline import Pipeline
from sklearn .compose import ColumnTransformer
from sklearn .metrics import accuracy_score,classification_report
import numpy as np

In [2]:
data=pd.read_csv(r"C:\Users\mdfir\Downloads\insurance.csv")

In [3]:
data.head()

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category
0,67,119.8,1.56,2.92,False,Jaipur,retired,High
1,36,101.1,1.83,34.28,False,Chennai,freelancer,Low
2,39,56.8,1.64,36.64,False,Indore,freelancer,Low
3,22,109.4,1.55,3.34,True,Mumbai,student,Medium
4,69,62.2,1.6,3.94,True,Indore,retired,High


In [4]:
data_feat=data.copy()

# feature 1:BMI

In [6]:
data_feat["bmi"]=data_feat["weight"]/(data_feat["height"]**2)

# feature 2: age group 

In [8]:
def age_group(age):
    if age<25:
        return "young"
    elif age<45:
        return "adult"
    elif age<60:
        return "middle_aged"
    return "senior"

In [9]:
data_feat["age_group"]=data_feat["age"].apply(age_group)

# feature 3: liferisk

In [11]:
def life_risk(row):
    if row["smoker"] and row["bmi"]>30:
        return "high"
    elif row["smoker"] or row["bmi"]>27:
         return "medium"
    else:
        return "low"
    

In [12]:
data_feat["life_risk"]=data_feat.apply(life_risk,axis=1)

In [13]:
tier_1_cities = ["Mumbai", "Delhi", "Bangalore", "Chennai", "Kolkata", "Hyderabad", "Pune"]
tier_2_cities = [
    "Jaipur", "Chandigarh", "Indore", "Lucknow", "Patna", "Ranchi", "Visakhapatnam", "Coimbatore",
    "Bhopal", "Nagpur", "Vadodara", "Surat", "Rajkot", "Jodhpur", "Raipur", "Amritsar", "Varanasi",
    "Agra", "Dehradun", "Mysore", "Jabalpur", "Guwahati", "Thiruvananthapuram", "Ludhiana", "Nashik",
    "Allahabad", "Udaipur", "Aurangabad", "Hubli", "Belgaum", "Salem", "Vijayawada", "Tiruchirappalli",
    "Bhavnagar", "Gwalior", "Dhanbad", "Bareilly", "Aligarh", "Gaya", "Kozhikode", "Warangal",
    "Kolhapur", "Bilaspur", "Jalandhar", "Noida", "Guntur", "Asansol", "Siliguri"
]

In [14]:
# feature 4 : city  tier

In [15]:
def city_tier(city):
    if city in tier_1_cities:
        return 1
    elif city in tier_2_cities:
        return 2
    else:
        return 3
        

In [16]:
data_feat["city_tier"]=data["city"].apply(city_tier)

In [28]:
data_feat.drop(columns=["age","weight","height","smoker","city"])[['income_lpa', 'occupation', 'bmi', 'age_group', 'life_risk', 'city_tier', 'insurance_premium_category']].sample(5)
     

Unnamed: 0,income_lpa,occupation,bmi,age_group,life_risk,city_tier,insurance_premium_category
41,3.87,retired,22.507433,senior,low,1,Medium
51,28.95,private_job,38.827923,middle_aged,high,2,High
52,2.96,student,47.34472,young,medium,2,Medium
31,11.77,private_job,15.258742,adult,medium,2,Medium
93,1.28,student,23.199416,young,low,2,Low


In [40]:

# Select features and target
X = data_feat[["bmi", "age_group", "life_risk", "city_tier", "income_lpa", "occupation"]]
y = data_feat["insurance_premium_category"]
     
 

In [50]:
categocial_feature=["age_group","life_risk","occupation","city_tier"]
numerical_feature=["bmi","income_lpa"]

In [52]:
step1=ColumnTransformer([
    ("cat",OneHotEncoder(),categocial_feature),
    ("num","passthrough",numerical_feature)
])

In [54]:
pipeline=Pipeline(steps=[

("step1",step1),
    ("classifier",RandomForestClassifier(random_state=42))
    
])

In [56]:
x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=.2,random_state=1)

In [60]:
pipeline.fit(x_train,y_train)

0,1,2
,steps,"[('step1', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [62]:
y_pred=pipeline.predict(x_test)

In [64]:
score=accuracy_score(y_pred,y_test)

In [66]:
score

0.9

In [68]:
import pickle 

In [72]:
file="modelapi.pkl"

In [74]:
with open (file,"wb") as f:
    pickle.dump(pipeline,f)