In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, accuracy_score
import numpy as np


In [3]:
df = pd.read_csv('insurance.csv')

In [4]:
df.sample(10)

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category
76,62,99.1,1.5,1.12,False,Mysore,retired,High
96,26,113.8,1.54,34.01,False,Delhi,private_job,Low
32,47,113.7,1.9,50.0,False,Jalandhar,private_job,Medium
84,75,86.2,1.73,0.62,True,Jaipur,retired,High
25,59,60.2,1.55,30.0,False,Mysore,government_job,Low
46,42,83.0,1.57,25.57,True,Kolkata,unemployed,High
9,58,74.4,1.73,43.07,False,Pune,business_owner,Low
31,39,51.1,1.83,11.77,True,Lucknow,private_job,Medium
35,59,59.3,1.69,43.28,True,Chandigarh,private_job,Medium
27,58,111.4,1.78,34.33,False,Lucknow,private_job,Medium


**Feature Engineering**


In [5]:
feat = df.copy()

In [6]:
feat.sample(5)

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category
26,33,79.0,1.61,23.61,False,Jaipur,freelancer,Medium
12,42,95.2,1.78,17.58,True,Chandigarh,freelancer,High
35,59,59.3,1.69,43.28,True,Chandigarh,private_job,Medium
3,22,109.4,1.55,3.34,True,Mumbai,student,Medium
93,23,79.4,1.85,1.28,False,Indore,student,Low


In [7]:
#Feature 01
feat["bmi"] = (feat["weight"]) / (feat["height"] ** 2)
feat["bmi"].sample(5)

94    33.266002
62    21.738481
82    17.874812
34    32.914286
24    19.669038
Name: bmi, dtype: float64

In [8]:
#Feature 02
def age_group(age):
    if age < 25:
        return "Young"
    elif age > 40:
        return "Middle Age"
    elif 25 < age <59:
        return "Adult"
    return "Senior"

In [9]:
feat["age_group"] = feat["age"].apply(age_group)
feat["age_group"]

0     Middle Age
1          Adult
2          Adult
3          Young
4     Middle Age
         ...    
95         Adult
96         Adult
97    Middle Age
98         Adult
99         Adult
Name: age_group, Length: 100, dtype: object

In [10]:
#Feature 03
def lifestyle_risk(row):
    if row["smoker"] and row["bmi"] > 30:
        return "High"
    elif row["smoker"] or row["bmi"] > 27:
        return  " Medium"
    else:
        return "Low"

In [11]:
feat["lifestyle"] = feat.apply(lifestyle_risk,axis=1)
feat["lifestyle"]

0      Medium
1      Medium
2         Low
3        High
4      Medium
       ...   
95        Low
96     Medium
97        Low
98     Medium
99     Medium
Name: lifestyle, Length: 100, dtype: object

In [12]:
df["city"].unique()

array(['Jaipur', 'Chennai', 'Indore', 'Mumbai', 'Kota', 'Hyderabad',
       'Delhi', 'Chandigarh', 'Pune', 'Kolkata', 'Lucknow', 'Gaya',
       'Jalandhar', 'Mysore', 'Bangalore'], dtype=object)

In [13]:
tier_1 = [
    "Chennai",
    "Mumbai",
    "Hyderabad",
    "Delhi",
    "Pune",
    "Kolkata",
    "Bangalore"
]

tier_2 = [
    "Jaipur",
    "Indore",
    "Kota",
    "Chandigarh",
    "Lucknow",
    "Gaya",
    "Jalandhar",
    "Mysore"
]


In [14]:
def city_tier(city):
    if city in tier_1:
        return 1
    elif city in tier_2:
        return 2
    else:
        return 3

In [15]:
feat["city_tier"] = feat["city"].apply(city_tier)
feat["city_tier"]

0     2
1     1
2     2
3     1
4     2
     ..
95    2
96    1
97    1
98    1
99    1
Name: city_tier, Length: 100, dtype: int64

In [16]:
feat.drop(columns=["age","height","weight","smoker","city"])

Unnamed: 0,income_lpa,occupation,insurance_premium_category,bmi,age_group,lifestyle,city_tier
0,2.92000,retired,High,49.227482,Middle Age,Medium,2
1,34.28000,freelancer,Low,30.189017,Adult,Medium,1
2,36.64000,freelancer,Low,21.118382,Adult,Low,2
3,3.34000,student,Medium,45.535900,Young,High,1
4,3.94000,retired,High,24.296875,Middle Age,Medium,2
...,...,...,...,...,...,...,...
95,19.64000,business_owner,Low,21.420747,Adult,Low,2
96,34.01000,private_job,Low,47.984483,Adult,Medium,1
97,44.86000,freelancer,Low,18.765432,Middle Age,Low,1
98,28.30000,business_owner,Low,30.521676,Adult,Medium,1


In [17]:
feat

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category,bmi,age_group,lifestyle,city_tier
0,67,119.8,1.56,2.92000,False,Jaipur,retired,High,49.227482,Middle Age,Medium,2
1,36,101.1,1.83,34.28000,False,Chennai,freelancer,Low,30.189017,Adult,Medium,1
2,39,56.8,1.64,36.64000,False,Indore,freelancer,Low,21.118382,Adult,Low,2
3,22,109.4,1.55,3.34000,True,Mumbai,student,Medium,45.535900,Young,High,1
4,69,62.2,1.60,3.94000,True,Indore,retired,High,24.296875,Middle Age,Medium,2
...,...,...,...,...,...,...,...,...,...,...,...,...
95,36,52.8,1.57,19.64000,False,Indore,business_owner,Low,21.420747,Adult,Low,2
96,26,113.8,1.54,34.01000,False,Delhi,private_job,Low,47.984483,Adult,Medium,1
97,52,60.8,1.80,44.86000,False,Hyderabad,freelancer,Low,18.765432,Middle Age,Low,1
98,27,101.1,1.82,28.30000,False,Kolkata,business_owner,Low,30.521676,Adult,Medium,1


**Split X & Y**

In [18]:
X = feat[["income_lpa","occupation","bmi","age_group","lifestyle","city_tier"]]
y = feat["insurance_premium_category"]

In [19]:
X

Unnamed: 0,income_lpa,occupation,bmi,age_group,lifestyle,city_tier
0,2.92000,retired,49.227482,Middle Age,Medium,2
1,34.28000,freelancer,30.189017,Adult,Medium,1
2,36.64000,freelancer,21.118382,Adult,Low,2
3,3.34000,student,45.535900,Young,High,1
4,3.94000,retired,24.296875,Middle Age,Medium,2
...,...,...,...,...,...,...
95,19.64000,business_owner,21.420747,Adult,Low,2
96,34.01000,private_job,47.984483,Adult,Medium,1
97,44.86000,freelancer,18.765432,Middle Age,Low,1
98,28.30000,business_owner,30.521676,Adult,Medium,1


In [20]:
y

0       High
1        Low
2        Low
3     Medium
4       High
       ...  
95       Low
96       Low
97       Low
98       Low
99       Low
Name: insurance_premium_category, Length: 100, dtype: object

In [21]:
feat

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category,bmi,age_group,lifestyle,city_tier
0,67,119.8,1.56,2.92000,False,Jaipur,retired,High,49.227482,Middle Age,Medium,2
1,36,101.1,1.83,34.28000,False,Chennai,freelancer,Low,30.189017,Adult,Medium,1
2,39,56.8,1.64,36.64000,False,Indore,freelancer,Low,21.118382,Adult,Low,2
3,22,109.4,1.55,3.34000,True,Mumbai,student,Medium,45.535900,Young,High,1
4,69,62.2,1.60,3.94000,True,Indore,retired,High,24.296875,Middle Age,Medium,2
...,...,...,...,...,...,...,...,...,...,...,...,...
95,36,52.8,1.57,19.64000,False,Indore,business_owner,Low,21.420747,Adult,Low,2
96,26,113.8,1.54,34.01000,False,Delhi,private_job,Low,47.984483,Adult,Medium,1
97,52,60.8,1.80,44.86000,False,Hyderabad,freelancer,Low,18.765432,Middle Age,Low,1
98,27,101.1,1.82,28.30000,False,Kolkata,business_owner,Low,30.521676,Adult,Medium,1


In [22]:
#Define categorical & Numerical features

categorical_feat = ["occupation","age_group","lifestyle"]
numerical_feat = ["income_lpa","bmi","city_tier"]

In [23]:
preprocessor = ColumnTransformer(transformers=[
                                ("cat",OneHotEncoder(),categorical_feat),
                                ("num","passthrough",numerical_feat)
                                ]
                )

#(name, transformer, columns)


In [24]:
pipeline=Pipeline(steps=[
    ("preprocessor",preprocessor),
    ("classifier",RandomForestClassifier(random_state=42))
])

#(name, object)


In [25]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=1)

In [26]:
pipeline.fit(X_train,y_train)

0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [27]:
y_pred=pipeline.predict(X_test)
accuracy_score(y_test,y_pred)

0.9

In [28]:
X_test.sample(5)

Unnamed: 0,income_lpa,occupation,bmi,age_group,lifestyle,city_tier
82,12.96,unemployed,17.874812,Adult,Low,1
84,0.62,retired,28.801497,Middle Age,Medium,2
36,0.53,retired,21.713266,Middle Age,Low,1
31,11.77,private_job,15.258742,Adult,Medium,2
10,32.78,business_owner,22.949982,Adult,Medium,1


In [29]:
import pickle

pickle_model_path = "model.pkl"

with open(pickle_model_path,"wb" ) as f:
    pickle.dump(pipeline,f)

In [33]:
feat["weight"].sample(4)

88     97.4
83     83.3
60    101.3
24     54.2
Name: weight, dtype: float64