In [27]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, accuracy_score


In [58]:
df = pd.read_json("~/Data sets/realistic_patient_data.json")

In [59]:
df

Unnamed: 0,age,weight,height,smoker,income_lpa,city,occupation,insurance_premium
0,33,51.2,1.67,True,6.07,Kolkata,Civil Servant,medium
1,21,58.7,1.73,True,40.48,Hubli,Startup Founder,low
2,77,84.7,1.63,True,6.21,Lucknow,Nurse,high
3,21,48.6,1.48,False,10.46,Kolkata,Electrician,low
4,72,69.2,1.78,True,5.69,Patna,Data Scientist,medium
...,...,...,...,...,...,...,...,...
995,52,59.7,1.62,True,4.21,Pune,Construction Worker,medium
996,25,96.4,1.60,False,9.60,Jodhpur,Craftsperson,medium
997,44,64.9,1.68,False,3.06,Patna,Farmer,low
998,68,65.1,1.75,False,13.55,Bengaluru,Artist,low


In [60]:
df_feat = df.copy()

In [61]:
# df_feat["height"] = round(df_feat["height"]/100,2)

In [62]:
df_feat

Unnamed: 0,age,weight,height,smoker,income_lpa,city,occupation,insurance_premium
0,33,51.2,1.67,True,6.07,Kolkata,Civil Servant,medium
1,21,58.7,1.73,True,40.48,Hubli,Startup Founder,low
2,77,84.7,1.63,True,6.21,Lucknow,Nurse,high
3,21,48.6,1.48,False,10.46,Kolkata,Electrician,low
4,72,69.2,1.78,True,5.69,Patna,Data Scientist,medium
...,...,...,...,...,...,...,...,...
995,52,59.7,1.62,True,4.21,Pune,Construction Worker,medium
996,25,96.4,1.60,False,9.60,Jodhpur,Craftsperson,medium
997,44,64.9,1.68,False,3.06,Patna,Farmer,low
998,68,65.1,1.75,False,13.55,Bengaluru,Artist,low


In [63]:
df_feat["occupation"].unique()

array(['Civil Servant', 'Startup Founder', 'Nurse', 'Electrician',
       'Data Scientist', 'Cab Driver', 'Farmer', 'Teacher',
       'Software Engineer', 'Architect', 'Artist', 'Doctor',
       'Construction Worker', 'Fitness Trainer', 'Banker', 'Craftsperson',
       'Delivery Executive', 'Journalist', 'Mechanic', 'Shopkeeper'],
      dtype=object)

In [64]:
df_feat["BMI"] = round(df_feat["weight"]/(df_feat["height"]**2),2)

In [65]:
def age_group(age):
    if age < 25:
        return "young"
    elif age < 45:
        return "adult"
    elif age < 60:
        return "middle_aged"
    else:
        return "senior"
df_feat["age_group"] = df_feat["age"].apply(age_group)

In [66]:
def lifestyle_risk(row):
    if row["smoker"] and row["BMI"] > 30:
        return "high"
    elif row["smoker"] or row["BMI"] > 27:
        return "medium"
    else:
        return "low"

df_feat["lifestyle_risk"] = df_feat.apply(lifestyle_risk,axis=1)

In [67]:
df_feat['city'].unique()

array(['Kolkata', 'Hubli', 'Lucknow', 'Patna', 'Vadodara', 'Chennai',
       'Vijayawada', 'Raipur', 'Mumbai', 'Nagpur', 'Ahmedabad', 'Udaipur',
       'Delhi', 'Bengaluru', 'Jaipur', 'Hyderabad', 'Ranchi', 'Pune',
       'Jodhpur', 'Mysuru', 'Surat', 'Indore', 'Bhopal', 'Guwahati'],
      dtype=object)

In [68]:
tier_1 = ["Mumbai", "Delhi", "Bangalore", "Chennai", "Kolkata", "Bhagyanagar", "Pune"]
tier_2 = [
    "Jaipur", "Chandigarh", "Indore", "Lucknow", "Patna", "Ranchi", "Visakhapatnam", "Coimbatore",
    "Bhopal", "Nagpur", "Vadodara", "Surat", "Rajkot", "Jodhpur", "Raipur", "Amritsar", "Varanasi",
    "Agra", "Dehradun", "Mysore", "Jabalpur", "Guwahati", "Thiruvananthapuram", "Ludhiana", "Nashik",
    "Prayagraj"
]

In [69]:
def city_tier(city):
    if city in tier_1:
        return 1
    elif city in tier_2:
        return 2
    else:
        return 3

df_feat["city_tier"] = df_feat["city"].apply(city_tier)

In [70]:
df_feat = df_feat.drop(columns=["age","weight","height","smoker","city"])
df_feat.sample(15)

Unnamed: 0,income_lpa,occupation,insurance_premium,BMI,age_group,lifestyle_risk,city_tier
770,12.73,Architect,medium,31.43,middle_aged,medium,2
846,4.15,Banker,medium,31.64,adult,medium,2
172,3.69,Construction Worker,low,29.69,adult,medium,2
131,4.1,Doctor,low,15.69,middle_aged,low,3
765,30.51,Journalist,low,19.29,senior,low,3
874,5.28,Shopkeeper,low,28.98,middle_aged,medium,1
271,10.92,Architect,high,36.86,adult,high,2
499,15.89,Teacher,medium,30.82,adult,medium,3
492,7.03,Delivery Executive,low,21.12,senior,low,3
291,7.26,Delivery Executive,low,21.25,senior,low,2


In [71]:
df_feat.sample(15)

Unnamed: 0,income_lpa,occupation,insurance_premium,BMI,age_group,lifestyle_risk,city_tier
497,3.87,Journalist,medium,21.26,senior,medium,3
528,8.61,Cab Driver,low,19.95,middle_aged,low,2
513,9.67,Data Scientist,low,28.07,young,medium,3
609,17.17,Teacher,low,20.08,middle_aged,low,2
332,17.1,Fitness Trainer,low,26.61,middle_aged,low,3
202,10.95,Software Engineer,medium,35.0,middle_aged,medium,1
280,10.13,Banker,low,21.13,young,low,1
994,10.38,Mechanic,medium,28.4,adult,medium,3
134,11.5,Artist,medium,27.37,senior,medium,3
290,3.35,Teacher,medium,22.39,young,medium,3


In [72]:
#OnehotEncoding
trf1 = ColumnTransformer([('Encode_categorical_columns',OneHotEncoder(sparse_output=False,handle_unknown='ignore'),[1,3,4,5])],remainder='passthrough')

In [73]:
trf2 = RandomForestClassifier(random_state=42)

In [74]:
pipeline = Pipeline(steps=[("trf1",trf1),("trf2",trf2)])

In [76]:
X = df_feat.drop(columns="insurance_premium")
y = df_feat["insurance_premium"]

In [77]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [78]:
pipeline.fit(X_train,y_train)

In [79]:
y_pred = pipeline.predict(X_test)
accuracy_score(y_test,y_pred)

0.89

In [80]:
import pickle
pickle_model_path = "model.pkl"
with open(pickle_model_path,"wb") as f:
    pickle.dump(pipeline, f)



In [81]:
y_pred


array(['high', 'medium', 'medium', 'low', 'low', 'medium', 'low',
       'medium', 'low', 'low', 'low', 'low', 'low', 'low', 'high', 'low',
       'medium', 'medium', 'medium', 'medium', 'medium', 'low', 'low',
       'medium', 'low', 'high', 'high', 'low', 'high', 'low', 'low',
       'high', 'medium', 'low', 'medium', 'medium', 'medium', 'low',
       'low', 'high', 'medium', 'low', 'medium', 'low', 'medium', 'low',
       'medium', 'low', 'low', 'medium', 'low', 'low', 'low', 'low',
       'medium', 'low', 'low', 'low', 'medium', 'low', 'medium', 'low',
       'medium', 'low', 'low', 'medium', 'medium', 'low', 'low', 'low',
       'medium', 'low', 'high', 'medium', 'low', 'low', 'low', 'low',
       'low', 'medium', 'medium', 'medium', 'medium', 'low', 'medium',
       'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low',
       'medium', 'high', 'medium', 'low', 'low', 'low', 'high', 'medium',
       'low', 'low', 'medium', 'medium', 'medium', 'medium', 'medium',
       'm

In [82]:
y_test

507      high
818    medium
452       low
368       low
242       low
        ...  
430       low
874       low
550    medium
608       low
207       low
Name: insurance_premium, Length: 200, dtype: object

In [83]:
X

Unnamed: 0,income_lpa,occupation,BMI,age_group,lifestyle_risk,city_tier
0,6.07,Civil Servant,18.36,adult,medium,1
1,40.48,Startup Founder,19.61,young,medium,3
2,6.21,Nurse,31.88,senior,high,2
3,10.46,Electrician,22.19,young,low,1
4,5.69,Data Scientist,21.84,senior,medium,2
...,...,...,...,...,...,...
995,4.21,Construction Worker,22.75,middle_aged,medium,1
996,9.60,Craftsperson,37.66,adult,medium,2
997,3.06,Farmer,22.99,adult,low,2
998,13.55,Artist,21.26,senior,low,3
