In [22]:
import pickle

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report,accuracy_score

In [23]:
# Read the raw insurance records. All feature engineering is done on a deep
# copy (`data`) so `df` keeps the contents exactly as loaded from the file.
df = pd.read_csv(filepath_or_buffer='insurance.csv')
data = df.copy(deep=True)

In [24]:
# BMI = weight / height squared, rounded to 3 decimal places.
# NOTE(review): the usual BMI formula expects kg and metres — confirm the
# units used in insurance.csv.
data['bmi'] = data['weight'].div(data['height'].pow(2)).round(3)

In [25]:
def age_group(age):
    """Map a numeric age to a coarse age-bracket label.

    Upper bounds are exclusive: below 25 -> "young", below 45 -> "adult",
    below 60 -> "middle_age". Any value that is not below 60 (including
    values for which every comparison is False) -> "senior".
    """
    brackets = ((25, "young"), (45, "adult"), (60, "middle_age"))
    for upper_bound, label in brackets:
        if age < upper_bound:
            return label
    return "senior"

In [26]:
# Label every row with its age bracket (element-wise over the 'age' column).
data['age_group'] = data['age'].map(age_group)

In [27]:
def life_style_risk(data):
    """Classify one record's lifestyle risk from its smoker flag and BMI.

    `data` is a single row (anything indexable by 'smoker' and 'bmi').

    Returns:
        "high"   when the smoker value is truthy and bmi > 30,
        "medium" when the smoker value is truthy or bmi > 27,
        "low"    otherwise.

    NOTE(review): 'smoker' is evaluated by truthiness, so a string such as
    "no" would count as a smoker — presumably the column is boolean; confirm.
    """
    smokes = bool(data['smoker'])
    bmi = data['bmi']
    if smokes and bmi > 30:
        return "high"
    if smokes or bmi > 27:
        return "medium"
    return "low"

In [28]:
# Row-wise application: each row is passed to life_style_risk as a Series.
data['life_style_risk'] = data.apply(life_style_risk, axis='columns')

In [29]:
# Distinct city names present in the data, in order of first appearance.
pd.unique(data['city'])

array(['Jaipur', 'Chennai', 'Indore', 'Mumbai', 'Kota', 'Hyderabad',
       'Delhi', 'Chandigarh', 'Pune', 'Kolkata', 'Lucknow', 'Gaya',
       'Jalandhar', 'Mysore', 'Bangalore'], dtype=object)

In [30]:
# Distinct occupation labels present in the data, in order of first appearance.
pd.unique(data['occupation'])

array(['retired', 'freelancer', 'student', 'government_job',
       'business_owner', 'unemployed', 'private_job'], dtype=object)

In [31]:
# City lookup lists used to derive the city tier. Any city that appears in
# neither list is treated as tier 3 by the lookup function.
tier1 = [
    'Mumbai', 'Delhi', 'Bangalore', 'Chennai',
    'Kolkata', 'Hyderabad', 'Pune',
]
tier2 = [
    "Jaipur", "Chandigarh", "Indore", "Lucknow",
    "Patna", "Ranchi", "Visakhapatnam", "Coimbatore",
    "Bhopal", "Nagpur", "Vadodara", "Surat",
    "Rajkot", "Jodhpur", "Raipur", "Amritsar",
    "Varanasi", "Agra", "Dehradun", "Mysore",
    "Jabalpur", "Guwahati", "Thiruvananthapuram", "Ludhiana",
    "Nashik", "Allahabad", "Udaipur", "Aurangabad",
    "Hubli", "Belgaum", "Salem", "Vijayawada",
    "Tiruchirappalli", "Bhavnagar", "Gwalior", "Dhanbad",
    "Bareilly", "Aligarh", "Gaya", "Kozhikode",
    "Warangal", "Kolhapur", "Bilaspur", "Jalandhar",
    "Noida", "Guntur", "Asansol", "Siliguri",
]

In [32]:
def city_tire(city):
    """Return the tier number for a city name.

    1 if the name is in `tier1`, 2 if it is in `tier2`, otherwise 3.
    Matching is exact (case- and whitespace-sensitive list membership).
    """
    if city in tier1:
        return 1
    return 2 if city in tier2 else 3

In [33]:
# Derive the numeric tier (1, 2 or 3) for every row from its city name.
data['city_tier'] = data['city'].map(city_tire)

In [34]:
# Remove the raw columns now that bmi, age_group, life_style_risk and
# city_tier have been derived from them.
# Rebinding `data` (instead of inplace=True) together with errors='ignore'
# makes this cell safe to re-run: a second execution is a no-op rather than a
# KeyError on columns that are already gone.
data = data.drop(
    columns=['age', 'weight', 'height', 'smoker', 'city'],
    errors='ignore',
)

In [35]:
# Feature matrix (engineered columns plus income and occupation) and the
# target label to predict.
X = data.loc[
    :,
    ["bmi", "age_group", "life_style_risk", "city_tier", "income_lpa", "occupation"],
]
y = data.loc[:, 'insurance_premium_category']

In [36]:
# Column names routed to each preprocessing branch.
# city_tier holds integer codes but is listed with the categorical features.
num_data = ["bmi", "income_lpa"]
cat_data = [
    "age_group",
    "life_style_risk",
    "occupation",
    "city_tier",
]

In [37]:
# Preprocessing: one-hot encode the categorical columns and standardise the
# numeric ones. Columns not listed in either group are dropped
# (ColumnTransformer's default remainder behaviour).
preprocessor = ColumnTransformer(
    transformers=[
        # handle_unknown='ignore': a category that was absent from the
        # training split is encoded as all zeros at predict time instead of
        # raising, so the fitted model does not crash on unseen labels.
        ('cat_data', OneHotEncoder(handle_unknown='ignore'), cat_data),
        ('num_data', StandardScaler(), num_data),
    ]
)

In [38]:
# End-to-end estimator: column preprocessing followed by a random forest.
# The fixed random_state makes the forest reproducible across runs.
pipeline = Pipeline(
    [
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(random_state=42)),
    ]
)

In [39]:
# Inspect the dtype of each feature column before fitting the pipeline.
X.dtypes

bmi                float64
age_group           object
life_style_risk     object
city_tier            int64
income_lpa         float64
occupation          object
dtype: object

In [40]:
# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split),
# then fit the whole pipeline on the remaining 80%.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
    test_size=0.2,
)
pipeline.fit(X_train, y_train)

In [41]:
# Score the fitted pipeline on the held-out rows; the cell displays the
# fraction of correctly predicted labels.
y_pred = pipeline.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9

In [42]:
# Persist the fitted pipeline (preprocessing + classifier) to disk.
# `pickle` is imported in the notebook's top import cell.
# The artifact contains only the pipeline: anything that loads it must build
# the same input columns first (bmi, age_group, life_style_risk, city_tier,
# income_lpa, occupation), because those derivations live in this notebook
# and not inside the pickled object.
pickle_model_path = "model.pkl"
with open(pickle_model_path, "wb") as model_file:
    pickle.dump(pipeline, model_file)