In [3]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pickle
import os


In [8]:
# Display the kernel's current working directory; the relative file names used
# below for the CSV input and the pickle output resolve against it.
os.getcwd()

'c:\\Users\\Nitish\\Desktop\\Coding\\FastAPI\\Machine_Learning'

In [10]:
# Load the raw dataset; the relative path means the CSV must sit in the
# current working directory.
df = pd.read_csv("Insurance_Premium_Data.csv")

In [11]:
# Sanity-check the raw frame's dimensions as (rows, columns).
df.shape

(200, 8)

In [12]:
# Work on a feature copy so the engineered columns added below never modify
# the raw frame `df`.
df_feat = df.copy()

In [13]:
# --- Feature 1: BMI ---
# BMI = weight / height^2, computed element-wise over the two columns.
# NOTE(review): the BMI cut-offs used later (27 and 30) presume weight in kg
# and height in metres — confirm the units of the CSV columns.
df_feat["bmi"] = df_feat["weight"] / (df_feat["height"] ** 2)

# --- Feature 2: Age Group ---
def age_group(age: int) -> str:
    """Bucket an age into a coarse life-stage label.

    Upper bounds are exclusive: below 25 is "young", from 25 up to (but not
    including) 45 is "adult", from 45 up to (but not including) 60 is
    "middle_aged", and anything else is "senior".
    """
    # Ordered (exclusive upper bound, label) pairs; the first match wins.
    brackets = ((25, "young"), (45, "adult"), (60, "middle_aged"))
    for upper_bound, label in brackets:
        if age < upper_bound:
            return label
    return "senior"

# Derive the age_group column by passing every value of `age` through age_group().
df_feat["age_group"] = df_feat["age"].apply(age_group)

In [14]:
# --- Feature 3: Lifestyle Risk ---
# Textual spellings of "not a smoker" that a plain bool() would treat as True.
_FALSY_SMOKER_STRINGS = {"", "0", "f", "false", "n", "no"}


def _is_smoker(flag) -> bool:
    """Interpret a smoker flag, reading textual negatives such as "no" as False."""
    if isinstance(flag, str):
        # bool("no") is True because any non-empty string is truthy, so text
        # flags are compared against the known negative spellings instead.
        return flag.strip().lower() not in _FALSY_SMOKER_STRINGS
    return bool(flag)


def lifestyle_risk(row) -> str:
    """Classify a record's lifestyle risk from its smoker flag and BMI.

    Parameters
    ----------
    row
        Anything indexable by the keys "smoker" and "bmi", e.g. a DataFrame
        row when called through ``DataFrame.apply(..., axis=1)``. "smoker"
        may be a bool, a 0/1 number, or text such as "yes"/"no".

    Returns
    -------
    str
        "high" for a smoker with BMI above 30; "medium" for any other smoker
        or for a non-smoker with BMI above 27; "low" otherwise.

    Raises
    ------
    KeyError
        If "smoker" or "bmi" is missing from ``row``.
    """
    smokes = _is_smoker(row["smoker"])
    bmi = row["bmi"]

    if smokes:
        return "high" if bmi > 30 else "medium"
    return "medium" if bmi > 27 else "low"

# Row-wise apply (axis=1) because the rule reads both `smoker` and `bmi` from each record.
df_feat["lifestyle_risk"] = df_feat.apply(lifestyle_risk, axis=1)


In [15]:
# --- US city tiers (rough/population-based buckets) ---
# city_tier() tests membership by exact string match; a city that appears in
# neither set falls through to tier 3.
tier_1_cities = {
    "New York", "Los Angeles", "Chicago", "Houston", "Phoenix",
    "Philadelphia", "San Antonio", "San Diego", "Dallas", "San Jose",
    "Austin", "Jacksonville", "Fort Worth", "Columbus", "Charlotte",
    "San Francisco", "Indianapolis", "Seattle", "Denver", "Washington",
    "Boston"
}

# tier_1_cities is checked first by city_tier(), so a name listed in both sets
# would resolve to tier 1.
tier_2_cities = {
    "Nashville", "El Paso", "Detroit", "Oklahoma City", "Portland",
    "Las Vegas", "Memphis", "Louisville", "Baltimore", "Milwaukee",
    "Albuquerque", "Tucson", "Fresno", "Sacramento", "Mesa",
    "Kansas City", "Atlanta", "Omaha", "Colorado Springs", "Raleigh",
    "Miami", "Long Beach", "Virginia Beach", "Oakland", "Minneapolis",
    "Tulsa", "Arlington", "Tampa", "New Orleans", "Wichita",
    "Cleveland", "Bakersfield", "Aurora", "Anaheim", "Honolulu",
    "Henderson", "Riverside", "Corpus Christi", "Lexington",
    "Stockton", "Hialeah", "Anchorage", "Plano", "Greensboro"
}

In [16]:
def city_tier(city: str) -> int:
    """Return the tier number for a city name.

    Gives 1 when the name is in ``tier_1_cities``, 2 when it is in
    ``tier_2_cities``, and 3 for anything else. The lookup is an exact,
    case-sensitive set-membership test.
    """
    if city in tier_1_cities:
        return 1
    return 2 if city in tier_2_cities else 3

# Derive the city_tier column by passing every value of `city` through city_tier().
df_feat["city_tier"] = df_feat["city"].apply(city_tier)

In [17]:
# --- Select features/target ---
# X holds the four engineered columns plus income_lpa and occupation; the raw
# columns the engineered ones were derived from (age, weight, height, smoker,
# city) are not included.
X = df_feat[["bmi", "age_group", "lifestyle_risk", "city_tier", "income_lpa", "occupation"]]
# y is the label column the classifier is trained to predict.
y = df_feat["insurance_premium_category"]


In [18]:
# --- Preprocessing: categorical & numeric ---
# Numeric columns are forwarded to the model unchanged.
numeric_features = ["bmi", "income_lpa"]
# Columns to one-hot encode; city_tier is an integer label, handled as a category.
categorical_features = ["age_group", "lifestyle_risk", "occupation", "city_tier"]

# handle_unknown="ignore" makes a category unseen during fit encode as all
# zeros at transform time instead of raising.
preprocessor = ColumnTransformer(
    [
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
        ("num", "passthrough", numeric_features),
    ]
)

In [19]:
# --- Pipeline with RandomForest ---
# Chain the column preprocessing and the classifier so they are fitted,
# applied, and serialized as a single estimator; random_state fixes the
# forest's randomness.
pipeline = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("classifier", RandomForestClassifier(random_state=42)),
    ]
)


In [20]:
# --- Train/test split ---
# Hold out 20% of the rows for evaluation. stratify=y keeps the target's class
# proportions similar in both splits, and random_state pins the shuffle so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1, stratify=y
)
# Fit the preprocessing and the classifier on the training split only.
pipeline.fit(X_train, y_train)


0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [21]:
# --- Evaluate ---
# Score the fitted pipeline on the held-out split: overall accuracy, then
# per-class precision, recall and F1.
y_pred = pipeline.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.625
              precision    recall  f1-score   support

        High       0.75      0.43      0.55         7
         Low       0.54      0.64      0.58        11
      Medium       0.65      0.68      0.67        22

    accuracy                           0.62        40
   macro avg       0.65      0.58      0.60        40
weighted avg       0.64      0.62      0.62        40



In [22]:
# --- Persist trained pipeline ---
# The whole pipeline (preprocessing + classifier) is pickled as one object and
# written to model.pkl in the current working directory.
# NOTE: unpickling can execute arbitrary code — only load model.pkl from a
# trusted source.
with open("model.pkl", "wb") as f:
    pickle.dump(pipeline, f)

print("Saved model to model.pkl")

Saved model to model.pkl
