In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
import joblib
import os

# Ensure the output directory for the saved model exists before training;
# exist_ok=True keeps this cell safe to re-run.
os.makedirs(name="models", exist_ok=True)

# Load data

In [None]:
# Read the raw CSV into a DataFrame; the path is relative to the
# notebook's working directory.
csv_path = "data/insurance.csv"
df = pd.read_csv(csv_path)

# Features and target

In [None]:
# "charges" is the regression target; every other column of df is a feature.
y = df["charges"]
X = df.drop(columns="charges")

# Identify categorical and numerical columns

In [None]:
# Columns routed to the one-hot encoder in the preprocessing step.
cat_cols = [
    "sex",
    "smoker",
    "region",
]

# Columns routed to the standard scaler in the preprocessing step.
num_cols = [
    "age",
    "bmi",
    "children",
]

# Preprocessing

In [None]:
# Column-wise preprocessing applied inside the model pipeline:
#   "num" -> standardise the numeric columns listed in num_cols
#   "cat" -> one-hot encode the columns listed in cat_cols; drop="first"
#            omits the first category of each feature from the encoding
preprocessor = ColumnTransformer([
    ("num", StandardScaler(), num_cols),
    ("cat", OneHotEncoder(drop="first"), cat_cols),
])

# Model pipeline

In [None]:
# Chain preprocessing and the regressor into a single estimator so the same
# transformations are applied at fit time and at predict time.
model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        (
            "regressor",
            RandomForestRegressor(
                n_estimators=100,
                random_state=42,  # fixed seed for repeatable forests
            ),
        ),
    ]
)

# Split

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# partition repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Train

In [None]:
# Fit the preprocessing steps and the regressor on the training split only.
model.fit(X=X_train, y=y_train)

# Save model

In [None]:
# Serialise the whole fitted pipeline (preprocessor + regressor) to disk
# under the models/ directory created at the top of the notebook.
joblib.dump(value=model, filename="models/severity_model.pkl")
print("Model trained and saved!")