
# Task 4 — Predicting Insurance Claim Amounts

**Objective:** Train a Linear Regression model to estimate medical insurance charges.  
**Metrics:** MAE and RMSE.  
**Visuals:** BMI, age, smoking status vs. charges.


In [None]:

import pandas as pd, numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Candidate locations for insurance.csv, in priority order (first existing one wins).
data_path_options = list(map(Path, (
    'data/insurance.csv',
    '../data/insurance.csv',
    '/mnt/data/devhub_ds_tasks/data/insurance.csv',
)))


In [None]:

# Load the dataset from the first candidate path that exists on disk.
# If none of the candidates exists, stop with a FileNotFoundError that tells
# the reader where to put the file.
df = None  # reset first so a frame left over from an earlier run cannot leak through
found_csv = next((candidate for candidate in data_path_options if candidate.exists()), None)
if found_csv is None:
    raise FileNotFoundError("Place 'insurance.csv' under ./data/ and re-run.")
df = pd.read_csv(found_csv)

# Quick sanity overview: dimensions, first rows, summary statistics, missing values.
print("Shape:", df.shape)
display(df.head(), df.describe(include='all'))
null_counts = df.isna().sum()
print("\nNulls per column:\n", null_counts)


In [None]:

# Visuals: how BMI, age and smoking status each relate to charges.

# BMI vs charges
fig, ax = plt.subplots()
ax.scatter(df['bmi'], df['charges'], alpha=0.7)
ax.set_xlabel("BMI")
ax.set_ylabel("Charges")
ax.set_title("BMI vs Charges")
plt.show()

# Age vs charges
fig, ax = plt.subplots()
ax.scatter(df['age'], df['charges'], alpha=0.7)
ax.set_xlabel("Age")
ax.set_ylabel("Charges")
ax.set_title("Age vs Charges")
plt.show()

# Smoking status vs charges.
# Smoker is binary, so every point would sit on x=0 or x=1; a small uniform
# horizontal jitter (+/-0.05) spreads them out so density is visible.
# The jitter comes from a seeded generator so the figure is identical on
# every Restart & Run All (the original used the unseeded global RNG).
jitter_rng = np.random.default_rng(42)
x = (df['smoker'] == 'yes').astype(int) + (jitter_rng.random(len(df)) - 0.5) * 0.1
fig, ax = plt.subplots()
ax.scatter(x, df['charges'], alpha=0.5)
ax.set_xticks([0, 1])
ax.set_xticklabels(['no', 'yes'])
ax.set_xlabel("Smoker")
ax.set_ylabel("Charges")
ax.set_title("Smoking Status vs Charges")
plt.show()


In [None]:

# Preprocess & Train
# Features are every column except the target `charges`.
X = df.drop(columns=['charges'])
y = df['charges']

# Split feature columns by dtype: numeric columns get standardized,
# everything else is treated as categorical and one-hot encoded.
numeric = X.select_dtypes(include=[np.number]).columns.tolist()
categorical = X.select_dtypes(exclude=[np.number]).columns.tolist()

pre = ColumnTransformer([
    ('num', StandardScaler(), numeric),
    # handle_unknown='ignore': a category unseen during fit encodes as all zeros
    # instead of raising at predict time.
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical),
])

# Keeping preprocessing inside the pipeline means it is fitted on the
# training split only, so no test-set statistics leak into the model.
model = Pipeline([('pre', pre), ('lr', LinearRegression())])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model.fit(X_train, y_train)

# Evaluate on the held-out 20%.
preds = model.predict(X_test)
mae = mean_absolute_error(y_test, preds)
# RMSE is computed explicitly as sqrt(MSE): the `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so passing it raises TypeError on current releases.
rmse = float(np.sqrt(mean_squared_error(y_test, preds)))
print("MAE:", round(mae,2), "| RMSE:", round(rmse,2))


In [None]:

# Coefficient magnitudes (proxy importance)
# Pull the fitted steps back out of the pipeline.
lr = model.named_steps['lr']
pre = model.named_steps['pre']

# Look the fitted transformers and their columns up by the names they were
# registered under ('num', 'cat') rather than by position, so the cell keeps
# working if the transformer list is ever reordered or extended.
fitted_columns = {name: columns for name, _, columns in pre.transformers_}
num_features = fitted_columns['num']
cat_encoder = pre.named_transformers_['cat']
cat_features = cat_encoder.get_feature_names_out(fitted_columns['cat'])

# ColumnTransformer concatenates outputs in transformer order (num, then cat),
# which is the order lr.coef_ follows.
feature_names = list(num_features) + list(cat_features)

# Absolute value: rank by strength of association, ignoring direction.
coefs = pd.Series(np.abs(lr.coef_), index=feature_names).sort_values(ascending=False)
coefs.head(10)



# Conclusion

- **Signal:** Charges rise with **age** and **BMI**; **smoking** corresponds to much higher costs.  
- **Model:** Linear Regression with scaling + OHE gives reasonable **MAE/RMSE**.  
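- **Interpretation:** The largest absolute coefficients align with **smoker**, **age**, and **BMI**. Numeric coefficients are per standard deviation (after scaling) while one-hot coefficients are per category, so the magnitudes are only a rough proxy for importance.  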
- **Next:** Try non-linear models and interactions; consider log(`charges`) for error stability.
