In [10]:
import sys
from pathlib import Path

# Locate the project root: the nearest directory, starting at the current
# working directory and walking upwards, that contains a 'src' entry.
current_path = Path().resolve()

project_root = next(
    (
        candidate
        for candidate in (current_path, *current_path.parents)
        if (candidate / "src").exists()
    ),
    None,
)
if project_root is None:
    raise RuntimeError("Could not find project root containing 'src' directory")

# Keep the cell idempotent: re-running it must not pile up duplicate entries.
# Any stale copies are removed first so the root always ends up exactly once,
# at the front of sys.path (highest import priority).
root_entry = str(project_root)
while root_entry in sys.path:
    sys.path.remove(root_entry)
sys.path.insert(0, root_entry)
print(f"Project root added to sys.path: {project_root}")


Project root added to sys.path: /workspaces/Task_09_Project


In [11]:
import numpy as np
import pandas as pd

from src.data_loader import load_raw_data
from src.preprocessing import clean_data
from src.modeling import train_linear_model
from src.predict import predict_insurance_cost


In [12]:
# Resolve the dataset relative to the project root (the nearest ancestor of
# the working directory that contains 'src') instead of a machine-specific
# absolute path, so the notebook runs from any checkout location.
search_start = Path().resolve()
project_root = next(
    (
        directory
        for directory in (search_start, *search_start.parents)
        if (directory / "src").exists()
    ),
    None,
)
if project_root is None:
    raise RuntimeError("Could not find project root containing 'src' directory")
data_path = project_root / "data" / "insurance.csv"

# The loader was originally called with a plain string, so keep passing one.
df_raw = load_raw_data(str(data_path))
df_clean = clean_data(df_raw)

# Model charges on the log scale. `assign` returns a new frame, so df_clean
# itself is left untouched.
# NOTE(review): np.log assumes every 'charges' value is strictly positive;
# confirm clean_data guarantees that.
df_log = df_clean.assign(log_charges=np.log(df_clean["charges"]))

# Drop the raw target so it cannot be used as a predictor of its own log.
log_model = train_linear_model(
    df_log.drop(columns=["charges"]),
    target="log_charges"
)

log_model.summary()


0,1,2,3
Dep. Variable:,log_charges,R-squared:,0.768
Model:,OLS,Adj. R-squared:,0.767
Method:,Least Squares,F-statistic:,549.8
Date:,"Thu, 29 Jan 2026",Prob (F-statistic):,0.0
Time:,04:58:24,Log-Likelihood:,-808.52
No. Observations:,1338,AIC:,1635.0
Df Residuals:,1329,BIC:,1682.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,7.0306,0.072,97.112,0.000,6.889,7.173
age,0.0346,0.001,39.655,0.000,0.033,0.036
sex,-0.0754,0.024,-3.091,0.002,-0.123,-0.028
bmi,0.0134,0.002,6.381,0.000,0.009,0.017
children,0.1019,0.010,10.085,0.000,0.082,0.122
smoker,1.5543,0.030,51.333,0.000,1.495,1.614
region_northwest,-0.0638,0.035,-1.827,0.068,-0.132,0.005
region_southeast,-0.1572,0.035,-4.481,0.000,-0.226,-0.088
region_southwest,-0.1290,0.035,-3.681,0.000,-0.198,-0.060

0,1,2,3
Omnibus:,463.882,Durbin-Watson:,2.046
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1673.76
Skew:,1.679,Prob(JB):,0.0
Kurtosis:,7.33,Cond. No.,311.0


In [15]:
# Slope coefficients only; the intercept has no per-unit interpretation.
slopes = log_model.params.drop("const")

# The target is log(charges), so exp(coef) - 1 is the relative change in
# charges for a one-unit increase in the predictor, expressed here in percent.
coef_df = slopes.to_frame("log_coef").assign(
    percent_change=lambda frame: (np.exp(frame["log_coef"]) - 1) * 100
)

# Show the largest effects first, regardless of sign.
coef_df.sort_values("percent_change", key=abs, ascending=False)


Unnamed: 0,log_coef,percent_change
smoker,1.554323,373.188094
region_southeast,-0.157197,-14.546408
region_southwest,-0.128952,-12.098404
children,0.101857,10.722496
sex,-0.075416,-7.264279
region_northwest,-0.063788,-6.179571
age,0.034582,3.518654
bmi,0.013375,1.346467


In [13]:
# Three illustrative applicants. Keys mirror the model's design-matrix columns
# (including the explicit intercept column "const").
# NOTE(review): all three region dummies set to 0 presumably means the
# reference region; the 0/1 coding of "sex" and "smoker" is defined by
# clean_data -- confirm against src.preprocessing.
profiles = {
    "Low Risk (Non-Smoker)": {
        "const": 1,
        "age": 30,
        "sex": 0,
        "bmi": 24,
        "children": 0,
        "smoker": 0,
        "region_northwest": 0,
        "region_southeast": 0,
        "region_southwest": 1,
    },
    "Medium Risk": {
        "const": 1,
        "age": 45,
        "sex": 1,
        "bmi": 29,
        "children": 2,
        "smoker": 0,
        "region_northwest": 1,
        "region_southeast": 0,
        "region_southwest": 0,
    },
    "High Risk (Smoker)": {
        "const": 1,
        "age": 50,
        "sex": 1,
        "bmi": 33,
        "children": 2,
        "smoker": 1,
        "region_northwest": 0,
        "region_southeast": 1,
        "region_southwest": 0,
    },
}

# One record per profile: its label and the predicted annual cost rounded to
# whole dollars.
results = [
    {
        "Profile": profile_name,
        "Estimated Annual Cost ($)": round(
            predict_insurance_cost(log_model, features)["predicted_annual_cost"], 0
        ),
    }
    for profile_name, features in profiles.items()
]

pd.DataFrame(results)


Unnamed: 0,Profile,Estimated Annual Cost ($)
0,Low Risk (Non-Smoker),4403.0
1,Medium Risk,9595.0
2,High Risk (Smoker),51863.0


In [14]:
def cost_band(cost: float, low_max: float = 8000, medium_max: float = 20000) -> str:
    """Map an estimated annual cost to a coarse risk band.

    Parameters
    ----------
    cost : float
        Estimated annual cost, in the same units as the thresholds.
    low_max : float, default 8000
        Exclusive upper bound of the "Low" band.
    medium_max : float, default 20000
        Exclusive upper bound of the "Medium" band.

    Returns
    -------
    str
        "Low" if ``cost < low_max``, "Medium" if
        ``low_max <= cost < medium_max``, otherwise "High".

    Notes
    -----
    A NaN cost compares False against both thresholds and therefore falls
    through to "High"; filter missing predictions before banding.
    """
    if cost < low_max:
        return "Low"
    if cost < medium_max:
        return "Medium"
    return "High"

# Tabulate the per-profile predictions and attach a categorical band derived
# from each estimated cost.
predictions = pd.DataFrame(results)
results_df = predictions.assign(
    **{"Risk Band": predictions["Estimated Annual Cost ($)"].apply(cost_band)}
)
results_df


Unnamed: 0,Profile,Estimated Annual Cost ($),Risk Band
0,Low Risk (Non-Smoker),4403.0,Low
1,Medium Risk,9595.0,Medium
2,High Risk (Smoker),51863.0,High


### Model Output Interpretation

The estimated annual insurance cost reflects the specific risk profile provided as input rather than an average individual.  
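Smoking status is by far the strongest cost driver in the model (smokers are predicted to incur roughly 373% higher charges, all else equal), followed by age (about 3.5% per additional year); body mass index has a smaller but statistically significant effect (about 1.3% per BMI unit), so elevated BMI further raises the estimate.  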
As a result, high estimates indicate increased relative risk rather than a modeling or calculation error.  
When evaluated using lower-risk, non-smoking profiles, the model produces substantially lower and more typical cost estimates.  
Predictions are generated using a log-linear regression corrected for log-scale bias and are intended for illustrative, comparative analysis rather than exact individual forecasting.
