In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Create the output folder for saved figures up front; exist_ok=True makes
# re-running this cell a no-op when the folder is already there.
os.makedirs(name="figs", exist_ok=True)

# Load data

In [None]:
# Read the insurance dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("data/insurance.csv")

# Fail fast with a clear message if a column used by the analysis cells is absent,
# instead of surfacing later as a bare KeyError in the middle of a test or plot.
missing_cols = {"smoker", "region", "charges"} - set(df.columns)
assert not missing_cols, f"insurance.csv is missing expected columns: {sorted(missing_cols)}"

# Hypothesis 1: Smoker vs Non-Smoker Charges

In [None]:
# Two-sample t-test on charges: smokers vs non-smokers.
# equal_var=False selects Welch's variant, which does not assume the two
# groups share a common variance.
smoker_charges = df.loc[df["smoker"] == "yes", "charges"]
nonsmoker_charges = df.loc[df["smoker"] == "no", "charges"]

t_stat, p_value = stats.ttest_ind(smoker_charges, nonsmoker_charges, equal_var=False)

# Report p in general notation: a fixed ".4f" rounds every p < 0.00005 to
# "0.0000", which reads as exactly zero and hides the order of magnitude.
print(f"T-test for smoker vs non-smoker charges: t={t_stat:.2f}, p={p_value:.3g}")

# Hypothesis 2: Region effect on charges (ANOVA)

In [None]:
# One-way ANOVA: do mean charges differ between regions?
# groupby yields one charges Series per observed region and drops rows whose
# region is missing. Iterating over .unique() would also yield NaN, and an
# equality filter against NaN matches no rows, so an empty group would be
# passed to f_oneway. sort=False keeps groups in first-appearance order.
region_groups = [charges for _, charges in df.groupby("region", sort=False)["charges"]]
f_stat, p_value = stats.f_oneway(*region_groups)

# General notation keeps very small p-values readable (".4f" would print "0.0000").
print(f"ANOVA for region effect on charges: F={f_stat:.2f}, p={p_value:.3g}")

# Visualizations

## Charges by smoker

In [None]:
# Box plot of charges split by smoker status, saved under figs/.
# The figure is closed after saving so it is not kept open in memory.
fig, ax = plt.subplots(figsize=(6, 4))
sns.boxplot(data=df, x="smoker", y="charges", ax=ax)
ax.set_title("Charges by Smoker Status")
fig.tight_layout()
fig.savefig("figs/charges_by_smoker.png", dpi=150)
plt.close(fig)

## Charges by region

In [None]:
# Box plot of charges split by region, saved under figs/.
# The figure is closed after saving so it is not kept open in memory.
fig, ax = plt.subplots(figsize=(6, 4))
sns.boxplot(data=df, x="region", y="charges", ax=ax)
ax.set_title("Charges by Region")
fig.tight_layout()
fig.savefig("figs/charges_by_region.png", dpi=150)
plt.close(fig)