In [2]:
import warnings
# NOTE(review): no category or module is given, so this installs an "ignore"
# filter for every warning raised after this cell runs. Consider narrowing it
# to the specific categories that are known to be noise in this analysis.
warnings.filterwarnings("ignore")

In [3]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

# Location of the raw CSV, relative to the notebook's working directory.
DATA_PATH = "../data/insurance.csv"

# Load the dataset into the frame that the following cells extend and analyse.
df = pd.read_csv(DATA_PATH)

In [4]:
# Boolean mask of rows whose claim total is strictly positive; computed once
# and reused for both the indicator column and the severity column.
claim_filed = df['TotalClaims'].gt(0)

# 1 where TotalClaims > 0, otherwise 0.
df['HasClaim'] = claim_filed.astype(int)
# TotalPremium minus TotalClaims for each row.
df['Margin'] = df['TotalPremium'].sub(df['TotalClaims'])
# TotalClaims on rows that have a claim, NaN on all other rows.
df['ClaimSeverity'] = df['TotalClaims'].where(claim_filed)

In [6]:
# Segmentation and Hypothesis Testing
#
# The three "risk" hypotheses compare how often HasClaim is 1 between groups,
# using a chi-square test of independence on a group x HasClaim table.
# NOTE(review): four tests are run at the same significance level with no
# multiple-comparison correction — confirm that is intended.

# How many of the most frequent postal codes take part in H₀ #2 and H₀ #3.
TOP_N_ZIPS = 5


def chi2_claim_test(frame, group_col, outcome_col='HasClaim'):
    """Chi-square test of independence between a grouping column and an outcome.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data containing both ``group_col`` and ``outcome_col``.
    group_col : str
        Column whose categories form the rows of the contingency table.
    outcome_col : str, default 'HasClaim'
        Column whose categories form the columns of the contingency table.

    Returns
    -------
    tuple
        ``(contingency_table, chi2_statistic, p_value)``.
    """
    contingency = pd.crosstab(frame[group_col], frame[outcome_col])
    # chi2_contingency also returns the degrees of freedom and the expected
    # frequencies; neither is used by this analysis.
    chi2, p_value, _, _ = stats.chi2_contingency(contingency)
    return contingency, chi2, p_value


# H₀ #1: Risk difference across provinces (Chi-Square)
contingency_prov, chi2_1, p_1 = chi2_claim_test(df, 'Province')

# H₀ #2: Risk difference across ZIPs (Chi-Square)
# Only the TOP_N_ZIPS postal codes with the most rows are compared.
top_zips = df['PostalCode'].value_counts().nlargest(TOP_N_ZIPS).index
df_zips = df[df['PostalCode'].isin(top_zips)]
contingency_zip, chi2_2, p_2 = chi2_claim_test(df_zips, 'PostalCode')

# H₀ #3: Margin difference across ZIPs (ANOVA)
# One sample of non-null Margin values per postal code in df_zips.
# NOTE(review): one-way ANOVA assumes roughly normal, equal-variance groups —
# confirm Margin satisfies this, or consider a rank-based alternative.
groups_margin = [grp['Margin'].dropna() for _, grp in df_zips.groupby('PostalCode')]
f_3, p_3 = stats.f_oneway(*groups_margin)

# H₀ #4: Risk difference by Gender (Chi-Square)
# NOTE(review): every distinct Gender value becomes a row of the table —
# verify the column holds only the categories the hypothesis is about.
contingency_gender, chi2_4, p_4 = chi2_claim_test(df, 'Gender')

In [11]:
def interpret(name, p_value, alpha=0.05):
    """Turn a p-value into a one-line hypothesis-test verdict.

    Parameters
    ----------
    name : str
        Label of the null hypothesis; used verbatim as the line prefix.
    p_value : float
        p-value produced by the statistical test.
    alpha : float, default 0.05
        Significance level; H₀ is rejected when ``p_value < alpha``.

    Returns
    -------
    str
        ``"<name>: <decision> (p = <p_value rounded to 4 decimals>)"`` where
        the decision is "Reject H₀", "Fail to reject H₀", or "Inconclusive"
        when the p-value is NaN.
    """
    if np.isnan(p_value):
        # Any comparison with NaN is False, so without this branch a test that
        # could not be evaluated would be reported as "Fail to reject H₀".
        decision = "Inconclusive"
    elif p_value < alpha:
        decision = "Reject H₀"
    else:
        decision = "Fail to reject H₀"
    return f"{name}: {decision} (p = {p_value:.4f})"

# Null-hypothesis label -> p-value of the test that evaluated it, in report order.
hypotheses = {
    "H₀ #1: No risk difference across provinces": p_1,
    "H₀ #2: No risk difference across zip codes": p_2,
    "H₀ #3: No margin difference across zip codes": p_3,
    "H₀ #4: No risk difference between genders": p_4,
}

# One verdict line per hypothesis.
results = [interpret(label, p) for label, p in hypotheses.items()]

print("\n".join(results))

H₀ #1: No risk difference across provinces: Reject H₀ (p = 0.0000)
H₀ #2: No risk difference across zip codes: Reject H₀ (p = 0.0000)
H₀ #3: No margin difference across zip codes: Reject H₀ (p = 0.0469)
H₀ #4: No risk difference between genders: Reject H₀ (p = 0.0001)
