In [2]:
import pandas as pd
import scipy.stats as stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd

In [3]:
# Load data
df = pd.read_csv('../data/MachineLearningRating_v3_sample.csv')

In [4]:
# Define metrics
df['HasClaim'] = df['TotalClaims'] > 0
df['Margin'] = df['TotalPremium'] - df['TotalClaims']
claim_severity = df[df['HasClaim']]['TotalClaims']

In [5]:
# Hypothesis 1: Risk differences across provinces
# Claim Frequency
province_freq = df.groupby('Province')['HasClaim'].mean()
chi2, p_freq_prov, _, _ = stats.chi2_contingency(pd.crosstab(df['Province'], df['HasClaim']))
print(f"Province Claim Frequency: Chi2 p-value = {p_freq_prov}")

Province Claim Frequency: Chi2 p-value = 1.0


In [7]:
# Generate groups only from provinces with at least one claim
severity_by_prov = [
    df[(df['Province'] == prov) & (df['HasClaim'])]['TotalClaims']
    for prov in df['Province'].unique()
    if not df[(df['Province'] == prov) & (df['HasClaim'])]['TotalClaims'].empty
]

# Run ANOVA if we have at least 2 non-empty groups
if len(severity_by_prov) >= 2:
    f_stat, p_sev_prov = stats.f_oneway(*severity_by_prov)
    print(f"Province Claim Severity: ANOVA p-value = {p_sev_prov}")
else:
    print("Not enough groups with claims to perform ANOVA.")


Not enough groups with claims to perform ANOVA.


In [8]:
# Hypothesis 2: Risk differences between zip codes
# Claim Frequency
zip_freq = df.groupby('PostalCode')['HasClaim'].mean()
chi2, p_freq_zip, _, _ = stats.chi2_contingency(pd.crosstab(df['PostalCode'], df['HasClaim']))
print(f"Zip Code Claim Frequency: Chi2 p-value = {p_freq_zip}")

Zip Code Claim Frequency: Chi2 p-value = 1.0


In [9]:
# Claim Severity (ANOVA)
severity_by_zip = [df[df['PostalCode'] == zip_]['TotalClaims'][df['HasClaim']] for zip_ in df['PostalCode'].unique()]
f_stat, p_sev_zip = stats.f_oneway(*severity_by_zip)
print(f"Zip Code Claim Severity: ANOVA p-value = {p_sev_zip}")

Zip Code Claim Severity: ANOVA p-value = nan


  f_stat, p_sev_zip = stats.f_oneway(*severity_by_zip)


In [10]:
# Hypothesis 3: Margin differences between zip codes
margin_by_zip = [df[df['PostalCode'] == zip_]['Margin'] for zip_ in df['PostalCode'].unique()]
f_stat, p_margin_zip = stats.f_oneway(*margin_by_zip)
print(f"Zip Code Margin: ANOVA p-value = {p_margin_zip}")

Zip Code Margin: ANOVA p-value = 2.850723461156593e-06


In [11]:
# Hypothesis 4: Risk differences between genders
# Claim Frequency
gender_freq = df.groupby('Gender')['HasClaim'].mean()
chi2, p_freq_gender, _, _ = stats.chi2_contingency(pd.crosstab(df['Gender'], df['HasClaim']))
print(f"Gender Claim Frequency: Chi2 p-value = {p_freq_gender}")

Gender Claim Frequency: Chi2 p-value = 1.0


In [12]:
# Claim Severity (t-test)
severity_male = df[(df['Gender'] == 'Male') & df['HasClaim']]['TotalClaims']
severity_female = df[(df['Gender'] == 'Female') & df['HasClaim']]['TotalClaims']
t_stat, p_sev_gender = stats.ttest_ind(severity_male, severity_female)
print(f"Gender Claim Severity: t-test p-value = {p_sev_gender}")

Gender Claim Severity: t-test p-value = nan


  return f(*args, **kwargs)


In [13]:
# Business Recommendations
if p_freq_prov < 0.05:
    print("Reject H0: Significant risk differences across provinces. Adjust premiums by region.")
if p_sev_zip < 0.05:
    print("Reject H0: Significant risk differences between zip codes. Target low-risk zip codes for marketing.")
if p_margin_zip < 0.05:
    print("Reject H0: Significant margin differences between zip codes. Optimize pricing in high-margin areas.")
if p_sev_gender < 0.05:
    print("Reject H0: Significant risk differences between genders. Consider gender-based pricing adjustments.")

Reject H0: Significant margin differences between zip codes. Optimize pricing in high-margin areas.
