In [None]:
import pandas as pd
from scipy.stats import ttest_ind, chi2_contingency, f_oneway
import statsmodels.api as sm
from statsmodels.formula.api import ols
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Read the cleaned dataset from disk (path is relative to the notebook's folder).
df = pd.read_csv(filepath_or_buffer="../data/cleaned_data.csv")
# All KPI work happens on an independent deep copy so `df` stays as loaded.
df_kpi = df.copy(deep=True)

In [None]:
# Coerce the two monetary columns to numeric; values that cannot be parsed
# become NaN because of errors='coerce'.
for money_col in ('TotalClaims', 'TotalPremium'):
    df_kpi[money_col] = pd.to_numeric(df_kpi[money_col], errors='coerce')

# KPI 1 - HasClaim: row-level flag, True when the claim amount is positive.
#         (A NaN claim amount compares as False.)
df_kpi['HasClaim'] = df_kpi['TotalClaims'].gt(0)

# KPI 2 - Claim Frequency: not stored per row; later cells compute it per
#         group as sum(HasClaim) / count(), i.e. the mean of HasClaim.

# KPI 3 - Claim Severity: not stored per row; later cells restrict to rows
#         where HasClaim is True and average TotalClaims over those rows.

# KPI 4 - Margin: row-level premium minus claims.
df_kpi['Margin'] = df_kpi['TotalPremium'].sub(df_kpi['TotalClaims'])

# Quick look at the inputs next to the derived KPI columns.
kpi_preview = df_kpi[['TotalPremium', 'TotalClaims', 'HasClaim', 'Margin']].head()
print(kpi_preview)

A/B Test for Gender-Based Risk

In [None]:
# Compares Male vs Female policies on three KPIs:
#   * claim frequency  -> chi-squared test on the Gender x HasClaim table
#   * claim severity   -> Welch t-test on TotalClaims of rows with a claim
#   * margin           -> Welch t-test on Margin
# Requires `df_kpi` with the columns Gender, HasClaim, TotalClaims and Margin.

# --- STEP 1: Prepare Data ---
print("🧹 Preparing gender-based groups for testing...")

# Clean Gender field (drop NA)
df_gender = df_kpi.dropna(subset=['Gender'])

# Ensure only Male and Female are compared
df_gender = df_gender[df_gender['Gender'].isin(['Male', 'Female'])]

# Create groups
group_male = df_gender[df_gender['Gender'] == 'Male']
group_female = df_gender[df_gender['Gender'] == 'Female']

# --- STEP 2: Claim Frequency Test ---
print("\n📊 Testing Claim Frequency between Male and Female:")

# Share of policies with a claim (mean of the boolean HasClaim flag)
male_claim_rate = group_male['HasClaim'].mean()
female_claim_rate = group_female['HasClaim'].mean()

print(f"✅ Male Claim Frequency: {male_claim_rate:.3f}")
print(f"✅ Female Claim Frequency: {female_claim_rate:.3f}")

# Create contingency table (rows: Gender, columns: HasClaim)
freq_table = pd.crosstab(df_gender['Gender'], df_gender['HasClaim'])

# Chi-squared test
chi2, p_freq, _, _ = chi2_contingency(freq_table)
print(f"🔍 Chi-squared test p-value: {p_freq:.4f}")
if p_freq < 0.05:
    print("🟢 Statistically significant difference in claim frequency between genders.")
else:
    print("⚪ No statistically significant difference in claim frequency.")

# --- STEP 3: Claim Severity Test ---
print("\n📊 Testing Claim Severity between Male and Female:")

# Filter only those with claims
male_with_claims = group_male[group_male['HasClaim']]
female_with_claims = group_female[group_female['HasClaim']]

# Severity = TotalClaims / policies with claim.
# HasClaim rows have TotalClaims > 0, so these samples contain no NaN.
ttest_severity = ttest_ind(male_with_claims['TotalClaims'], female_with_claims['TotalClaims'], equal_var=False)
print(f"✅ Male Average Claim Severity: {male_with_claims['TotalClaims'].mean():.2f}")
print(f"✅ Female Average Claim Severity: {female_with_claims['TotalClaims'].mean():.2f}")
print(f"🔍 T-test p-value: {ttest_severity.pvalue:.4f}")
if ttest_severity.pvalue < 0.05:
    print("🟢 Statistically significant difference in claim severity.")
else:
    print("⚪ No statistically significant difference in claim severity.")

# --- STEP 4: Margin Comparison ---
print("\n📊 Testing Margin between Male and Female:")

# Margin is NaN wherever TotalPremium or TotalClaims is NaN. ttest_ind
# propagates NaN by default, which would yield a NaN p-value that the check
# below silently reports as "not significant". Drop missing margins first so
# the test runs on the observed values (Series.mean already skips NaN, so the
# printed averages are unchanged).
male_margin = group_male['Margin'].dropna()
female_margin = group_female['Margin'].dropna()

ttest_margin = ttest_ind(male_margin, female_margin, equal_var=False)
print(f"✅ Male Avg. Margin: {male_margin.mean():.2f}")
print(f"✅ Female Avg. Margin: {female_margin.mean():.2f}")
print(f"🔍 T-test p-value: {ttest_margin.pvalue:.4f}")
if ttest_margin.pvalue < 0.05:
    print("🟢 Statistically significant difference in margin.")
else:
    print("⚪ No statistically significant difference in margin.")

Statistical Tests by Province

In [None]:

# Tests whether three KPIs differ across provinces:
#   * claim frequency  -> chi-squared test on the Province x HasClaim table
#   * claim severity   -> one-way ANOVA of TotalClaims on rows with a claim
#   * margin           -> one-way ANOVA of Margin
# Requires `df_kpi` with the columns Province, HasClaim, TotalClaims and Margin.
# Leaves `df_prov` and `df_claims` defined for the plotting cell that follows.

# --- STEP 1: Prepare Data ---
print("🧹 Preparing province-based data for testing...")

# Drop NA in Province
df_prov = df_kpi.dropna(subset=['Province'])

# Only keep provinces with sufficient data (optional)
MIN_POLICIES_PER_PROVINCE = 500  # arbitrary threshold; a province needs more rows than this
province_counts = df_prov['Province'].value_counts()
valid_provinces = province_counts[province_counts > MIN_POLICIES_PER_PROVINCE].index.tolist()
df_prov = df_prov[df_prov['Province'].isin(valid_provinces)]

# --- STEP 2: Claim Frequency (Chi-squared Test) ---
print("\n📊 Testing Claim Frequency across Provinces:")

freq_table_prov = pd.crosstab(df_prov['Province'], df_prov['HasClaim'])
chi2, p_freq_prov, _, _ = chi2_contingency(freq_table_prov)

print(freq_table_prov)
print(f"🔍 Chi-squared test p-value: {p_freq_prov:.4f}")
if p_freq_prov < 0.05:
    print("🟢 Statistically significant difference in claim frequency across provinces.")
else:
    print("⚪ No statistically significant difference in claim frequency.")

# --- STEP 3: Claim Severity (ANOVA) ---
print("\n📊 Testing Claim Severity across Provinces (ANOVA):")

# Filter only those with claims
df_claims = df_prov[df_prov['HasClaim']]

# Perform one-way ANOVA on TotalClaims
model_claims = ols('TotalClaims ~ C(Province)', data=df_claims).fit()
anova_claims = sm.stats.anova_lm(model_claims, typ=2)

print(anova_claims)
# The ANOVA table is indexed by term name, so the first row must be taken by
# position with .iloc; a bare integer key on a labelled Series is deprecated
# in pandas. The first row is expected to be the C(Province) term.
p_claim_severity = anova_claims['PR(>F)'].iloc[0]
if p_claim_severity < 0.05:
    print("🟢 Statistically significant difference in claim severity across provinces.")
else:
    print("⚪ No statistically significant difference in claim severity.")

# --- STEP 4: Margin (ANOVA) ---
print("\n📊 Testing Margin across Provinces (ANOVA):")

model_margin = ols('Margin ~ C(Province)', data=df_prov).fit()
anova_margin = sm.stats.anova_lm(model_margin, typ=2)

print(anova_margin)
# Same positional lookup as for claim severity above.
p_margin = anova_margin['PR(>F)'].iloc[0]
if p_margin < 0.05:
    print("🟢 Statistically significant difference in margin across provinces.")
else:
    print("⚪ No statistically significant difference in margin.")

In [None]:
# Global look-and-feel: seaborn whitegrid theme and a 12x6 default figure size.
sns.set(style="whitegrid")
plt.rc('figure', figsize=(12, 6))

# --- 1. Claim Frequency Bar Plot ---
# One bar per province showing the share of policies with a claim,
# ordered from highest to lowest.
print("📈 Plotting Claim Frequency by Province...")

claim_freq_by_prov = (
    df_prov
    .groupby('Province')['HasClaim']
    .mean()
    .sort_values(ascending=False)
)

fig, ax = plt.subplots()
sns.barplot(
    x=claim_freq_by_prov.index,
    y=claim_freq_by_prov.values,
    palette="Blues_d",
    ax=ax,
)
ax.set_title("Claim Frequency by Province")
ax.set_ylabel("Proportion of Policies with Claims")
ax.set_xlabel("Province")
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()

# --- 2. Claim Severity Box Plot / 3. Margin Box Plot ---
# Both box plots share the same layout, so they are drawn from one spec table:
# (progress message, data frame, y column, palette, title, y-axis label).
# Outliers are hidden (showfliers=False) in both.
box_plot_specs = [
    (
        "📈 Plotting Claim Severity by Province...",
        df_claims,
        'TotalClaims',
        "Oranges",
        "Claim Severity by Province",
        "Total Claims Amount (ZAR)",
    ),
    (
        "📈 Plotting Margin by Province...",
        df_prov,
        'Margin',
        "Greens",
        "Margin (TotalPremium - TotalClaims) by Province",
        "Margin (ZAR)",
    ),
]

for message, frame, value_col, colours, heading, y_label in box_plot_specs:
    print(message)

    fig, ax = plt.subplots()
    sns.boxplot(data=frame, x='Province', y=value_col, palette=colours, showfliers=False, ax=ax)
    ax.set_title(heading)
    ax.set_ylabel(y_label)
    ax.set_xlabel("Province")
    plt.setp(ax.get_xticklabels(), rotation=45)
    fig.tight_layout()
    plt.show()