In [1]:
import os
import random
import sys

import pandas as pd
from scipy.stats import f_oneway, ttest_ind

# Make the project-local helper modules importable before importing them.
sys.path.append(os.path.abspath('../scripts'))
sys.path.append(os.path.abspath('../src'))

from load_data import DataLoader
import path

from Hypothesis_Analysis import hypothesis

In [2]:
# Load the dataset through the project's DataLoader helper.
# NOTE(review): `get_clead_data` looks like a typo for "clean" — confirm against the local `path` module.
csv_path = path.get_clead_data()  # data-file location supplied by the local `path` module
data_load = DataLoader(csv_path)
df = data_load.load_csv_data()  # used as a pandas DataFrame by every cell below

In [3]:
# Preview the first rows as a quick sanity check of the load.
df.head()

Unnamed: 0.1,Unnamed: 0,TransactionMonth,IsVATRegistered,Citizenship,MaritalStatus,Gender,Province,PostalCode,VehicleType,RegistrationYear,...,ExcessSelected,CoverCategory,CoverType,CoverGroup,Section,Product,StatutoryClass,StatutoryRiskType,TotalPremium,TotalClaims
0,0,2015-03-01 00:00:00,True,,Not specified,Not specified,Gauteng,1459,Passenger Vehicle,2004,...,Mobility - Windscreen,Windscreen,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,21.929825,0.0
1,1,2015-05-01 00:00:00,True,,Not specified,Not specified,Gauteng,1459,Passenger Vehicle,2004,...,Mobility - Windscreen,Windscreen,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,21.929825,0.0
2,2,2015-07-01 00:00:00,True,,Not specified,Not specified,Gauteng,1459,Passenger Vehicle,2004,...,Mobility - Windscreen,Windscreen,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,0.0,0.0
3,3,2015-05-01 00:00:00,True,,Not specified,Not specified,Gauteng,1459,Passenger Vehicle,2004,...,Mobility - Metered Taxis - R2000,Own damage,Own Damage,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,512.84807,0.0
4,4,2015-07-01 00:00:00,True,,Not specified,Not specified,Gauteng,1459,Passenger Vehicle,2004,...,Mobility - Metered Taxis - R2000,Own damage,Own Damage,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,0.0,0.0


## A/B Hypothesis Testing
For this analysis, "risk" will be quantified by two metrics: Claim Frequency (proportion of policies with at least one claim) and Claim Severity (the average amount of a claim, given a claim occurred). "Margin" is defined as (TotalPremium - TotalClaims).


In [None]:
# Performs an ANOVA test across groups, filtering out any group with only one unique value.
# Returns the p-value or a warning message.

In [3]:
# Coerce the premium and claim amounts to numeric dtype; values that cannot be
# parsed become NaN (errors="coerce").
df[["TotalPremium", "TotalClaims"]] = df[["TotalPremium", "TotalClaims"]].apply(
    pd.to_numeric, errors="coerce"
)

In [4]:
# Derived KPI columns:
#   HasClaim - True when the row carries a positive claim amount
#   Margin   - premium minus claims
df["HasClaim"] = df["TotalClaims"].gt(0)
df["Margin"] = df["TotalPremium"].sub(df["TotalClaims"])

### 1. ANOVA: Risk differences across Provinces (Claim Frequency)

In [5]:
# Claim frequency by province: build one 0/1 claim-indicator sample per
# province and compare the group means with a one-way ANOVA.
# `f_oneway` comes from the notebook's top import cell rather than being
# imported mid-notebook.
province_groups = [
    claims.astype(int) for _, claims in df.groupby("Province")["HasClaim"]
]
anova_province_risk = f_oneway(*province_groups)

In [6]:
# Show the p-value from the province-level claim-frequency ANOVA.
anova_province_risk.pvalue

np.float64(5.91210036318191e-19)

In [7]:
# 2. ANOVA: Risk differences between Zip Codes (Claim Frequency)
# One 0/1 claim-indicator sample per postal code, compared with a one-way ANOVA.
zip_groups = [
    claims.astype(int) for _, claims in df.groupby("PostalCode")["HasClaim"]
]
anova_zip_risk = f_oneway(*zip_groups)

In [8]:
# 3. ANOVA: Margin differences between Zip Codes
margin_groups = [group["Margin"].dropna() for _, group in df.groupby("PostalCode")]
anova_zip_margin = f_oneway(*margin_groups)

### T-test: claim-risk difference between Male and Female

In [9]:
# 4. T-test: Risk differences by Gender
men = df[df["Gender"] == "Male"]["HasClaim"].astype(int)
women = df[df["Gender"] == "Female"]["HasClaim"].astype(int)
ttest_gender_risk = ttest_ind(men, women, nan_policy="omit")

In [10]:
# Collect results
# Gather the p-value of every test above into one labelled dict, in the order
# the tests were run, and display it.
results = {
    label: test.pvalue
    for label, test in [
        ("Province Risk ANOVA p-value", anova_province_risk),
        ("Zip Risk ANOVA p-value", anova_zip_risk),
        ("Zip Margin ANOVA p-value", anova_zip_margin),
        ("Gender Risk T-test p-value", ttest_gender_risk),
    ]
}

results

{'Province Risk ANOVA p-value': np.float64(5.91210036318191e-19),
 'Zip Risk ANOVA p-value': np.float64(2.9076595484940585e-30),
 'Zip Margin ANOVA p-value': np.float64(0.9976859758015036),
 'Gender Risk T-test p-value': np.float64(0.8404980845002314)}

### Controlled A/B testing of insurance plan features

To perform controlled A/B testing on insurance plan features (e.g., AlarmImmobiliser, TrackingDevice, CoverType), we need to follow a clear and reproducible approach.

In [3]:
# Wrap the DataFrame in the project's `hypothesis` helper (imported from Hypothesis_Analysis).
hypoth = hypothesis(df)

In [4]:
# Compute the KPI columns; the displayed result shows HasClaim, ClaimSeverity
# and Margin alongside the original columns.
# NOTE(review): whether this also updates the helper's internal frame is not visible here — confirm.
hypoth.compute_kpis()

Unnamed: 0.1,Unnamed: 0,TransactionMonth,IsVATRegistered,Citizenship,MaritalStatus,Gender,Province,PostalCode,VehicleType,RegistrationYear,...,CoverGroup,Section,Product,StatutoryClass,StatutoryRiskType,TotalPremium,TotalClaims,HasClaim,ClaimSeverity,Margin
0,0,2015-03-01 00:00:00,True,,Not specified,Not specified,Gauteng,1459,Passenger Vehicle,2004,...,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,21.929825,0.0,0,0.0,21.929825
1,1,2015-05-01 00:00:00,True,,Not specified,Not specified,Gauteng,1459,Passenger Vehicle,2004,...,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,21.929825,0.0,0,0.0,21.929825
2,2,2015-07-01 00:00:00,True,,Not specified,Not specified,Gauteng,1459,Passenger Vehicle,2004,...,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,0.000000,0.0,0,0.0,0.000000
3,3,2015-05-01 00:00:00,True,,Not specified,Not specified,Gauteng,1459,Passenger Vehicle,2004,...,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,512.848070,0.0,0,0.0,512.848070
4,4,2015-07-01 00:00:00,True,,Not specified,Not specified,Gauteng,1459,Passenger Vehicle,2004,...,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,0.000000,0.0,0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1000093,1000093,2015-04-01 00:00:00,False,ZW,Single,Male,Western Cape,7493,Passenger Vehicle,2013,...,Comprehensive - Taxi,Motor Comprehensive,Mobility Commercial Cover: Monthly,Commercial,IFRS Constant,347.235175,0.0,0,0.0,347.235175
1000094,1000094,2015-06-01 00:00:00,False,ZW,Single,Male,Western Cape,7493,Passenger Vehicle,2013,...,Comprehensive - Taxi,Motor Comprehensive,Mobility Commercial Cover: Monthly,Commercial,IFRS Constant,347.235175,0.0,0,0.0,347.235175
1000095,1000095,2015-08-01 00:00:00,False,ZW,Single,Male,Western Cape,7493,Passenger Vehicle,2013,...,Comprehensive - Taxi,Motor Comprehensive,Mobility Commercial Cover: Monthly,Commercial,IFRS Constant,347.235175,0.0,0,0.0,347.235175
1000096,1000096,2014-07-01 00:00:00,False,ZW,Single,Male,Western Cape,7493,Passenger Vehicle,2013,...,Comprehensive - Taxi,Motor Comprehensive,Mobility Commercial Cover: Monthly,Commercial,IFRS Constant,2.315000,0.0,0,0.0,2.315000


In [5]:
# Build the two comparison groups for the Gender feature.
# NOTE(review): presumably the arguments are (feature, group-A value, group-B value),
# i.e. A = "Male" and B = "Female" — confirm against Hypothesis_Analysis.
group_a, group_b = hypoth.create_ab_groups("Gender", "Male", "Female")

In [6]:
# Compare the Margin KPI between the two groups and print the resulting p-value.
# The statistical test used is defined in Hypothesis_Analysis and is not shown here.
p_value = hypoth.compare_kpis(group_a, group_b, "Margin")
print("P-value for Margin difference between Male and Female:", p_value)

P-value for Margin difference between Male and Female: 0.8015464193501282


In [8]:
# Run the helper's A/B test for Gender ("Male" vs "Female"); it returns a mapping
# of per-KPI p-values plus the two group sizes.
# Note: this rebinds `results`, replacing the dict of ANOVA/t-test p-values built earlier.
results, size_a, size_b = hypoth.run_ab_test("Gender", "Male", "Female")

In [9]:
# Report the size of each group, then one line per KPI with its p-value
# rounded to four decimal places.
print(f"Group Sizes -> A: {size_a}, B: {size_b}")
print("P-Values for KPIs:")
for kpi_label, kpi_pvalue in results.items():
    print(f"{kpi_label}: {kpi_pvalue:.4f}")

Group Sizes -> A: 42817, B: 6755
P-Values for KPIs:
HasClaim (p-value): 0.8372
ClaimSeverity (p-value): 0.7670
Margin (p-value): 0.8015
