In [1]:
import pandas as pd
from scipy.stats import chi2_contingency, ttest_ind,f_oneway

In [2]:
# Load the pipe-delimited data file; the path is relative to the notebook's folder.
data = pd.read_csv('../data/MLData.txt', sep='|')

In [15]:
# Preview the first ten rows of the loaded frame.
# NOTE(review): this cell's execution count is 15 and its saved output already
# shows the 'margin' and 'hasClaimed' columns, which are only created in a later
# cell — it was run out of order. Re-run with Restart & Run All before sharing.
data.head(10)

Unnamed: 0,UnderwrittenCoverID,PolicyID,TransactionMonth,IsVATRegistered,Citizenship,LegalType,Title,Language,Bank,AccountType,...,CoverGroup,Section,Product,StatutoryClass,StatutoryRiskType,TotalPremium,TotalClaims,VehicleAge,margin,hasClaimed
0,145249,12827,2015-03-01,True,Not specified,Close Corporation,Mr,English,First National Bank,Current account,...,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,21.929825,0.0,21,21.929825,False
1,145249,12827,2015-05-01,True,Not specified,Close Corporation,Mr,English,First National Bank,Current account,...,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,21.929825,0.0,21,21.929825,False
2,145249,12827,2015-07-01,True,Not specified,Close Corporation,Mr,English,First National Bank,Current account,...,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,0.0,0.0,21,0.0,False
3,145255,12827,2015-05-01,True,Not specified,Close Corporation,Mr,English,First National Bank,Current account,...,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,512.84807,0.0,21,512.84807,False
4,145255,12827,2015-07-01,True,Not specified,Close Corporation,Mr,English,First National Bank,Current account,...,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,0.0,0.0,21,0.0,False
5,145247,12827,2015-01-01,True,Not specified,Close Corporation,Mr,English,First National Bank,Current account,...,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,3.256435,0.0,21,3.256435,False
6,145247,12827,2015-04-01,True,Not specified,Close Corporation,Mr,English,First National Bank,Current account,...,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,50.474737,0.0,21,50.474737,False
7,145247,12827,2015-06-01,True,Not specified,Close Corporation,Mr,English,First National Bank,Current account,...,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,35.332316,0.0,21,35.332316,False
8,145247,12827,2015-08-01,True,Not specified,Close Corporation,Mr,English,First National Bank,Current account,...,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,0.0,0.0,21,0.0,False
9,145245,12827,2015-03-01,True,Not specified,Close Corporation,Mr,English,First National Bank,Current account,...,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,1.009474,0.0,21,1.009474,False


In [3]:
# Summarise every column's dtype and non-null count.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000098 entries, 0 to 1000097
Data columns (total 50 columns):
 #   Column                    Non-Null Count    Dtype  
---  ------                    --------------    -----  
 0   UnderwrittenCoverID       1000098 non-null  int64  
 1   PolicyID                  1000098 non-null  int64  
 2   TransactionMonth          1000098 non-null  object 
 3   IsVATRegistered           1000098 non-null  bool   
 4   Citizenship               1000098 non-null  object 
 5   LegalType                 1000098 non-null  object 
 6   Title                     1000098 non-null  object 
 7   Language                  1000098 non-null  object 
 8   Bank                      1000098 non-null  object 
 9   AccountType               1000098 non-null  object 
 10  MaritalStatus             1000098 non-null  object 
 11  Gender                    1000098 non-null  object 
 12  Country                   1000098 non-null  object 
 13  Province                  1

In [4]:
# Derive two per-row columns used further down:
#   margin     = TotalPremium - TotalClaims
#   hasClaimed = True where TotalClaims is strictly positive
data['margin'] = data['TotalPremium'].sub(data['TotalClaims'])
data['hasClaimed'] = data['TotalClaims'].gt(0)

In [5]:
# Split the rows on the 'hasClaimed' flag: A = flag is True, B = flag is False.
# Independent copies are taken so later column additions do not touch `data`.
# NOTE(review): a later cell labels Group A / Group B as "TrackingDevice = No / Yes";
# confirm which split is actually intended there.
group_a = data.loc[data['hasClaimed'].eq(True)].copy()
group_b = data.loc[data['hasClaimed'].eq(False)].copy()

In [6]:
# Column-name lists, grouped by what the columns describe.

# Policyholder and location columns.
client_attr = [
    'Gender', 'MaritalStatus', 'Citizenship', 'Province', 'PostalCode',
    'MainCrestaZone', 'SubCrestaZone', 'Country',
]

# Vehicle columns.
auto_attr = [
    'VehicleAge', 'Cylinders', 'cubiccapacity', 'kilowatts',
    'NumberOfDoors', 'make', 'Model', 'bodyType', 'NewVehicle', 'TrackingDevice',
    'AlarmImmobiliser',
]

# Policy / cover columns.
# The comma after 'TransactionMonth' matters: without it Python joins the two
# adjacent string literals into the single bogus name 'TransactionMonthTermFrequency'.
insurance_type = [
    'ItemType', 'CoverType', 'TransactionMonth',
    'TermFrequency', 'AccountType', 'CoverageCategory', 'CoverGroup', 'Section',
    'Product', 'StatutoryClass', 'ExcessSelected', 'StatutoryRiskType',
]

In [7]:
alpha = 0.05  # significance threshold; the helpers below read it when they are called

# --- Helper Function for Chi-Squared Test ---
def chi_squared_test_for_equivalence(group1_df, group2_df, column_name):
    """Chi-squared test of independence between group membership and one column.

    The column from both frames is stacked, each row is tagged 'A' (first frame)
    or 'B' (second frame), and the Group x value cross-tabulation is passed to
    ``chi2_contingency``. A one-line summary (or the reason for skipping) is printed.

    Args:
        group1_df: DataFrame holding Group A.
        group2_df: DataFrame holding Group B.
        column_name: Name of the column to compare.

    Returns:
        Tuple ``(is_significant, p_value)`` where ``is_significant`` is
        ``p_value < alpha`` (``alpha`` is the notebook-level variable).
        ``(False, 1.0)`` is returned whenever the test is skipped or fails,
        so a skipped column is indistinguishable from a non-significant one.
    """
    frames = (group1_df, group2_df)
    if any(column_name not in frame.columns for frame in frames):
        print(f"  Skipping '{column_name}': Not found in one or both groups.")
        return False, 1.0

    # Stack the column from both groups and label every row with its group.
    labelled = pd.concat([
        frame[[column_name]].assign(Group=label)
        for label, frame in zip('AB', frames)
    ])
    table = pd.crosstab(labelled['Group'], labelled[column_name])

    # The test needs at least a 2x2 table (two groups, two observed values).
    if min(table.shape) < 2:
        print(f"  Skipping Chi-Squared for '{column_name}': Insufficient data or invalid contingency table.")
        return False, 1.0

    # A row or column that sums to zero makes the expected frequencies undefined.
    has_zero_marginal = any(table.sum(axis=axis).eq(0).any() for axis in (0, 1))
    if has_zero_marginal:
        print(f"  Skipping Chi-Squared for '{column_name}': Zero marginals detected, test not applicable.")
        return False, 1.0

    # chi2_contingency may raise ValueError; report it and treat the column as
    # non-significant instead of aborting the whole loop.
    try:
        chi2, p_value, *_ = chi2_contingency(table)
    except ValueError as e:
        print(f"  Chi-Squared test failed for '{column_name}' due to ValueError: {e}. Likely sparse data.")
        return False, 1.0

    is_significant = p_value < alpha
    verdict = 'Significant' if is_significant else 'Not Significant'
    print(f"  '{column_name}': Chi2={chi2:.2f}, p-value={p_value:.4f} -> {verdict}")
    return is_significant, p_value

# --- Helper Function for T-Test ---
def t_test_for_equivalence(group1_df, group2_df, column_name):
    """Welch's two-sample t-test on one numerical column of two groups.

    Missing values are dropped from each group before testing, and a one-line
    summary (or the reason for skipping) is printed.

    Args:
        group1_df: DataFrame holding Group A.
        group2_df: DataFrame holding Group B.
        column_name: Name of the numerical column to compare.

    Returns:
        Tuple ``(is_significant, p_value)`` where ``is_significant`` is
        ``p_value < alpha`` (``alpha`` is the notebook-level variable).
        ``(False, 1.0)`` is returned when the column is missing or either group
        has fewer than two non-missing values.
    """
    frames = (group1_df, group2_df)
    if not all(column_name in frame.columns for frame in frames):
        print(f"  Skipping '{column_name}': Not found in one or both groups.")
        return False, 1.0

    # NaNs are removed per group so they do not propagate into the statistic.
    sample_a, sample_b = (frame[column_name].dropna() for frame in frames)
    n_a, n_b = len(sample_a), len(sample_b)

    # At least two observations per group are required.
    if min(n_a, n_b) < 2:
        print(f"  Skipping T-test for '{column_name}': Insufficient data after dropping NaNs (Group A: {n_a}, Group B: {n_b})")
        return False, 1.0

    # equal_var=False selects Welch's variant, which does not assume equal variances.
    t_stat, p_value = ttest_ind(sample_a, sample_b, equal_var=False)
    is_significant = p_value < alpha
    verdict = 'Significant' if is_significant else 'Not Significant'
    print(f"  '{column_name}': T-stat={t_stat:.2f}, p-value={p_value:.4f} -> {verdict}")
    return is_significant, p_value

In [8]:
# Run the categorical equivalence check on every client attribute and collect
# the attributes whose distribution differs between group_a and group_b.
equivalence_issues = []

print("\n--- Client Confounding Variables ---")
for attr in client_attr:
    differs, p = chi_squared_test_for_equivalence(group_a, group_b, attr)
    if differs:
        equivalence_issues.append(f"Categorical '{attr}' (p={p:.4f})")

# Summarise: any significant attribute means the groups are not comparable as-is.
if equivalence_issues:
    print("\n--- Equivalence Check Result: FAILED ---")
    print("Warning: Groups A and B are NOT statistically equivalent on the following variables:")
    print("\n".join(f"- {issue}" for issue in equivalence_issues))
    print("\nThis means any observed differences in risk metrics might be due to these confounding factors.")
    print("Consider re-grouping, using propensity score matching, or applying regression models to control for these variables.")
else:
    print("\n--- Equivalence Check Result: PASSED ---")
    print("Groups A and B appear statistically equivalent across all tested confounding variables.")
    print("You can proceed with the main A/B hypothesis tests with reasonable confidence.")


--- Client Confounding Variables ---
  'Gender': Chi2=2.99, p-value=0.2238 -> Not Significant
  'MaritalStatus': Chi2=3.47, p-value=0.1765 -> Not Significant
  'Citizenship': Chi2=2.15, p-value=0.5427 -> Not Significant
  'Province': Chi2=85.44, p-value=0.0000 -> Significant
  'PostalCode': Chi2=1109.84, p-value=0.0000 -> Significant
  'MainCrestaZone': Chi2=126.03, p-value=0.0000 -> Significant
  'SubCrestaZone': Chi2=175.98, p-value=0.0000 -> Significant
  Skipping Chi-Squared for 'Country': Insufficient data or invalid contingency table.

--- Equivalence Check Result: FAILED ---
- Categorical 'Province' (p=0.0000)
- Categorical 'PostalCode' (p=0.0000)
- Categorical 'MainCrestaZone' (p=0.0000)
- Categorical 'SubCrestaZone' (p=0.0000)

This means any observed differences in risk metrics might be due to these confounding factors.
Consider re-grouping, using propensity score matching, or applying regression models to control for these variables.


In [None]:


# --- 4. A/B Hypothesis Tests for Risk Metrics ---
# --- A/B hypothesis tests on risk metrics: TrackingDevice = No (A) vs. Yes (B) ---
# The two groups are built here from the 'TrackingDevice' column so that they
# match the hypotheses printed below. The notebook-level group_a / group_b are
# split on 'hasClaimed'; with that split the claim-frequency test compares the
# outcome with itself and the claim-severity test always finds an empty group.
# NOTE(review): assumes 'TrackingDevice' stores the strings 'No' and 'Yes' —
# confirm with data['TrackingDevice'].unique().
AB_FEATURE = 'TrackingDevice'
GROUP_A_VALUE, GROUP_B_VALUE = 'No', 'Yes'
ab_group_a = data[data[AB_FEATURE] == GROUP_A_VALUE]
ab_group_b = data[data[AB_FEATURE] == GROUP_B_VALUE]

# Fail loudly rather than "testing" empty groups if the labels above are wrong.
if ab_group_a.empty or ab_group_b.empty:
    raise ValueError(
        f"Cannot run A/B tests: '{AB_FEATURE}' == {GROUP_A_VALUE!r} has {len(ab_group_a)} rows and "
        f"'{AB_FEATURE}' == {GROUP_B_VALUE!r} has {len(ab_group_b)} rows."
    )

print(f"\n--- Performing A/B Hypothesis Tests on Risk Metrics for {AB_FEATURE} ---")
print(f"  Group A ({AB_FEATURE} = {GROUP_A_VALUE}): {len(ab_group_a)} policies")
print(f"  Group B ({AB_FEATURE} = {GROUP_B_VALUE}): {len(ab_group_b)} policies")

# --- Hypothesis 1: Claim Frequency ---
# Chi-squared test on the Group x HasClaim table. The indicators are kept as
# standalone Series so no notebook-level frame is mutated by this cell.
print("\n--- Hypothesis: Claim Frequency (TrackingDevice = No vs. Yes) ---")
has_claim_a = ab_group_a['TotalClaims'] > 0
has_claim_b = ab_group_b['TotalClaims'] > 0
contingency_table_claims = pd.crosstab(
    pd.Series(['Group A'] * len(has_claim_a) + ['Group B'] * len(has_claim_b), name='Group'),
    pd.Series(has_claim_a.tolist() + has_claim_b.tolist(), name='HasClaim')
)
print("\n  Contingency Table (Group vs. HasClaim):\n", contingency_table_claims)
if contingency_table_claims.empty or contingency_table_claims.shape[0] < 2 or contingency_table_claims.shape[1] < 2:
    print("\n  Skipping Chi-Squared test for Claim Frequency: Insufficient data or invalid contingency table.")
elif (contingency_table_claims.sum(axis=0) == 0).any() or (contingency_table_claims.sum(axis=1) == 0).any():
    print("\n  Skipping Chi-Squared test for Claim Frequency: Zero marginals detected, test not applicable.")
else:
    chi2_cf, p_value_cf, _, _ = chi2_contingency(contingency_table_claims)
    print("\n  Chi-Squared Test for Claim Frequency:")
    print(f"  Chi2 Statistic: {chi2_cf:.2f}")
    print(f"  P-value: {p_value_cf:.4f}")
    if p_value_cf < alpha:
        print("  Conclusion: Reject H0. There IS a statistically significant difference in Claim Frequency between Group A (No Tracking Device) and Group B (Yes Tracking Device).")
        # Mean of a boolean Series is the share of rows with a claim.
        freq_a = has_claim_a.mean()
        freq_b = has_claim_b.mean()
        print(f"    Claim Frequency (Group A - No Tracking): {freq_a:.4f}")
        print(f"    Claim Frequency (Group B - Yes Tracking): {freq_b:.4f}")
        if freq_b < freq_a:
            print("    Interpretation: Policies with a tracking device (Group B) have a lower claim frequency.")
        else:
            print("    Interpretation: Policies with a tracking device (Group B) have a higher claim frequency.")
    else:
        print("  Conclusion: Fail to Reject H0. No statistically significant difference in Claim Frequency between Group A and Group B.")

# --- Hypothesis 2: Claim Severity ---
# Welch's t-test on claim amounts, restricted to rows that actually have a claim.
print("\n--- Hypothesis: Claim Severity (TrackingDevice = No vs. Yes) ---")
claims_a = ab_group_a.loc[has_claim_a, 'TotalClaims'].dropna()
claims_b = ab_group_b.loc[has_claim_b, 'TotalClaims'].dropna()
if len(claims_a) < 2 or len(claims_b) < 2:
    print("\n  Skipping T-test for Claim Severity: Insufficient claim data in one or both groups after filtering.")
else:
    t_stat_cs, p_value_cs = ttest_ind(claims_a, claims_b, equal_var=False)  # Welch's t-test
    print("\n  T-Test for Claim Severity:")
    print(f"  T-Statistic: {t_stat_cs:.2f}")
    print(f"  P-value: {p_value_cs:.4f}")
    if p_value_cs < alpha:
        print("  Conclusion: Reject H0. There IS a statistically significant difference in Claim Severity between Group A (No Tracking Device) and Group B (Yes Tracking Device).")
        avg_severity_a = claims_a.mean()
        avg_severity_b = claims_b.mean()
        print(f"    Avg Claim Severity (Group A - No Tracking): {avg_severity_a:.2f}")
        print(f"    Avg Claim Severity (Group B - Yes Tracking): {avg_severity_b:.2f}")
        if avg_severity_b < avg_severity_a:
            print("    Interpretation: Policies with a tracking device (Group B) have lower average claim severity.")
        else:
            print("    Interpretation: Policies with a tracking device (Group B) have higher average claim severity.")
    else:
        print("  Conclusion: Fail to Reject H0. No statistically significant difference in Claim Severity between Group A and Group B.")

# --- Hypothesis 3: Margin ---
# Welch's t-test on per-row margin (TotalPremium - TotalClaims).
print("\n--- Hypothesis: Margin (TotalPremium - TotalClaims) (TrackingDevice = No vs. Yes) ---")
margin_a = (ab_group_a['TotalPremium'] - ab_group_a['TotalClaims']).dropna()
margin_b = (ab_group_b['TotalPremium'] - ab_group_b['TotalClaims']).dropna()
if len(margin_a) < 2 or len(margin_b) < 2:
    print("\n  Skipping T-test for Margin: Insufficient data in one or both groups.")
else:
    t_stat_margin, p_value_margin = ttest_ind(margin_a, margin_b, equal_var=False)  # Welch's t-test
    print("\n  T-Test for Margin:")
    print(f"  T-Statistic: {t_stat_margin:.2f}")
    print(f"  P-value: {p_value_margin:.4f}")
    if p_value_margin < alpha:
        print("  Conclusion: Reject H0. There IS a statistically significant difference in Margin between Group A (No Tracking Device) and Group B (Yes Tracking Device).")
        avg_margin_a = margin_a.mean()
        avg_margin_b = margin_b.mean()
        print(f"    Avg Margin (Group A - No Tracking): {avg_margin_a:.2f}")
        print(f"    Avg Margin (Group B - Yes Tracking): {avg_margin_b:.2f}")
        if avg_margin_b > avg_margin_a:
            print("    Interpretation: Policies with a tracking device (Group B) yield a higher average margin.")
        else:
            print("    Interpretation: Policies with a tracking device (Group B) yield a lower average margin.")
    else:
        print("  Conclusion: Fail to Reject H0. No statistically significant difference in Margin between Group A and Group B.")

# --- Portfolio-wide risk differences across provinces ---
print("\n############################################################")
print("### Overall Risk Differences Across Provinces (Testing H0: No Risk Differences) ###")
print("############################################################")

# H0: Claim Frequency / Severity / Margin do not differ across provinces.
# H1: at least two provinces differ on the metric being tested.

# --- 1) Claim frequency: chi-squared test on the Province x HasClaim table ---
print("\n--- Overall Claim Frequency Across Provinces ---")
df_overall = data.copy()
df_overall['HasClaim'] = df_overall['TotalClaims'].gt(0)

overall_contingency_table = pd.crosstab(df_overall['Province'], df_overall['HasClaim'])
print("\nOverall Contingency Table (Province vs. HasClaim):\n", overall_contingency_table)

if min(overall_contingency_table.shape) < 2:
    # Fewer than two provinces or only one claim outcome: nothing to compare.
    print("\n  Skipping Chi-Squared test for overall Claim Frequency: Insufficient data or invalid contingency table for provinces.")
elif overall_contingency_table.sum(axis=0).eq(0).any() or overall_contingency_table.sum(axis=1).eq(0).any():
    print("\n  Skipping Chi-Squared test for overall Claim Frequency: Zero marginals detected, test not applicable.")
else:
    chi2_overall_cf, p_value_overall_cf, _, _ = chi2_contingency(overall_contingency_table)
    print("\n  Chi-Squared Test for Overall Claim Frequency Across Provinces:")
    print(f"  Chi2 Statistic: {chi2_overall_cf:.2f}")
    print(f"  P-value: {p_value_overall_cf:.4f}")
    frequency_verdict = (
        (
            "  Conclusion: Reject H0. There IS a statistically significant difference in Claim Frequency across provinces.",
            "  This suggests that claim frequency varies significantly by province.",
        )
        if p_value_overall_cf < alpha
        else (
            "  Conclusion: Fail to Reject H0. No statistically significant difference in Claim Frequency across provinces.",
            "  This suggests that claim frequency does not vary significantly by province (based on this data and alpha).",
        )
    )
    print(*frequency_verdict, sep="\n")

# --- 2) Claim severity: one-way ANOVA on claim amounts, one sample per province ---
print("\n--- Overall Claim Severity Across Provinces ---")
# Severity is only defined for rows that have a claim.
claims_df_overall = data.loc[data['TotalClaims'] > 0].copy()

# Non-missing claim amounts per province, in order of first appearance.
claims_by_province = {
    province: claims_df_overall.loc[claims_df_overall['Province'] == province, 'TotalClaims'].dropna().values
    for province in claims_df_overall['Province'].unique()
}
# Keep only provinces that contribute at least one amount.
provinces_with_claims_data = [name for name, amounts in claims_by_province.items() if len(amounts) > 0]
province_claims_for_anova = [amounts for amounts in claims_by_province.values() if len(amounts) > 0]

print(f"\nProvinces with claims data for Severity ANOVA: {provinces_with_claims_data}")

if len(province_claims_for_anova) < 2:
    print("\n  Skipping ANOVA for overall Claim Severity: Not enough provinces with claims data (need at least 2).")
    print("  To test this, ensure your dataset includes claims from at least two different provinces.")
else:
    f_statistic_overall_cs, p_value_overall_cs = f_oneway(*province_claims_for_anova)
    print("\n  One-Way ANOVA for Overall Claim Severity Across Provinces:")
    print(f"  F-Statistic: {f_statistic_overall_cs:.2f}")
    print(f"  P-value: {p_value_overall_cs:.4f}")
    severity_verdict = (
        (
            "  Conclusion: Reject H0. There IS a statistically significant difference in Claim Severity across provinces.",
            "  This suggests that the average amount of a claim varies significantly by province.",
        )
        if p_value_overall_cs < alpha
        else (
            "  Conclusion: Fail to Reject H0. No statistically significant difference in Claim Severity across provinces.",
            "  This suggests that the average amount of a claim does not vary significantly by province.",
        )
    )
    print(*severity_verdict, sep="\n")


# --- 3) Margin: one-way ANOVA on TotalPremium - TotalClaims, one sample per province ---
print("\n--- Overall Margin Across Provinces ---")
df_overall['Margin'] = df_overall['TotalPremium'].sub(df_overall['TotalClaims'])

# Non-missing margins per province, in order of first appearance.
margins_by_province = {
    province: df_overall.loc[df_overall['Province'] == province, 'Margin'].dropna().values
    for province in df_overall['Province'].unique()
}
provinces_with_margin_data = [name for name, margins in margins_by_province.items() if len(margins) > 0]
province_margins_for_anova = [margins for margins in margins_by_province.values() if len(margins) > 0]

print(f"\nProvinces with margin data for Margin ANOVA: {provinces_with_margin_data}")

if len(province_margins_for_anova) < 2:
    print("\n  Skipping ANOVA for overall Margin: Not enough provinces with margin data (need at least 2).")
else:
    f_statistic_overall_margin, p_value_overall_margin = f_oneway(*province_margins_for_anova)
    print("\n  One-Way ANOVA for Overall Margin Across Provinces:")
    print(f"  F-Statistic: {f_statistic_overall_margin:.2f}")
    print(f"  P-value: {p_value_overall_margin:.4f}")
    margin_verdict = (
        (
            "  Conclusion: Reject H0. There IS a statistically significant difference in Margin across provinces.",
            "  This suggests that the average margin varies significantly by province.",
        )
        if p_value_overall_margin < alpha
        else (
            "  Conclusion: Fail to Reject H0. No statistically significant difference in Margin across provinces.",
            "  This suggests that the average margin does not vary significantly by province.",
        )
    )
    print(*margin_verdict, sep="\n")


############################################################
### Analyzing Province: Gauteng ###
############################################################
  Group A (TrackingDevice = No) in Gauteng: 282420 policies
  Group B (TrackingDevice = Yes) in Gauteng: 111445 policies

--- Checking Statistical Equivalence Between Groups A and B in Gauteng ---
  Significance Level (alpha): 0.05

  --- Categorical Confounding Variables ---

  --- Numerical Confounding Variables ---
  'TotalPremium': p-value=0.8193 -> Not Significant

--- Equivalence Check Result for Gauteng: FAILED ---
  - Categorical 'Gender' (p=0.0000)
  - Categorical 'MaritalStatus' (p=0.0000)
  - Categorical 'LegalType' (p=0.0000)
  - Categorical 'CoverType' (p=0.0000)
  - Categorical 'TermFrequency' (p=0.0314)
  - Numerical 'RegistrationYear' (p=0.0000)
  - Numerical 'SumInsured' (p=0.0001)
  - Numerical 'CalculatedPremiumPerTerm' (p=0.0000)
  - Numerical 'Cylinders' (p=0.0000)
  - Numerical 'cubiccapacity' (p=0.0000)
  -

In [9]:

# Force the two monetary columns to numeric: values that cannot be parsed
# become NaN (errors='coerce') and are then replaced by 0.
# NOTE(review): 'margin' and 'hasClaimed' were derived in an earlier cell, before
# this coercion — confirm they do not need to be recomputed afterwards.
for money_col in ('TotalClaims', 'TotalPremium'):
    data[money_col] = pd.to_numeric(data[money_col], errors='coerce').fillna(0)

# Significance level used by the tests defined below.
alpha = 0.05

# --- Helper Function for Chi-Squared Test ---
def chi_squared_test_for_equivalence(group1_df, group2_df, column_name, alpha=0.05, test_name=""):
    """Chi-squared test of independence between group membership and one column.

    Silent variant: unlike the earlier definition of the same name in this
    notebook, nothing is printed. Because this definition replaces the earlier
    one once the cell runs, ``alpha`` defaults to 0.05 so that existing
    three-argument calls keep working.

    Args:
        group1_df: DataFrame holding Group A.
        group2_df: DataFrame holding Group B.
        column_name: Name of the column to compare.
        alpha: Significance level; the result is significant when ``p_value < alpha``.
        test_name: Accepted for call compatibility; not used by this function.

    Returns:
        Tuple ``(is_significant, p_value)``. ``(False, 1.0)`` is returned when
        the column is missing from either frame, the contingency table is
        smaller than 2x2 or has a zero marginal, or ``chi2_contingency`` raises
        ``ValueError`` — a skipped test is reported as non-significant.
    """
    if column_name not in group1_df.columns or column_name not in group2_df.columns:
        return False, 1.0

    # Stack the column from both groups, tagging each row with its group label.
    combined_data = pd.concat([
        group1_df[[column_name]].assign(Group='A'),
        group2_df[[column_name]].assign(Group='B')
    ])
    contingency_table = pd.crosstab(combined_data['Group'], combined_data[column_name])

    # The test needs at least two groups and two observed values.
    if contingency_table.empty or contingency_table.shape[0] < 2 or contingency_table.shape[1] < 2:
        return False, 1.0

    # A row or column summing to zero makes the expected frequencies undefined.
    if (contingency_table.sum(axis=0) == 0).any() or (contingency_table.sum(axis=1) == 0).any():
        return False, 1.0

    try:
        _, p_value, _, _ = chi2_contingency(contingency_table)
    except ValueError:
        # Deliberate best-effort: a test that cannot run counts as non-significant.
        return False, 1.0

    return p_value < alpha, p_value

# --- Helper Function for T-Test ---
def t_test_for_equivalence(group1_df, group2_df, column_name, alpha=0.05, test_name=""):
    """Welch's two-sample t-test on one numerical column of two groups.

    Silent variant: unlike the earlier definition of the same name in this
    notebook, nothing is printed. Because this definition replaces the earlier
    one once the cell runs, ``alpha`` defaults to 0.05 so that existing
    three-argument calls keep working.

    Args:
        group1_df: DataFrame holding Group A.
        group2_df: DataFrame holding Group B.
        column_name: Name of the numerical column to compare.
        alpha: Significance level; the result is significant when ``p_value < alpha``.
        test_name: Accepted for call compatibility; not used by this function.

    Returns:
        Tuple ``(is_significant, p_value)``. ``(False, 1.0)`` is returned when
        the column is missing from either frame or either group has fewer than
        two non-missing values — a skipped test is reported as non-significant.
    """
    if column_name not in group1_df.columns or column_name not in group2_df.columns:
        return False, 1.0

    # Drop NaNs per group so they do not propagate into the statistic.
    data_g1 = group1_df[column_name].dropna()
    data_g2 = group2_df[column_name].dropna()

    # At least two observations per group are required.
    if len(data_g1) < 2 or len(data_g2) < 2:
        return False, 1.0

    # equal_var=False selects Welch's variant, which does not assume equal variances.
    _, p_value = ttest_ind(data_g1, data_g2, equal_var=False)
    return p_value < alpha, p_value

# --- Common Confounding Variables ---
# Categorical columns checked with the chi-squared helper; extend as needed.
common_categorical_confounding_vars = [
    'Gender',
    'MaritalStatus',
    'LegalType',
    'Country',
    'ItemType',
    'CoverType',
    'StatutoryClass',
    'TermFrequency',
    'AlarmImmobiliser',
    'NewVehicle',
]
# Numerical columns checked with the t-test helper. 'TotalPremium' is listed
# because it can itself act as a confounder in some analyses.
common_numerical_confounding_vars = [
    'RegistrationYear',
    'CustomValueEstimate',
    'SumInsured',
    'CalculatedPremiumPerTerm',
    'Cylinders',
    'cubiccapacity',
    'kilowatts',
    'NumberOfDoors',
    'CapitalOutstanding',
    'TotalPremium',
]


# --- A/B Test Function (re-usable for different features) ---
def _report_group_equivalence(group_a, group_b, feature_col, alpha):
    """Print a balance report for groups A and B over the common confounding variables.

    Categorical confounders are checked with ``chi_squared_test_for_equivalence``
    and numerical ones with ``t_test_for_equivalence``. ``feature_col`` itself is
    never treated as a confounder.
    """
    print("--- Checking Statistical Equivalence Between Groups A and B ---")
    print(f"  Significance Level (alpha): {alpha}\n")

    # (issue label, section header, candidate columns, equivalence test)
    checks = (
        ("Categorical", "  --- Categorical Confounding Variables ---",
         common_categorical_confounding_vars, chi_squared_test_for_equivalence),
        ("Numerical", "\n  --- Numerical Confounding Variables ---",
         common_numerical_confounding_vars, t_test_for_equivalence),
    )

    equivalence_issues = []
    for kind, header, candidate_cols, equivalence_test in checks:
        print(header)
        for col in candidate_cols:
            if col == feature_col:
                continue  # the feature under test is not its own confounder
            is_sig, p_val = equivalence_test(group_a, group_b, col, alpha)
            if is_sig:
                equivalence_issues.append(f"{kind} '{col}' (p={p_val:.4f})")
            elif p_val != 1.0:
                # t_test_for_equivalence returns exactly 1.0 when it skips a column
                # (presumably the chi-squared helper does too — confirm), so those
                # columns are left out of the report.
                print(f"  '{col}': p-value={p_val:.4f} -> Not Significant")

    if not equivalence_issues:
        print("\n--- Equivalence Check Result: PASSED ---")
        print("  Groups A and B appear statistically equivalent across all tested confounding variables.")
        print("  You can proceed with the main A/B hypothesis tests with reasonable confidence.")
        return

    print("\n--- Equivalence Check Result: FAILED ---")
    print("  Warning: Groups A and B are NOT statistically equivalent on the following variables:")
    for issue in equivalence_issues:
        print(f"  - {issue}")
    print("\n  This means any observed differences in risk metrics might be due to these confounding factors.")
    print("  Consider re-grouping, using propensity score matching, or applying regression models to control for these variables.")


def _test_claim_frequency(group_a, group_b, feature_col, group_a_val, group_b_val, alpha):
    """Chi-squared test on the share of rows with ``TotalClaims > 0`` in each group."""
    print("\n--- Hypothesis: Claim Frequency ---")
    has_claim_a = group_a['TotalClaims'] > 0
    has_claim_b = group_b['TotalClaims'] > 0

    # Rows are labelled with the requested group values and paired positionally
    # with the claim flags of group A followed by group B.
    contingency_table_claims = pd.crosstab(
        pd.Series([group_a_val] * len(group_a) + [group_b_val] * len(group_b), name=feature_col),
        pd.Series(has_claim_a.tolist() + has_claim_b.tolist(), name='HasClaim')
    )

    print(f"\n  Contingency Table ({feature_col} vs. HasClaim):\n", contingency_table_claims)

    if (contingency_table_claims.empty
            or contingency_table_claims.shape[0] < 2
            or contingency_table_claims.shape[1] < 2):
        print("\n  Skipping Chi-Squared test for Claim Frequency: Insufficient data or invalid contingency table.")
        return
    if ((contingency_table_claims.sum(axis=0) == 0).any()
            or (contingency_table_claims.sum(axis=1) == 0).any()):
        print("\n  Skipping Chi-Squared test for Claim Frequency: Zero marginals detected, test not applicable.")
        return

    chi2_cf, p_value_cf, _, _ = chi2_contingency(contingency_table_claims)
    print("\n  Chi-Squared Test for Claim Frequency:")
    print(f"  Chi2 Statistic: {chi2_cf:.2f}")
    print(f"  P-value: {p_value_cf:.4f}")

    if not p_value_cf < alpha:
        print(f"  Conclusion: Fail to Reject H0. No statistically significant difference in Claim Frequency between '{group_a_val}' and '{group_b_val}'.")
        return

    print(f"  Conclusion: Reject H0. There IS a statistically significant difference in Claim Frequency between '{group_a_val}' and '{group_b_val}'.")
    freq_a = has_claim_a.mean()
    freq_b = has_claim_b.mean()
    print(f"    Claim Frequency ({group_a_val}): {freq_a:.4f}")
    print(f"    Claim Frequency ({group_b_val}): {freq_b:.4f}")
    if freq_b < freq_a:
        print(f"    Interpretation: '{group_b_val}' has a lower claim frequency.")
    else:
        print(f"    Interpretation: '{group_b_val}' has a higher claim frequency.")


def _test_claim_severity(group_a, group_b, group_a_val, group_b_val, alpha):
    """Welch's t-test on ``TotalClaims`` restricted to rows where a claim occurred."""
    print("\n--- Hypothesis: Claim Severity ---")
    claims_a = group_a.loc[group_a['TotalClaims'] > 0, 'TotalClaims'].dropna()
    claims_b = group_b.loc[group_b['TotalClaims'] > 0, 'TotalClaims'].dropna()

    if len(claims_a) < 2 or len(claims_b) < 2:
        print("\n  Skipping T-test for Claim Severity: Insufficient claim data in one or both groups after filtering.")
        return

    t_stat_cs, p_value_cs = ttest_ind(claims_a, claims_b, equal_var=False)  # Welch's t-test
    print("\n  T-Test for Claim Severity:")
    print(f"  T-Statistic: {t_stat_cs:.2f}")
    print(f"  P-value: {p_value_cs:.4f}")

    if not p_value_cs < alpha:
        print(f"  Conclusion: Fail to Reject H0. No statistically significant difference in Claim Severity between '{group_a_val}' and '{group_b_val}'.")
        return

    print(f"  Conclusion: Reject H0. There IS a statistically significant difference in Claim Severity between '{group_a_val}' and '{group_b_val}'.")
    avg_severity_a = claims_a.mean()
    avg_severity_b = claims_b.mean()
    print(f"    Avg Claim Severity ({group_a_val}): {avg_severity_a:.2f}")
    print(f"    Avg Claim Severity ({group_b_val}): {avg_severity_b:.2f}")
    if avg_severity_b < avg_severity_a:
        print(f"    Interpretation: '{group_b_val}' has lower average claim severity.")
    else:
        print(f"    Interpretation: '{group_b_val}' has higher average claim severity.")


def _test_margin(group_a, group_b, group_a_val, group_b_val, alpha):
    """Welch's t-test on margin, computed as ``TotalPremium - TotalClaims`` per row."""
    print("\n--- Hypothesis: Margin ---")
    margin_a = (group_a['TotalPremium'] - group_a['TotalClaims']).dropna()
    margin_b = (group_b['TotalPremium'] - group_b['TotalClaims']).dropna()

    if len(margin_a) < 2 or len(margin_b) < 2:
        print("\n  Skipping T-test for Margin: Insufficient data in one or both groups.")
        return

    t_stat_margin, p_value_margin = ttest_ind(margin_a, margin_b, equal_var=False)  # Welch's t-test
    print("\n  T-Test for Margin:")
    print(f"  T-Statistic: {t_stat_margin:.2f}")
    print(f"  P-value: {p_value_margin:.4f}")

    if not p_value_margin < alpha:
        print(f"  Conclusion: Fail to Reject H0. No statistically significant difference in Margin between '{group_a_val}' and '{group_b_val}'.")
        return

    print(f"  Conclusion: Reject H0. There IS a statistically significant difference in Margin between '{group_a_val}' and '{group_b_val}'.")
    avg_margin_a = margin_a.mean()
    avg_margin_b = margin_b.mean()
    print(f"    Avg Margin ({group_a_val}): {avg_margin_a:.2f}")
    print(f"    Avg Margin ({group_b_val}): {avg_margin_b:.2f}")
    if avg_margin_b > avg_margin_a:
        print(f"    Interpretation: '{group_b_val}' yields a higher average margin.")
    else:
        print(f"    Interpretation: '{group_b_val}' yields a lower average margin.")


def run_ab_test(df_input, feature_col, group_a_val, group_b_val, alpha, analysis_name):
    """Run an A/B hypothesis test for one feature and print a full report.

    The rows of ``df_input`` are split into group A (``feature_col == group_a_val``)
    and group B (``feature_col == group_b_val``). The report then covers, in order:
    a balance check over the common confounding variables, a chi-squared test on
    claim frequency, and Welch's t-tests on claim severity and on margin.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Source rows; must contain ``feature_col``, ``TotalClaims`` and ``TotalPremium``.
    feature_col : str
        Column whose values define the two groups.
    group_a_val, group_b_val
        Values of ``feature_col`` selecting group A and group B.
    alpha : float
        Significance level used for every test in the report.
    analysis_name : str
        Label shown in the report banner.

    Returns
    -------
    None
        All results are printed. If either group is empty the function prints a
        skip message and returns before running any test.
    """
    print("\n############################################################")
    print(f"### A/B Testing for Feature: '{feature_col}' ({analysis_name}) ###")
    print("############################################################")

    # Copies keep anything done downstream from touching the caller's frame.
    group_a = df_input[df_input[feature_col] == group_a_val].copy()
    group_b = df_input[df_input[feature_col] == group_b_val].copy()

    print(f"  Group A ('{feature_col}' = '{group_a_val}'): {len(group_a)} policies")
    print(f"  Group B ('{feature_col}' = '{group_b_val}'): {len(group_b)} policies\n")

    if len(group_a) == 0 or len(group_b) == 0:
        print(f"  Skipping analysis for '{feature_col}': One or both groups are empty.")
        return

    _report_group_equivalence(group_a, group_b, feature_col, alpha)

    print(f"\n--- Performing A/B Hypothesis Tests on Risk Metrics for '{feature_col}' ---")
    _test_claim_frequency(group_a, group_b, feature_col, group_a_val, group_b_val, alpha)
    _test_claim_severity(group_a, group_b, group_a_val, group_b_val, alpha)
    _test_margin(group_a, group_b, group_a_val, group_b_val, alpha)

# --- Run A/B Test for TrackingDevice (as per previous query) ---
# run_ab_test(df, 'TrackingDevice', 'No', 'Yes', alpha, "Tracking Device Impact")

# --- NEW SECTION: Overall Risk Differences Across Provinces ---
print("\n############################################################")
print("### Overall Risk Differences Across Provinces (Testing H0: No Risk Differences) ###")
print("############################################################")

# Hypothesis for overall provinces:
# H0: There are no statistically significant differences in Claim Frequency/Severity/Margin across all provinces.
# H1: There is a statistically significant difference in Claim Frequency/Severity/Margin across at least two provinces.

# --- Overall Claim Frequency Across Provinces ---
print("\n--- Overall Claim Frequency Across Provinces ---")
df_overall = data.copy()
df_overall['HasClaim'] = df_overall['TotalClaims'] > 0

# Contingency table of Province against HasClaim over the entire dataset
overall_contingency_table = pd.crosstab(df_overall['Province'], df_overall['HasClaim'])
print("\nOverall Contingency Table (Province vs. HasClaim):\n", overall_contingency_table)

if overall_contingency_table.empty or overall_contingency_table.shape[0] < 2 or overall_contingency_table.shape[1] < 2:
    print("\n  Skipping Chi-Squared test for overall Claim Frequency: Insufficient data or invalid contingency table for provinces.")
elif (overall_contingency_table.sum(axis=0) == 0).any() or (overall_contingency_table.sum(axis=1) == 0).any():
    print("\n  Skipping Chi-Squared test for overall Claim Frequency: Zero marginals detected, test not applicable.")
else:
    chi2_overall_cf, p_value_overall_cf, _, _ = chi2_contingency(overall_contingency_table)
    print(f"\n  Chi-Squared Test for Overall Claim Frequency Across Provinces:")
    print(f"  Chi2 Statistic: {chi2_overall_cf:.2f}")
    print(f"  P-value: {p_value_overall_cf:.4f}")
    if p_value_overall_cf < alpha:
        print("  Conclusion: Reject H0. There IS a statistically significant difference in Claim Frequency across provinces.")
        print("  This suggests that claim frequency varies significantly by province.")
    else:
        print("  Conclusion: Fail to Reject H0. No statistically significant difference in Claim Frequency across provinces.")
        print("  This suggests that claim frequency does not vary significantly by province (based on this data and alpha).")

# --- Overall Claim Severity Across Provinces ---
print("\n--- Overall Claim Severity Across Provinces ---")
# Severity only looks at rows where a claim actually occurred
claims_df_overall = data[data['TotalClaims'] > 0].copy()

# Prepare data for ANOVA - one array of claim amounts per province.
# A single groupby pass replaces re-masking the whole frame once per province;
# sort=False keeps provinces in first-appearance order (the order unique() gives)
# and rows with a missing Province are left out, as they were before.
province_claims_for_anova = []
provinces_with_claims_data = [] # Names of the provinces that contribute a group to the ANOVA
for province, province_claims in claims_df_overall.groupby('Province', sort=False)['TotalClaims']:
    province_data = province_claims.dropna().values
    if len(province_data) > 0: # Only include provinces that actually have claims data
        province_claims_for_anova.append(province_data)
        provinces_with_claims_data.append(province)

print(f"\nProvinces with claims data for Severity ANOVA: {provinces_with_claims_data}")

if len(province_claims_for_anova) >= 2:
    f_statistic_overall_cs, p_value_overall_cs = f_oneway(*province_claims_for_anova)
    print(f"\n  One-Way ANOVA for Overall Claim Severity Across Provinces:")
    print(f"  F-Statistic: {f_statistic_overall_cs:.2f}")
    print(f"  P-value: {p_value_overall_cs:.4f}")
    if p_value_overall_cs < alpha:
        print("  Conclusion: Reject H0. There IS a statistically significant difference in Claim Severity across provinces.")
        print("  This suggests that the average amount of a claim varies significantly by province.")
    else:
        print("  Conclusion: Fail to Reject H0. No statistically significant difference in Claim Severity across provinces.")
        print("  This suggests that the average amount of a claim does not vary significantly by province.")
else:
    print("\n  Skipping ANOVA for overall Claim Severity: Not enough provinces with claims data (need at least 2).")
    print("  To test this, ensure your dataset includes claims from at least two different provinces.")


# --- Overall Margin Across Provinces ---
print("\n--- Overall Margin Across Provinces ---")
df_overall['Margin'] = df_overall['TotalPremium'] - df_overall['TotalClaims']

# Prepare data for ANOVA - one array of margins per province (single groupby pass,
# first-appearance order preserved via sort=False).
province_margins_for_anova = []
provinces_with_margin_data = [] # Names of the provinces that contribute a group to the ANOVA
for province, province_margins in df_overall.groupby('Province', sort=False)['Margin']:
    province_data = province_margins.dropna().values
    if len(province_data) > 0: # Only include provinces that actually have margin data
        province_margins_for_anova.append(province_data)
        provinces_with_margin_data.append(province)

print(f"\nProvinces with margin data for Margin ANOVA: {provinces_with_margin_data}")

if len(province_margins_for_anova) >= 2:
    f_statistic_overall_margin, p_value_overall_margin = f_oneway(*province_margins_for_anova)
    print(f"\n  One-Way ANOVA for Overall Margin Across Provinces:")
    print(f"  F-Statistic: {f_statistic_overall_margin:.2f}")
    print(f"  P-value: {p_value_overall_margin:.4f}")
    if p_value_overall_margin < alpha:
        print("  Conclusion: Reject H0. There IS a statistically significant difference in Margin across provinces.")
        print("  This suggests that the average margin varies significantly by province.")
    else:
        print("  Conclusion: Fail to Reject H0. No statistically significant difference in Margin across provinces.")
        print("  This suggests that the average margin does not vary significantly by province.")
else:
    print("\n  Skipping ANOVA for overall Margin: Not enough provinces with margin data (need at least 2).")

# --- NEW SECTION: Overall Risk Differences Across Postal Codes ---
print("\n############################################################")
print("### Overall Risk Differences Across Postal Codes (Testing H0: No Risk Differences) ###")
print("############################################################")

# Hypothesis for overall postal codes:
# H0: There are no statistically significant differences in Claim Frequency/Severity/Margin across all postal codes.
# H1: There is a statistically significant difference in Claim Frequency/Severity/Margin across at least two postal codes.

# --- Overall Claim Frequency Across Postal Codes ---
print("\n--- Overall Claim Frequency Across Postal Codes ---")
df_overall_pc = data.copy()
df_overall_pc['HasClaim'] = df_overall_pc['TotalClaims'] > 0

# Contingency table of PostalCode against HasClaim over the entire dataset.
# NOTE(review): with one row per postal code this table may be very sparse; the
# chi-squared approximation is generally unreliable when expected cell counts are
# small — confirm the counts before relying on the p-value.
overall_contingency_table_pc = pd.crosstab(df_overall_pc['PostalCode'], df_overall_pc['HasClaim'])
print("\nOverall Contingency Table (PostalCode vs. HasClaim):\n", overall_contingency_table_pc)

if overall_contingency_table_pc.empty or overall_contingency_table_pc.shape[0] < 2 or overall_contingency_table_pc.shape[1] < 2:
    print("\n  Skipping Chi-Squared test for overall Claim Frequency by PostalCode: Insufficient data or invalid contingency table.")
elif (overall_contingency_table_pc.sum(axis=0) == 0).any() or (overall_contingency_table_pc.sum(axis=1) == 0).any():
    print("\n  Skipping Chi-Squared test for overall Claim Frequency by PostalCode: Zero marginals detected, test not applicable.")
else:
    chi2_overall_pc_cf, p_value_overall_pc_cf, _, _ = chi2_contingency(overall_contingency_table_pc)
    print(f"\n  Chi-Squared Test for Overall Claim Frequency Across Postal Codes:")
    print(f"  Chi2 Statistic: {chi2_overall_pc_cf:.2f}")
    print(f"  P-value: {p_value_overall_pc_cf:.4f}")
    if p_value_overall_pc_cf < alpha:
        print("  Conclusion: Reject H0. There IS a statistically significant difference in Claim Frequency across postal codes.")
        print("  This suggests that claim frequency varies significantly by postal code.")
    else:
        print("  Conclusion: Fail to Reject H0. No statistically significant difference in Claim Frequency across postal codes.")
        print("  This suggests that claim frequency does not vary significantly by postal code (based on this data and alpha).")

# --- Overall Claim Severity Across Postal Codes ---
print("\n--- Overall Claim Severity Across Postal Codes ---")
# Severity only looks at rows where a claim actually occurred
claims_df_overall_pc = data[data['TotalClaims'] > 0].copy()

# Prepare data for ANOVA - one array of claim amounts per postal code.
# A single groupby pass replaces re-masking the whole frame once per postal code;
# sort=False keeps postal codes in first-appearance order (the order unique() gives)
# and rows with a missing PostalCode are left out, as they were before.
pc_claims_for_anova = []
postal_codes_with_claims_data = []
for pc, pc_claims in claims_df_overall_pc.groupby('PostalCode', sort=False)['TotalClaims']:
    pc_data = pc_claims.dropna().values
    if len(pc_data) > 0:
        pc_claims_for_anova.append(pc_data)
        postal_codes_with_claims_data.append(pc)

print(f"\nPostal Codes with claims data for Severity ANOVA: {postal_codes_with_claims_data}")

if len(pc_claims_for_anova) >= 2:
    f_statistic_overall_pc_cs, p_value_overall_pc_cs = f_oneway(*pc_claims_for_anova)
    print(f"\n  One-Way ANOVA for Overall Claim Severity Across Postal Codes:")
    print(f"  F-Statistic: {f_statistic_overall_pc_cs:.2f}")
    print(f"  P-value: {p_value_overall_pc_cs:.4f}")
    if p_value_overall_pc_cs < alpha:
        print("  Conclusion: Reject H0. There IS a statistically significant difference in Claim Severity across postal codes.")
        print("  This suggests that the average amount of a claim varies significantly by postal code.")
    else:
        print("  Conclusion: Fail to Reject H0. No statistically significant difference in Claim Severity across postal codes.")
        print("  This suggests that the average amount of a claim does not vary significantly by postal code.")
else:
    print("\n  Skipping ANOVA for overall Claim Severity by PostalCode: Not enough postal codes with claims data (need at least 2).")


# --- Overall Margin Across Postal Codes ---
print("\n--- Overall Margin Across Postal Codes ---")
df_overall_pc['Margin'] = df_overall_pc['TotalPremium'] - df_overall_pc['TotalClaims']

# Prepare data for ANOVA - one array of margins per postal code (single groupby
# pass over the full frame, first-appearance order preserved via sort=False).
pc_margins_for_anova = []
postal_codes_with_margin_data = []
for pc, pc_margins in df_overall_pc.groupby('PostalCode', sort=False)['Margin']:
    pc_data = pc_margins.dropna().values
    if len(pc_data) > 0:
        pc_margins_for_anova.append(pc_data)
        postal_codes_with_margin_data.append(pc)

print(f"\nPostal Codes with margin data for Margin ANOVA: {postal_codes_with_margin_data}")

if len(pc_margins_for_anova) >= 2:
    f_statistic_overall_pc_margin, p_value_overall_pc_margin = f_oneway(*pc_margins_for_anova)
    print(f"\n  One-Way ANOVA for Overall Margin Across Postal Codes:")
    print(f"  F-Statistic: {f_statistic_overall_pc_margin:.2f}")
    print(f"  P-value: {p_value_overall_pc_margin:.4f}")
    if p_value_overall_pc_margin < alpha:
        print("  Conclusion: Reject H0. There IS a statistically significant difference in Margin across postal codes.")
        print("  This suggests that the average margin varies significantly by postal code.")
    else:
        print("  Conclusion: Fail to Reject H0. No statistically significant difference in Margin across postal codes.")
        print("  This suggests that the average margin does not vary significantly by postal code.")
else:
    print("\n  Skipping ANOVA for overall Margin by PostalCode: Not enough postal codes with margin data (need at least 2).")


# --- NEW SECTION: A/B Test for Gender ---
# Compare risk metrics between the two gender groups, using the same alpha as
# the analyses above.
# NOTE(review): assumes Gender is coded as 'F' / 'M' in `data` — confirm with
# data['Gender'].unique(); run_ab_test skips the analysis when either group is empty.
run_ab_test(
    df_input=data,
    feature_col='Gender',
    group_a_val='F',
    group_b_val='M',
    alpha=alpha,
    analysis_name="Gender Impact",
)


############################################################
### Overall Risk Differences Across Provinces (Testing H0: No Risk Differences) ###
############################################################

--- Overall Claim Frequency Across Provinces ---

Overall Contingency Table (Province vs. HasClaim):
 HasClaim        False  True 
Province                    
Eastern Cape    30306     30
Free State       8095      4
Gauteng        393161    704
KwaZulu-Natal  169588    193
Limpopo         24788     48
Mpumalanga      52625     93
North West     143058    229
Northern Cape    6375      5
Western Cape   170625    171

  Chi-Squared Test for Overall Claim Frequency Across Provinces:
  Chi2 Statistic: 85.44
  P-value: 0.0000
  Conclusion: Reject H0. There IS a statistically significant difference in Claim Frequency across provinces.
  This suggests that claim frequency varies significantly by province.

--- Overall Claim Severity Across Provinces ---

Provinces with claims data for S

In [10]:

# Make sure both monetary columns are numeric: entries that cannot be parsed
# become NaN under errors='coerce' and are then replaced with 0.
for _money_col in ('TotalClaims', 'TotalPremium'):
    data[_money_col] = pd.to_numeric(data[_money_col], errors='coerce').fillna(0)

**Exclude identifiers and target variables from features for modeling**

In [None]:
# Identifier / bookkeeping columns that are kept out of the modelling features.
IDENTIFIER_COLS = [
    'UnderwrittenCoverID',
    'PolicyID',
    'TransactionMonth',
    'VehicleIntroDate',
]

# Target columns for the individual modelling tasks.
TARGET_CLAIM_SEVERITY = 'TotalClaims'
TARGET_CLAIM_PROBABILITY = 'HasClaim'  # not a raw column; will be engineered
TARGET_PREMIUM = 'CalculatedPremiumPerTerm'  # naive premium prediction target

In [11]:
# Parse TransactionMonth into pandas datetimes so date operations work on it.
data['TransactionMonth'] = data['TransactionMonth'].pipe(pd.to_datetime)