In [2]:
import pandas as pd

In [3]:
# Read the policy-level dataset; rows pandas cannot parse are skipped rather than raising.
csv_path = "../data/MachineLearningRating_v3.csv"
df = pd.read_csv(csv_path, on_bad_lines='skip')

  df = pd.read_csv("../data/MachineLearningRating_v3.csv", on_bad_lines='skip')


### 1. Select Metrics
We’ll use these metrics to quantify risk and margin:

* Claim Frequency = proportion of policies with at least one claim (binary outcome per policy: NumClaims > 0)
* Claim Severity = TotalClaims / NumClaims for policies where NumClaims > 0
* Margin = TotalPremium - TotalClaims

In [4]:
import pandas as pd

# Derive the per-policy risk and margin metrics on the already-loaded frame.

# A policy counts as having a claim when its total claim amount is positive.
df['HasClaim'] = df['TotalClaims'] > 0
df['Margin'] = df['TotalPremium'] - df['TotalClaims']

# NumClaims is derived here as a 0/1 indicator of HasClaim, not a true claim count.
df['NumClaims'] = df['HasClaim'].astype(int)

# Dividing by NumClaims == 0 gives NaN (0/0) or +/-inf (x/0). The infinities are
# mapped to 0; NaN is left as-is so no-claim policies stay out of severity stats.
# The result is assigned back instead of calling replace(..., inplace=True) on
# df['ClaimSeverity']: that chained in-place call acts on a temporary object and
# stops updating df in pandas 3.0.
df['ClaimSeverity'] = (df['TotalClaims'] / df['NumClaims']).replace(
    [float('inf'), -float('inf')], 0
)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['ClaimSeverity'].replace([float('inf'), -float('inf')], 0, inplace=True)


### 2. Data Segmentation & Testing Strategy
We test the following null hypotheses:

H₀.1: No risk difference across provinces
* Metric: Claim Frequency & Severity
* Test: ANOVA (more than 2 provinces) or Kruskal-Wallis if not normal

In [5]:
import scipy.stats as stats

# Keep only rows that have both a province and a claim indicator, then collect
# the HasClaim values province by province.
province_rows = df.dropna(subset=['Province', 'HasClaim'])
grouped = province_rows.groupby('Province')['HasClaim']

# One-way ANOVA: each province's HasClaim series is one sample.
province_samples = [claims for _, claims in grouped]
anova_result = stats.f_oneway(*province_samples)
print("ANOVA p-value (Claim Frequency across provinces):", anova_result.pvalue)


ANOVA p-value (Claim Frequency across provinces): 1.1584241237552373e-17


H₀.2: No risk difference between zip codes
To keep both groups large enough for a meaningful test, we compare the two zip codes with the most records.

* Metric: Claim Frequency
* Test: Chi-square or t-test

In [7]:
# Compare claim frequency between the two postal codes with the most records.
zips = df['PostalCode'].value_counts().head(2).index.tolist()
zip_data = df[df['PostalCode'].isin(zips)]

# HasClaim is a boolean column. The recorded run of this cell printed nan when the
# boolean series were passed to ttest_ind directly, while the same groups gave a
# finite p-value on the float Margin column. Cast to float so the test receives
# numeric 0.0/1.0 samples.
group1 = zip_data.loc[zip_data['PostalCode'] == zips[0], 'HasClaim'].astype(float)
group2 = zip_data.loc[zip_data['PostalCode'] == zips[1], 'HasClaim'].astype(float)

# Welch's t-test (equal_var=False): the two groups are not assumed to share a variance.
t_test_result = stats.ttest_ind(group1, group2, equal_var=False)
print("T-test p-value (Claim Frequency across zip codes):", t_test_result.pvalue)


T-test p-value (Claim Frequency across zip codes): nan


H₀.3: No significant margin difference between zip codes
* Metric: Margin
* Test: T-test

In [9]:
# Margin samples for the two postal codes selected earlier (zips / zip_data).
in_first_zip = zip_data['PostalCode'] == zips[0]
in_second_zip = zip_data['PostalCode'] == zips[1]
group1 = zip_data.loc[in_first_zip, 'Margin']
group2 = zip_data.loc[in_second_zip, 'Margin']

# Welch's t-test: the two groups are not assumed to share a variance.
t_test_margin = stats.ttest_ind(group1, group2, equal_var=False)
print("T-test p-value (Margin across zip codes):", t_test_margin.pvalue)


T-test p-value (Margin across zip codes): 0.25191600620729687


H₀.4: No significant risk difference between Women and Men
* Metric: Claim Frequency & Severity
* Test: T-test or Chi-square

In [10]:
# Infer gender from the policyholder's title. Only titles that clearly indicate a
# gender are mapped; any other title (and missing titles) becomes NaN, so it is
# dropped by the filter below instead of being counted as 'Female'.
# NOTE(review): assumes titles are spelled without a trailing period, as in the
# original 'Mr' check -- confirm against df['Title'].unique().
title_to_gender = {'Mr': 'Male', 'Mrs': 'Female', 'Ms': 'Female', 'Miss': 'Female'}
df['Gender'] = df['Title'].map(title_to_gender)
gender_group = df[df['Gender'].isin(['Male', 'Female'])]

# HasClaim is boolean. The recorded run of this cell printed nan when boolean
# series were passed to ttest_ind, so the samples are cast to float 0.0/1.0.
male_claims = gender_group.loc[gender_group['Gender'] == 'Male', 'HasClaim'].astype(float)
female_claims = gender_group.loc[gender_group['Gender'] == 'Female', 'HasClaim'].astype(float)

# Welch's t-test: the two groups are not assumed to share a variance.
t_test_gender = stats.ttest_ind(male_claims, female_claims, equal_var=False)
print("T-test p-value (Claim Frequency by Gender):", t_test_gender.pvalue)


T-test p-value (Claim Frequency by Gender): nan
