In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
sys.path.append(os.path.abspath(os.path.join('..')))
from scripts.hypothesis_testing import *

In [2]:
# Load the raw ratings file, addressed relative to the notebook's directory.
data = pd.read_csv(
    '../data/MachineLearningRating_v3.csv',
    index_col=False,   # do not use the first column as the index
    low_memory=False,  # parse the file in one pass instead of internal chunks
)

## 1. Test for risk differences across provinces

### Null Hypothesis (H₀): There are no risk differences across provinces (in terms of TotalPremium).
### Alternative Hypothesis (H₁): There are risk differences across provinces.

In [3]:
# Per-row margin: TotalPremium minus TotalClaims, stored as a new column on `data`.
data['Margin'] = data['TotalPremium'].sub(data['TotalClaims'])

In [4]:
def print_test_results(result, risks):
    """Print the outcome of a statistical test followed by the per-group risks.

    Args:
        result: Mapping describing the test outcome. If it contains an
            'error' key, only that error value is printed. Otherwise the
            'test_type', 'statistic', 'p_value' and 'interpretation'
            entries are printed, one per line, in that order.
        risks: Object rendered after the test outcome under a "Risks:"
            heading (formatted with an f-string, so its ``str()`` form is used).

    Returns:
        None. Everything is written to standard output.
    """
    if 'error' not in result:
        # Successful test: report each field on its own line.
        print(f"Test type: {result['test_type']}")
        print(f"Statistic: {result['statistic']}")
        print(f"p-value: {result['p_value']}")
        print(result['interpretation'])
    else:
        # The test could not be run: show only the reported error.
        print(result['error'])

    # The risks are shown in both cases.
    print(f"Risks:\n{risks}\n")

In [5]:
# Test 1: compare TotalPremium across provinces.
print("1. Testing for risk differences across provinces")

# Per-province risk figures, shown alongside the test outcome.
province_risks = calculate_risk(
    data,
    'Province',
    'TotalPremium',
)

# ANOVA is requested because Province has more than two groups.
result = perform_statistical_test(
    data,
    'Province',
    'TotalPremium',
    'anova',
)

print_test_results(result, province_risks)

1. Testing for risk differences across provinces
Test type: anova
Statistic: 157.27016602047266
p-value: 3.8358007302190926e-266
Reject the null hypothesis (p-value: 0.0000). There is a significant difference.
Risks:
Province
Eastern Cape     70.546672
Free State       64.373780
Gauteng          61.071115
KwaZulu-Natal    77.800695
Limpopo          61.899038
Mpumalanga       53.801205
North West       52.276258
Northern Cape    49.617253
Western Cape     57.416793
Name: TotalPremium, dtype: float64



Reject the null hypothesis. There are risk differences across provinces.

## 2. Test for risk differences between zip codes

### Null Hypothesis (H₀): There are no risk differences between zip codes (in terms of TotalPremium).
### Alternative Hypothesis (H₁): There are risk differences between zip codes.

In [6]:
# Test 2: compare TotalPremium across postal codes.
print("2. Testing for risk differences between zipcodes")

# Per-postal-code risk figures.
zipcode_risks = calculate_risk(
    data,
    'PostalCode',
    'TotalPremium',
)

result = perform_statistical_test(
    data,
    'PostalCode',
    'TotalPremium',
    'anova',
)

# Only the five largest risk values are displayed to keep the output short.
print_test_results(result, zipcode_risks.nlargest(n=5))

2. Testing for risk differences between zipcodes
Test type: anova
Statistic: 10.811115758352543
p-value: 0.0
Reject the null hypothesis (p-value: 0.0000). There is a significant difference.
Risks:
PostalCode
3887    196.635975
4016    195.716263
9744    175.104079
3802    172.142169
3740    171.417242
Name: TotalPremium, dtype: float64



## 3. Test for Risk Differences Between Women and Men
### Null Hypothesis (H₀): There is no significant difference in risk between males and females in terms of Total Premium.
### Alternative Hypothesis (H₁): There is a significant difference in risk between males and females.

Conclusion for Test 2 (zip codes): Reject the null hypothesis. There are risk differences between zip codes.

In [7]:
# Test 3: compare TotalPremium between women and men.
# The banner is numbered 3 to match the "## 3." section heading; it previously
# printed "4." although this is the third test in the notebook.
print("3. Testing for risk differences between Women and Men")

# Keep only rows whose Gender is exactly 'Male' or 'Female'; every other value
# is excluded (presumably so the t-test compares exactly two groups).
filtered_data = data[data['Gender'].isin(['Male', 'Female'])]

# Per-gender risk figures on the filtered rows.
gender_risks = calculate_risk(filtered_data, 'Gender', 'TotalPremium')

result = perform_statistical_test(filtered_data, 'Gender', 'TotalPremium', 't_test')
print_test_results(result, gender_risks)

4. Testing for risk differences between Women and Men
Test type: t_test
Statistic: -5.118420932688848
p-value: 3.0925282750010697e-07
Reject the null hypothesis (p-value: 0.0000). There is a significant difference.
Risks:
Gender
Female    45.074841
Male      36.904566
Name: TotalPremium, dtype: float64



Reject the null hypothesis. There is a significant difference in risk between males and females.