In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [17]:
# Read the cleaned dataset from disk into a DataFrame.
# low_memory=False makes pandas parse the file in a single pass rather than
# in chunks, which avoids mixed-dtype inference on individual columns.
DATA_PATH = '../data/data_cleaned.csv'
data = pd.read_csv(DATA_PATH, low_memory=False)

In [18]:
# Print a summary of the frame: index range, column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999544 entries, 0 to 999543
Data columns (total 51 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   UnderwrittenCoverID       999544 non-null  int64  
 1   PolicyID                  999544 non-null  int64  
 2   TransactionMonth          999544 non-null  object 
 3   IsVATRegistered           999544 non-null  bool   
 4   Citizenship               999544 non-null  object 
 5   LegalType                 999544 non-null  object 
 6   Title                     999544 non-null  object 
 7   Language                  999544 non-null  object 
 8   AccountType               999544 non-null  object 
 9   MaritalStatus             999544 non-null  object 
 10  Gender                    999544 non-null  object 
 11  Country                   999544 non-null  object 
 12  Province                  999544 non-null  object 
 13  PostalCode                999544 non-null  i

In [19]:
# Show the column labels of the loaded frame.
data.columns

Index(['UnderwrittenCoverID', 'PolicyID', 'TransactionMonth',
       'IsVATRegistered', 'Citizenship', 'LegalType', 'Title', 'Language',
       'AccountType', 'MaritalStatus', 'Gender', 'Country', 'Province',
       'PostalCode', 'MainCrestaZone', 'SubCrestaZone', 'ItemType', 'mmcode',
       'VehicleType', 'RegistrationYear', 'make', 'Model', 'Cylinders',
       'cubiccapacity', 'kilowatts', 'bodytype', 'NumberOfDoors',
       'VehicleIntroDate', 'AlarmImmobiliser', 'TrackingDevice',
       'CapitalOutstanding', 'NewVehicle', 'SumInsured', 'TermFrequency',
       'CalculatedPremiumPerTerm', 'ExcessSelected', 'CoverCategory',
       'CoverType', 'CoverGroup', 'Section', 'Product', 'StatutoryClass',
       'StatutoryRiskType', 'TotalPremium', 'TotalClaims', 'log_cubiccapacity',
       'log_kilowatts', 'sqrt_mmcode', 'scaled_log_cubiccapacity',
       'scaled_log_kilowatts', 'scaled_sqrt_mmcode'],
      dtype='object')

A good measure of risk is the TotalClaims column, which represents the total claims made per policy.

In [20]:
# Row-wise margin: premium collected minus claims paid out.
data['ProfitMargin'] = data['TotalPremium'].sub(data['TotalClaims'])

### Data Segmentation

#### Provinces

In [21]:
# Two province segments to compare: rows for Gauteng (A) and KwaZulu-Natal (B).
group_province_a = data.loc[data['Province'].eq('Gauteng')]
group_province_b = data.loc[data['Province'].eq('KwaZulu-Natal')]

#### Gender

In [22]:
# Two gender segments to compare: rows labelled 'Male' and rows labelled 'Female'.
group_male = data.loc[data['Gender'].eq('Male')]
group_female = data.loc[data['Gender'].eq('Female')]

### Statistical Testing

#### t-test for Risk Differences Across Provinces

In [23]:
from scipy.stats import ttest_ind

# Independent two-sample t-test on TotalClaims between the two province groups.
# NOTE(review): ttest_ind defaults to equal_var=True (pooled-variance Student's
# t-test); consider equal_var=False (Welch) if the group variances differ.
province_a_claims = group_province_a['TotalClaims']
province_b_claims = group_province_b['TotalClaims']
province_result = ttest_ind(province_a_claims, province_b_claims)
t_stat = province_result.statistic
p_value = province_result.pvalue

print(f"T-statistic: {t_stat}, P-value: {p_value}")

T-statistic: -1.2957062182938806, P-value: 0.19507724904787543


#### t-test for Risk Differences Between Men and Women

In [24]:
# Independent two-sample t-test on TotalClaims between the male and female groups.
# NOTE(review): ttest_ind defaults to equal_var=True (pooled-variance Student's
# t-test); consider equal_var=False (Welch) if the group variances differ.
male_claims = group_male['TotalClaims']
female_claims = group_female['TotalClaims']
gender_result = ttest_ind(male_claims, female_claims)
t_stat = gender_result.statistic
p_value = gender_result.pvalue

print(f"T-statistic: {t_stat}, P-value: {p_value}")

T-statistic: -0.24803623812388725, P-value: 0.8041073961270343


In [27]:
from scipy.stats import f_oneway

# Get unique zip codes (kept as a notebook-level name for any later cells)
unique_zip_codes = data['PostalCode'].unique()

# Group TotalClaims by zip code in a single pass.
# The previous version built one full-length boolean mask per unique postal
# code, i.e. O(rows x codes) work on ~1M rows. groupby partitions the column
# once. sort=False keeps the groups in first-appearance order, the same order
# .unique() yields, so the samples passed to f_oneway are unchanged.
# groupby also skips a NaN key instead of producing an empty group.
claims_by_zip = data.groupby('PostalCode', sort=False)['TotalClaims']
groups_zip = [claims for _, claims in claims_by_zip]

# Perform one-way ANOVA test: does mean TotalClaims differ across zip codes?
f_stat, p_value = f_oneway(*groups_zip)

print(f"F-statistic: {f_stat}, P-value: {p_value}")


F-statistic: 0.9409362422651795, P-value: 0.8948588787098132


In [29]:
# Collect the ProfitMargin values of each PostalCode into one list per code.
# The result is a Series indexed by PostalCode whose values are Python lists
# (no averaging happens here).
profit_margin_by_postal_code = data.groupby('PostalCode')['ProfitMargin']
grouped = profit_margin_by_postal_code.apply(list)

In [30]:
# One-way ANOVA across the per-PostalCode ProfitMargin samples:
# each list in `grouped` is passed to f_oneway as one group.
anova_result = f_oneway(*grouped)
f_statistic = anova_result.statistic
p_value = anova_result.pvalue

In [34]:
# Report the ANOVA F-statistic and p-value, one per line.
print(f_statistic, p_value, sep='\n')

0.8754317030012804
0.996773665457904
