In [3]:
import sys
import os
import random
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import shap
import matplotlib.pyplot as plt

sys.path.append(os.path.abspath('../scripts'))
sys.path.append(os.path.abspath('../src')) 

from load_data import DataLoader
import path 

from Hypothesis_Analysis import hypothesis
from Data_preparation import DataPreparation
from model_deve import Modeling
from feature_importance import FeatureImportance


In [None]:
# Load the insurance dataset through the project's DataLoader helper.
# NOTE(review): `path.get_clead_data()` presumably returns the location of the cleaned CSV
# ("clead" looks like a typo for "clean"/"cleaned") — confirm in the project's `path` module.
csv_path = path.get_clead_data()
data_load = DataLoader(csv_path)
# NOTE(review): the following cell assigns `df` again with pd.read_csv, replacing this result.
df = data_load.load_csv_data()

In [4]:
# Read the cleaned insurance CSV into `df`. low_memory=False makes pandas parse the file
# in a single pass so each column's dtype is inferred from all rows, not chunk by chunk.
# NOTE(review): the relative path resolves against the kernel's working directory.
df = pd.read_csv('../../Data/cleaned_insurance_data.csv', low_memory=False)

In [5]:
# Build the project's `hypothesis` helper around `df`; the A/B-testing cells below call its methods.
hypoth = hypothesis(df)

## A/B Hypothesis Testing
For this analysis, "risk" will be quantified by two metrics: Claim Frequency (proportion of policies with at least one claim) and Claim Severity (the average amount of a claim, given a claim occurred). "Margin" is defined as (TotalPremium - TotalClaims).


In [None]:
# Performs an ANOVA test across groups, filtering out any group with only one unique value.
# Returns the p-value or a warning message.

In [9]:
# Coerce the premium and claim amounts to a numeric dtype; values that cannot be parsed become NaN.
for amount_column in ("TotalPremium", "TotalClaims"):
    df[amount_column] = pd.to_numeric(df[amount_column], errors="coerce")

In [10]:
# Derive the per-row KPI columns used by the statistical tests.
# HasClaim is True when the row's claim amount is positive.
df["HasClaim"] = df["TotalClaims"].gt(0)
# Margin is the premium minus the claims for the row.
df["Margin"] = df["TotalPremium"].sub(df["TotalClaims"])

### 1. ANOVA: Risk differences across Provinces (Claim Frequency)

In [11]:
from scipy.stats import ttest_ind, f_oneway

# One 0/1 claim-indicator sample per province, compared with a one-way ANOVA.
province_groups = [
    has_claim.astype(int)
    for _, has_claim in df.groupby("Province")["HasClaim"]
]
anova_province_risk = f_oneway(*province_groups)

In [12]:
# Display the p-value of the province-level ANOVA on the claim indicator.
anova_province_risk.pvalue

np.float64(5.91210036318191e-19)

In [13]:
# 2. ANOVA: Risk differences between Zip Codes (Claim Frequency)
# Each postal code contributes one sample of 0/1 claim indicators.
zip_groups = [
    has_claim.astype(int)
    for _, has_claim in df.groupby("PostalCode")["HasClaim"]
]
anova_zip_risk = f_oneway(*zip_groups)

In [14]:
# 3. ANOVA: Margin differences between Zip Codes
# Each postal code contributes its non-missing Margin values as one sample.
margin_groups = [
    margins.dropna()
    for _, margins in df.groupby("PostalCode")["Margin"]
]
anova_zip_margin = f_oneway(*margin_groups)

### Performs T-test between Male and Female on claim risk

In [15]:
# 4. T-test: Risk differences by Gender
# Select the claim indicator for each gender as 0/1 integers, then run a two-sample t-test.
men = df.loc[df["Gender"].eq("Male"), "HasClaim"].astype(int)
women = df.loc[df["Gender"].eq("Female"), "HasClaim"].astype(int)
ttest_gender_risk = ttest_ind(men, women, nan_policy="omit")

In [16]:
# Gather the p-value of every test above under a descriptive label.
results = {
    label: test_outcome.pvalue
    for label, test_outcome in (
        ("Province Risk ANOVA p-value", anova_province_risk),
        ("Zip Risk ANOVA p-value", anova_zip_risk),
        ("Zip Margin ANOVA p-value", anova_zip_margin),
        ("Gender Risk T-test p-value", ttest_gender_risk),
    )
}

results

{'Province Risk ANOVA p-value': np.float64(5.91210036318191e-19),
 'Zip Risk ANOVA p-value': np.float64(2.9076595484940585e-30),
 'Zip Margin ANOVA p-value': np.float64(0.9976859758015036),
 'Gender Risk T-test p-value': np.float64(0.8404980845002314)}

### To perform controlled A/B testing on insurance plan features (e.g., AlarmImmobiliser, TrackingDevice, CoverType, etc.), we need to follow a clear and reproducible approach.

In [18]:
# Compute the KPI columns through the project helper and display what it returns.
# The recorded output is a frame that includes HasClaim, Margin and ClaimSeverity columns.
hypoth.compute_kpis()

Unnamed: 0.1,Unnamed: 0,TransactionMonth,IsVATRegistered,Citizenship,MaritalStatus,Gender,Province,PostalCode,VehicleType,RegistrationYear,...,CoverGroup,Section,Product,StatutoryClass,StatutoryRiskType,TotalPremium,TotalClaims,HasClaim,Margin,ClaimSeverity
0,0,2015-03-01 00:00:00,True,,Not specified,Not specified,Gauteng,1459,Passenger Vehicle,2004,...,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,21.929825,0.0,0,21.929825,0.0
1,1,2015-05-01 00:00:00,True,,Not specified,Not specified,Gauteng,1459,Passenger Vehicle,2004,...,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,21.929825,0.0,0,21.929825,0.0
2,2,2015-07-01 00:00:00,True,,Not specified,Not specified,Gauteng,1459,Passenger Vehicle,2004,...,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,0.000000,0.0,0,0.000000,0.0
3,3,2015-05-01 00:00:00,True,,Not specified,Not specified,Gauteng,1459,Passenger Vehicle,2004,...,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,512.848070,0.0,0,512.848070,0.0
4,4,2015-07-01 00:00:00,True,,Not specified,Not specified,Gauteng,1459,Passenger Vehicle,2004,...,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,0.000000,0.0,0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1000093,1000093,2015-04-01 00:00:00,False,ZW,Single,Male,Western Cape,7493,Passenger Vehicle,2013,...,Comprehensive - Taxi,Motor Comprehensive,Mobility Commercial Cover: Monthly,Commercial,IFRS Constant,347.235175,0.0,0,347.235175,0.0
1000094,1000094,2015-06-01 00:00:00,False,ZW,Single,Male,Western Cape,7493,Passenger Vehicle,2013,...,Comprehensive - Taxi,Motor Comprehensive,Mobility Commercial Cover: Monthly,Commercial,IFRS Constant,347.235175,0.0,0,347.235175,0.0
1000095,1000095,2015-08-01 00:00:00,False,ZW,Single,Male,Western Cape,7493,Passenger Vehicle,2013,...,Comprehensive - Taxi,Motor Comprehensive,Mobility Commercial Cover: Monthly,Commercial,IFRS Constant,347.235175,0.0,0,347.235175,0.0
1000096,1000096,2014-07-01 00:00:00,False,ZW,Single,Male,Western Cape,7493,Passenger Vehicle,2013,...,Comprehensive - Taxi,Motor Comprehensive,Mobility Commercial Cover: Monthly,Commercial,IFRS Constant,2.315000,0.0,0,2.315000,0.0


In [13]:
# Build the two comparison groups on the Gender feature.
# Presumably group_a holds the "Male" rows and group_b the "Female" rows — confirm in the helper.
group_a, group_b = hypoth.create_ab_groups("Gender", "Male", "Female")

In [14]:
# Test whether the Margin KPI differs between the two gender groups and report the p-value.
p_value = hypoth.compare_kpis(group_a, group_b, "Margin")
print(f"P-value for Margin difference between Male and Female: {p_value}")

P-value for Margin difference between Male and Female: 0.8015464193501282


In [15]:
# Run the Male-vs-Female A/B comparison; the call yields a mapping of KPI label -> p-value
# plus the size of each group (see the report printed by the next cell).
# NOTE: this rebinds `results`, replacing the ANOVA/t-test summary dict built earlier.
results, size_a, size_b = hypoth.run_ab_test("Gender", "Male", "Female")

In [16]:
# Assemble the report (group sizes, then each KPI's p-value to four decimals) and print it at once.
report_lines = [
    f"Group Sizes -> A: {size_a}, B: {size_b}",
    "P-Values for KPIs:",
]
report_lines.extend(
    f"{kpi_label}: {kpi_p_value:.4f}" for kpi_label, kpi_p_value in results.items()
)
print("\n".join(report_lines))

Group Sizes -> A: 42817, B: 6755
P-Values for KPIs:
HasClaim (p-value): 0.8372
ClaimSeverity (p-value): 0.7670
Margin (p-value): 0.8015


### Statistical Testing

In [None]:
# NOTE(review): this cell has no recorded execution and passes no arguments —
# confirm that `perform_t_test` can be called without a feature, groups or KPI.
hypoth.perform_t_test()

In [None]:
# NOTE(review): this cell has no recorded execution and passes no arguments —
# confirm that `perform_chi_squared` can be called without a feature or KPI.
hypoth.perform_chi_squared()

In [20]:
# Example 1: compare Male and Female policyholders on every KPI
# (the recorded output reports one T-Test row per KPI).
results_gender = hypoth.run_all_tests(
    feature="Gender",
    group_a_val="Male",
    group_b_val="Female",
)
print(results_gender)

     Test Feature Group A Group B            KPI   P-Value Significant
0  T-Test  Gender    Male  Female       HasClaim  0.837187          No
1  T-Test  Gender    Male  Female  ClaimSeverity  0.766966          No
2  T-Test  Gender    Male  Female         Margin  0.801546          No


In [21]:
# Example 2: Chi-Squared test for the effect of Province on claim frequency (HasClaim).
# No group values are passed, so the whole feature is tested at once.
results_province = hypoth.run_all_tests(feature="Province")
print(results_province)

          Test   Feature       KPI       P-Value Significant
0  Chi-Squared  Province  HasClaim  5.925511e-19         Yes


In [22]:
# Example 3: Chi-Squared test for the effect of PostalCode on claim frequency (HasClaim).
results_PostalCode = hypoth.run_all_tests(feature="PostalCode")
print(results_PostalCode)

          Test     Feature       KPI       P-Value Significant
0  Chi-Squared  PostalCode  HasClaim  3.152172e-30         Yes


# Statistical Modeling
### Data Preparation

In [19]:
# Count the missing values in every column before preparing the data for modeling.
df.isna().sum()

Unnamed: 0                  0
TransactionMonth            0
IsVATRegistered             0
Citizenship                 0
MaritalStatus               0
Gender                      0
Province                    0
PostalCode                  0
VehicleType                 0
RegistrationYear            0
make                        0
Model                       0
Cylinders                   0
cubiccapacity               0
kilowatts                   0
bodytype                    0
NumberOfDoors               0
VehicleIntroDate            0
AlarmImmobiliser            0
TrackingDevice              0
CapitalOutstanding          0
SumInsured                  0
TermFrequency               0
CalculatedPremiumPerTerm    0
ExcessSelected              0
CoverCategory               0
CoverType                   0
CoverGroup                  0
Section                     0
Product                     0
StatutoryClass              0
StatutoryRiskType           0
TotalPremium                0
TotalClaim

In [6]:
# Parse TransactionMonth into pandas datetimes, writing the result back to the same column.
df["TransactionMonth"] = df["TransactionMonth"].pipe(pd.to_datetime)

In [7]:
# Discard the leftover index column that came in with the CSV.
df = df.drop(columns=["Unnamed: 0"])

In [8]:
# Remove the columns that are not used as model inputs.
df = df.drop(
    columns=[
        'Citizenship',
        'MaritalStatus',
        'Gender',
        'make',
        'CoverType',
        'CoverGroup',
        'Section',
        'Model',
        'TermFrequency',
        'StatutoryClass',
        'CapitalOutstanding',
        'ExcessSelected',
        'CoverCategory',
        'StatutoryRiskType',
        'bodytype',
    ]
)

In [None]:
# Categorical columns are those stored with an object or category dtype.
categorical_cols = df.select_dtypes(include=["object", "category"]).columns

# Print the frequency table of each categorical column in turn.
for col, column_values in df[categorical_cols].items():
    print(f"\nValue counts for '{col}':")
    print(column_values.value_counts())


Value counts for 'Province':
Province
Gauteng          393865
Western Cape     170796
KwaZulu-Natal    169781
North West       143287
Mpumalanga        52718
Eastern Cape      30336
Limpopo           24836
Free State         8099
Northern Cape      6380
Name: count, dtype: int64

Value counts for 'VehicleType':
VehicleType
Passenger Vehicle    934150
Medium Commercial     53985
Heavy Commercial       7401
Light Commercial       3897
Bus                     665
Name: count, dtype: int64

Value counts for 'AlarmImmobiliser':
AlarmImmobiliser
Yes    999861
No        237
Name: count, dtype: int64

Value counts for 'TrackingDevice':
TrackingDevice
No     656617
Yes    343481
Name: count, dtype: int64

Value counts for 'Product':
Product
Mobility Commercial Cover: Monthly    915028
Mobility Metered Taxis: Monthly        79272
Bridge Taxi Finance: Monthly            5254
Standalone Passenger Liability           544
Name: count, dtype: int64


In [9]:

# Instantiate the project helpers used in the remaining cells:
# `preparation` for data preparation, `model` for fitting/evaluating, `plot` for feature importance.
preparation = DataPreparation()
model = Modeling()
plot = FeatureImportance()

In [10]:
# Fill the gaps in numerical and categorical columns through the project helper,
# then list only the columns that still contain missing values.
df = preparation.handle_missing_data(df)
df.isna().sum().loc[lambda null_counts: null_counts > 0]

Series([], dtype: int64)

In [11]:
# Create new features relevant to TotalPremium and TotalClaims.
# NOTE(review): the recorded output shows a warning pointing at
# `pd.to_datetime(df['VehicleIntroDate'])` inside this helper — presumably about date-format
# inference; consider passing an explicit `format=` there.
df = preparation.feature_engineering(df)

  df['VehicleIntroDate'] = pd.to_datetime(df['VehicleIntroDate'])


In [12]:
# (rows, columns) of the frame after feature engineering.
df.shape

(1000098, 21)

### Encoding Categorical Data: Convert categorical data into a numeric format using one-hot encoding or label encoding to make it suitable for modeling.

In [13]:
# One-hot encode the categorical variables through the project helper,
# then show the new (rows, columns) to see how many columns the encoding added.
df = preparation.categorical_encoding(df)
df.shape

(1000098, 33)

In [14]:
# Split the prepared frame into training and testing sets; 0.2 is the share held out for testing.
# NOTE(review): no random seed is visible here — confirm the helper fixes `random_state`.
X_train, X_test, y_train, y_test = preparation.Train_Test_Split(df, 0.2)

# Show the shape of each part on a single line.
print(*(split_part.shape for split_part in (X_train, X_test, y_train, y_test)))

(800078, 31) (200020, 31) (800078,) (200020,)


# Modeling Techniques
## Linear Regression

In [15]:
# Fit the linear-regression model on the training split, then evaluate it on the test split.
# The evaluation helper prints MSE, MAE, RMSE and R-squared; its return value is kept as `y_pred`
# (presumably the test-set predictions — confirm in the Modeling class).
lin_reg = model.linear_regression(X_train, y_train)
y_pred = model.model_performamnce(lin_reg, X_test, y_test)

Mean Squared Error: 1963853.076117056
Mean Absolute Error: 79.01400897091249
Root Mean Squared Error: 1401.3754229745346
R-squared: 0.5980971131827577


## Decision Trees

In [16]:
# Fit the decision-tree regressor on the training split, then evaluate it on the test split.
# The evaluation helper prints MSE, MAE, RMSE and R-squared.
dt_reg = model.decision_tree(X_train, y_train)
y_pred_dt = model.model_performamnce(dt_reg, X_test, y_test)

Mean Squared Error: 157046.14516155
Mean Absolute Error: 4.4596627574084495
Root Mean Squared Error: 396.29048078593814
R-squared: 0.967860478020717


## Random Forests

In [None]:
# Fit the random-forest regressor on the training split, then evaluate it on the test split.
# NOTE(review): this cell has no recorded execution, yet a later cell reads
# `rf_reg.feature_importances_` — it must be run before that cell.
rf_reg = model.random_forest(X_train, y_train)
y_pred_rf = model.model_performamnce(rf_reg, X_test, y_test)

## Gradient Boosting

In [33]:
# Fit the XGBoost regressor on the training split, then evaluate it on the test split.
# NOTE(review): the recorded R-squared is negative (about -0.25), i.e. worse than always
# predicting the mean — the model configuration or inputs should be re-checked.
xgb_reg = model.XGBRegressor_model(X_train, y_train)
y_pred_xgb = model.model_performamnce(xgb_reg, X_test, y_test)

Mean Squared Error: 6089800.217714862
Mean Absolute Error: 150.15704223269717
Root Mean Squared Error: 2467.7520575849717
R-squared: -0.24627871473924334


## Feature Importance Analysis
Analyze which features are most influential in the fitted models' predictions.

In [None]:
# Define the models as (display label, fitted estimator) pairs.
# Only the linear model is active; the tree-based entries are left commented out.
models = [
    ("Linear Regression", lin_reg), 
    # ("Decision Tree Regressor", dt_reg), 
    # ("Random Forest Regressor", rf_reg), 
    # ("XGBoost Regressor", xgb_reg)
]

# Plot feature importance for every listed model, using X_train for the feature set.
plot.plot_feature_importance(models, X_train)

In [None]:
# Plot the XGBoost model's feature importance.
# NOTE(review): the last argument contains literal single quotes inside the string; if it is
# used as the plot title, the quotes will be rendered — confirm this is intended.
plot.feature_importance(xgb_reg, X_train, "'XGBoost Feature Importance'")

In [None]:
# Top-10 feature importances of the fitted Random Forest model.
# pandas and matplotlib are already imported in the notebook's first cell.

# Pair every training column with the importance the forest assigned to it.
feat_importance_rf = rf_reg.feature_importances_
feat_names = X_train.columns

feature_importance_df = pd.DataFrame({
    'Feature': feat_names,
    'Importance': feat_importance_rf
})

# Rank by importance and keep the 10 highest rows. `.head(10)` is used instead of a
# `[1:10]` slice, which would skip the top-ranked feature and return only 9 rows.
top_10_features = (
    feature_importance_df
    .sort_values(by='Importance', ascending=False)
    .head(10)
)

# Horizontal bar chart; the y-axis is inverted so the most important feature is on top.
fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(top_10_features['Feature'], top_10_features['Importance'])
ax.set_xlabel('Feature Importance')
ax.set_title('Top 10 Random Forest Feature Importance')
ax.invert_yaxis()
plt.show()

In [None]:
# Top-10 feature importances of the fitted Decision Tree model (`dt_reg`).
# NOTE: the variable names below are shared with the Random Forest cell and are overwritten
# here; `feat_importance_rf` holds the decision tree's importances after this cell runs.
feat_importance_rf = dt_reg.feature_importances_
feat_names = X_train.columns

# Pair every training column with the importance the tree assigned to it.
feature_importance_df = pd.DataFrame({
    'Feature': feat_names,
    'Importance': feat_importance_rf
})

# Rank by importance and keep the 10 highest rows. `.head(10)` is used instead of a
# `[1:10]` slice, which would skip the top-ranked feature and return only 9 rows.
top_10_features = (
    feature_importance_df
    .sort_values(by='Importance', ascending=False)
    .head(10)
)

# Horizontal bar chart; the y-axis is inverted so the most important feature is on top.
# The title names the Decision Tree because that is the model the importances come from.
fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(top_10_features['Feature'], top_10_features['Importance'])
ax.set_xlabel('Feature Importance')
ax.set_title('Top 10 Decision Tree Feature Importance')
ax.invert_yaxis()
plt.show()