In [4]:
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind, chi2_contingency
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ttest_ind
import scipy.stats as stats

In [2]:
import sys, os
sys.path.append(os.path.abspath(os.path.join(os.path.dirname("../scripts"))))

In [3]:
from scripts.missing_value import handle_missing_values

In [4]:
# Load data
data = pd.read_csv("../data/MachineLearningRating_v3.csv")  

# Preview the data
print(data.head())

# Handle missing values (if needed)
  # Or use imputation techniques



  data = pd.read_csv("../data/MachineLearningRating_v3.csv")


   UnderwrittenCoverID  PolicyID     TransactionMonth  IsVATRegistered  \
0               145249     12827  2015-03-01 00:00:00             True   
1               145249     12827  2015-05-01 00:00:00             True   
2               145249     12827  2015-07-01 00:00:00             True   
3               145255     12827  2015-05-01 00:00:00             True   
4               145255     12827  2015-07-01 00:00:00             True   

  Citizenship          LegalType Title Language                 Bank  \
0              Close Corporation    Mr  English  First National Bank   
1              Close Corporation    Mr  English  First National Bank   
2              Close Corporation    Mr  English  First National Bank   
3              Close Corporation    Mr  English  First National Bank   
4              Close Corporation    Mr  English  First National Bank   

       AccountType  ...                    ExcessSelected CoverCategory  \
0  Current account  ...             Mobility - 

In [5]:
# Create new features for margin and risk ratio
data['Margin'] = data['TotalPremium'] - data['TotalClaims']
data['RiskRatio'] = data['TotalClaims'] / (data['TotalPremium'] + 1e-9)  # Avoid division by zero


In [6]:
print(data['Province'].value_counts())
print(data['PostalCode'].value_counts())
print(data['Gender'].value_counts())

Province
Gauteng          393865
Western Cape     170796
KwaZulu-Natal    169781
North West       143287
Mpumalanga        52718
Eastern Cape      30336
Limpopo           24836
Free State         8099
Northern Cape      6380
Name: count, dtype: int64
PostalCode
2000    133498
122      49171
7784     28585
299      25546
7405     18518
         ...  
284          1
7560         1
2210         1
3655         1
6655         1
Name: count, Length: 888, dtype: int64
Gender
Not specified    940990
Male              42817
Female             6755
Name: count, dtype: int64


In [10]:
# Data Segmentation

# Filter out relevant columns (You can also do this in the previous pre-processing step)
data['TotalPremium'] = pd.to_numeric(data['TotalPremium'], errors='coerce')
data['TotalClaims'] = pd.to_numeric(data['TotalClaims'], errors='coerce')

# Drop rows with NaN values in our metrics of interest
data.dropna(subset=['TotalPremium', 'TotalClaims'], inplace=True)

# Create groups for Province
province_a = data[data['Province'] == 'Gauteng']  # Change to actual province name
province_b = data[data['Province'] == 'Western Cape']  # Change to actual province name

# For Gender Comparison
gender_men = data[data['Gender'] == 'Male']
gender_women = data[data['Gender'] == 'Female']

In [11]:
#Statistical Testing
#Risk Differences Across Provinces:
# T-test for TotalClaims across provinces
t_stat, p_value = stats.ttest_ind(province_a['TotalClaims'], province_b['TotalClaims'], equal_var=False)
print("T-test between provinces:")
print("t-statistic:", t_stat)
print("p-value:", p_value)

# Hypothesis
if p_value < 0.05:
    print("Reject null hypothesis: There are risk differences across provinces.")
else:
    print("Fail to reject null hypothesis: No risk differences across provinces.")

T-test between provinces:
t-statistic: 1.8652143496485993
p-value: 0.06215231452280004
Fail to reject null hypothesis: No risk differences across provinces.


In [12]:
#Risk Differences Between Zip Codes:
# Create zip code groups
zip_a = data[data['PostalCode'] == '2000']  # Replace with actual zip code
zip_b = data[data['PostalCode'] == '122']

# T-test for TotalClaims
t_stat, p_value = stats.ttest_ind(zip_a['TotalClaims'], zip_b['TotalClaims'], equal_var=False)
print("T-test between zip codes:")
print("t-statistic:", t_stat)
print("p-value:", p_value)

T-test between zip codes:
t-statistic: nan
p-value: nan


  t_stat, p_value = stats.ttest_ind(zip_a['TotalClaims'], zip_b['TotalClaims'], equal_var=False)


In [13]:
#Margin Differences Between Zip Codes:
zip_a['Margin'] = zip_a['TotalPremium'] - zip_a['TotalClaims']
zip_b['Margin'] = zip_b['TotalPremium'] - zip_b['TotalClaims']

# T-test for Margin
t_stat, p_value = stats.ttest_ind(zip_a['Margin'], zip_b['Margin'], equal_var=False)
print("T-test for margin between zip codes:")
print("t-statistic:", t_stat)
print("p-value:", p_value)

T-test for margin between zip codes:
t-statistic: nan
p-value: nan


  t_stat, p_value = stats.ttest_ind(zip_a['Margin'], zip_b['Margin'], equal_var=False)


In [None]:
print(zip_a['Margin'].count())  # Number of valid entries
print(zip_b['Margin'].count())  # Number of valid entries

In [14]:
#Risk Differences Between Women and Men:

t_stat, p_value = stats.ttest_ind(gender_men['TotalClaims'], gender_women['TotalClaims'], equal_var=False)
print("T-test between genders:")
print("t-statistic:", t_stat)
print("p-value:", p_value)

# Hypothesis
if p_value < 0.05:
    print("Reject null hypothesis: There are significant risk differences between women and men.")
else:
    print("Fail to reject null hypothesis: No significant risk differences between women and men.")

T-test between genders:
t-statistic: -0.296353891400699
p-value: 0.7669656471629474
Fail to reject null hypothesis: No significant risk differences between women and men.


In [5]:
# Load data
data = pd.read_csv("../data/MachineLearningRating_v3.csv")

  data = pd.read_csv("../data/MachineLearningRating_v3.csv")


In [None]:
#Statistical Modeling

# Handling Missing Data
data.fillna(data.mean(), inplace=True)  # Impute missing values with the mean for numeric cols

# Feature Engineering: Create Margin feature
data['Margin'] = data['TotalPremium'] - data['TotalClaims']

# Encoding Categorical Data
data = pd.get_dummies(data, columns=['Citizenship', 'LegalType', 'Title', 'Language','Bank', 'AccountType', 'Province'], drop_first=True)

# Train-Test Split
from sklearn.model_selection import train_test_split

X = data.drop(['TotalPremium', 'TotalClaims', 'UnderwrittenCoverID', 'PolicyID', 'TransactionMonth'], axis=1)
y = data['TotalPremium']  # Example target variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
#Model Building

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

# Linear Regression
lr = LinearRegression()
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_test)

# Random Forest
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)

# XGBoost
xgb = XGBRegressor(random_state=42)
xgb.fit(X_train, y_train)
xgb_pred = xgb.predict(X_test)

In [None]:
# Model Evaluation

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Function to evaluate models
def evaluate_model(y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    return mse, mae, r2

# Evaluate each model
lr_metrics = evaluate_model(y_test, lr_pred)
rf_metrics = evaluate_model(y_test, rf_pred)
xgb_metrics = evaluate_model(y_test, xgb_pred)

print("Linear Regression:", lr_metrics)
print("Random Forest:", rf_metrics)
print("XGBoost:", xgb_metrics)

In [None]:
#  Feature Importance Analysis and Interpretation

# Feature Importance Analysis
importances = rf.feature_importances_
features = X.columns
indices = np.argsort(importances)[::-1]

# Print feature ranking
print("Feature ranking:")
for f in range(X.shape[1]):
    print(f"{f + 1}. feature {features[indices[f]]} ({importances[indices[f]]})")