In [1]:
# 1. Imports & Setup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import shap
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, f1_score, roc_auc_score

import warnings
# Session-wide display setup: silence every warning category and use seaborn's
# "whitegrid" style for all figures.
# NOTE(review): a blanket "ignore" also hides any warnings raised by the
# modelling cells further down.
warnings.simplefilter("ignore")
sns.set_style("whitegrid")


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 2. Load Dataset
# The raw extract is a pipe-delimited text file, addressed relative to the
# notebook's working directory.
df = pd.read_csv(
    filepath_or_buffer='../data/MachineLearningRating_v3.txt',
    sep="|",
)

# Preview the first five rows.
df.head(n=5)


Unnamed: 0,UnderwrittenCoverID,PolicyID,TransactionMonth,IsVATRegistered,Citizenship,LegalType,Title,Language,Bank,AccountType,...,ExcessSelected,CoverCategory,CoverType,CoverGroup,Section,Product,StatutoryClass,StatutoryRiskType,TotalPremium,TotalClaims
0,145249,12827,2015-03-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Windscreen,Windscreen,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,21.929825,0.0
1,145249,12827,2015-05-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Windscreen,Windscreen,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,21.929825,0.0
2,145249,12827,2015-07-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Windscreen,Windscreen,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,0.0,0.0
3,145255,12827,2015-05-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Metered Taxis - R2000,Own damage,Own Damage,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,512.84807,0.0
4,145255,12827,2015-07-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Metered Taxis - R2000,Own damage,Own Damage,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,0.0,0.0


In [None]:
# 3. Data Cleaning & Feature Engineering

# Derived columns, both computed from TotalClaims:
#   HasClaim  -> 1 where TotalClaims is positive, otherwise 0
#   LossRatio -> TotalClaims divided by (TotalPremium + 1)
# NOTE(review): because both come from TotalClaims they carry the claim outcome;
# check how they are used before treating them as model inputs.
claims = df['TotalClaims']
df = df.assign(
    HasClaim=claims.gt(0).astype(int),
    LossRatio=claims.div(df['TotalPremium'].add(1)),
)

# Remove identifier columns; names that are absent from the frame are skipped.
df = df.drop(columns=['PolicyID', 'ClientID'], errors='ignore')

# One-hot encode every object-dtype column (first level of each dropped),
# then replace all remaining missing values with 0.
cat_cols = df.select_dtypes(include='object').columns
df = pd.get_dummies(df, columns=cat_cols, drop_first=True).fillna(0)


In [None]:
# 4. Train/Test Split
#
# Builds the feature/target pairs and 80/20 splits for the three tasks:
#   *_s  claim severity   - regression on TotalClaims, rows with a claim only
#   *_p  premium          - regression on CalculatedPremiumPerTerm
#   *_c  claim occurrence - classification on HasClaim

# These three columns all encode the claim outcome: HasClaim is TotalClaims > 0
# and LossRatio is TotalClaims / (TotalPremium + 1). Leaving any of them in a
# feature matrix hands the model its own target (severity, classification) or
# information that only exists after the policy was priced (premium), so they
# are removed from every X below.
CLAIM_OUTCOME_COLS = ['TotalClaims', 'HasClaim', 'LossRatio']

df_claims = df[df['TotalClaims'] > 0]  # For severity model

# Split for Claim Severity Prediction
X_severity = df_claims.drop(columns=CLAIM_OUTCOME_COLS)
y_severity = df_claims['TotalClaims']
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X_severity, y_severity, test_size=0.2, random_state=42
)

# Split for Premium Prediction
# NOTE(review): TotalPremium stays in as a feature; it is presumably very close
# to the target CalculatedPremiumPerTerm - confirm whether it should go as well.
X_premium = df.drop(columns=['CalculatedPremiumPerTerm'] + CLAIM_OUTCOME_COLS)
y_premium = df['CalculatedPremiumPerTerm']
X_train_p, X_test_p, y_train_p, y_test_p = train_test_split(
    X_premium, y_premium, test_size=0.2, random_state=42
)

# Split for Classification
# Stratify on the label so train and test keep the same share of claim rows.
X_class = df.drop(columns=CLAIM_OUTCOME_COLS)
y_class = df['HasClaim']
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
    X_class, y_class, test_size=0.2, random_state=42, stratify=y_class
)


In [None]:
# 5. Claim Severity Prediction
# Each regressor is fitted on the severity training split and scored on the
# severity test split; RMSE and R2 are printed per model.

models = {}
models["Linear Regression"] = LinearRegression()
models["Random Forest"] = RandomForestRegressor(random_state=42)
models["XGBoost"] = xgb.XGBRegressor(random_state=42)

for name, model in models.items():
    model.fit(X_train_s, y_train_s)
    preds = model.predict(X_test_s)

    mse = mean_squared_error(y_test_s, preds)
    rmse, r2 = np.sqrt(mse), r2_score(y_test_s, preds)
    print(f"{name}: RMSE = {rmse:.2f}, R2 = {r2:.2f}")


In [None]:
# 6. SHAP for Best Model (e.g., XGBoost on Severity)
# Build an explainer around the XGBoost entry of `models`, evaluate it on the
# severity test features, and draw a bar-type summary plot of the result.
xgb_model = models["XGBoost"]
explainer = shap.Explainer(xgb_model)
shap_values = explainer(X_test_s)

shap.summary_plot(shap_values, features=X_test_s, plot_type="bar")


In [None]:
# 7. Premium Prediction
# Fits the same three regressor types on the premium split and prints RMSE and
# R2 on the premium test split.

# Fresh estimator instances are used here. Refitting the objects stored in
# `models` would overwrite the fitted severity models, which the SHAP cell reads
# through models["XGBoost"]; re-running that cell afterwards would then explain
# a premium model against severity features.
premium_models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(random_state=42),
    "XGBoost": xgb.XGBRegressor(random_state=42)
}

for name, model in premium_models.items():
    model.fit(X_train_p, y_train_p)
    preds = model.predict(X_test_p)
    rmse = np.sqrt(mean_squared_error(y_test_p, preds))
    r2 = r2_score(y_test_p, preds)
    print(f"{name} (Premium): RMSE = {rmse:.2f}, R2 = {r2:.2f}")


In [None]:
# 8. Claim Classification
# Fits three classifiers on the HasClaim split and prints accuracy, F1 and
# ROC-AUC on the test split. ROC-AUC is computed from the predicted probability
# of the positive class (column 1 of predict_proba).

clf_models = {
    # max_iter is raised above scikit-learn's default of 100: the features are
    # not standardised in this notebook and warnings are silenced in the setup
    # cell, so a solver that stopped before converging would go unnoticed.
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(random_state=42),
    "XGBoost": xgb.XGBClassifier(random_state=42)
}

for name, model in clf_models.items():
    model.fit(X_train_c, y_train_c)
    preds = model.predict(X_test_c)
    probas = model.predict_proba(X_test_c)[:, 1]

    acc = accuracy_score(y_test_c, preds)
    f1 = f1_score(y_test_c, preds)
    roc = roc_auc_score(y_test_c, probas)

    print(f"{name} (Classification): Accuracy = {acc:.2f}, F1 = {f1:.2f}, ROC-AUC = {roc:.2f}")


In [None]:
# 9. Conclusion & Recommendations

# Example business interpretation, printed as one block of text.
# NOTE(review): the modelling cells above carry no saved output, so these
# statements are not backed by results visible in this notebook - confirm them
# against an executed run.
conclusion_lines = [
    "",
    "Claim severity varies significantly by vehicle age and region.",
    "Older vehicles show higher expected claim costs (SHAP + model support).",
    "Premiums should be increased for older cars and high-loss regions (e.g., Gauteng).",
    "Classification model can be used to offer risk-based pricing: ",
    "  Premium = P(Claim) * E[ClaimAmount] + Loadings",
    "",
]
print("\n".join(conclusion_lines))
