In [3]:
pip install pandas numpy scikit-learn xgboost shap matplotlib seaborn


Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install ipython ipywidgets


Note: you may need to restart the kernel to use updated packages.


In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from xgboost import XGBRegressor, XGBClassifier
import shap
import matplotlib.pyplot as plt
import seaborn as sns


In [9]:
df = pd.read_csv("../data/MachineLearningRating_v3.csv", on_bad_lines='skip')

# Drop rows with missing TotalClaims or TotalPremium
df = df.dropna(subset=["TotalClaims", "TotalPremium"])

# Fill NA for categorical vars
df.fillna(method='ffill', inplace=True)


  df = pd.read_csv("../data/MachineLearningRating_v3.csv", on_bad_lines='skip')
  df.fillna(method='ffill', inplace=True)


Feature Engineering

In [10]:
# Binary claim indicator
df['HasClaim'] = (df['TotalClaims'] > 0).astype(int)

# Margin (Target for another analysis)
df['Margin'] = df['TotalPremium'] - df['TotalClaims']


Encode Categorical Columns

In [15]:
categorical_cols = df.select_dtypes(include=['object']).columns
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)


MemoryError: Unable to allocate 260. MiB for an array with shape (35, 973382) and data type object

 Step 2: Claim Severity Model

Data Split

In [None]:
severity_df = df_encoded[df_encoded['HasClaim'] == 1]
X_severity = severity_df.drop(['TotalClaims', 'HasClaim'], axis=1)
y_severity = severity_df['TotalClaims']

X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(X_severity, y_severity, test_size=0.2, random_state=42)


Model Training & Evaluation

In [12]:
models_reg = {
    "LinearRegression": LinearRegression(),
    "RandomForest": RandomForestRegressor(n_estimators=100),
    "XGBoost": XGBRegressor()
}

for name, model in models_reg.items():
    model.fit(X_train_s, y_train_s)
    y_pred = model.predict(X_test_s)
    rmse = np.sqrt(mean_squared_error(y_test_s, y_pred))
    r2 = r2_score(y_test_s, y_pred)
    print(f"{name} - RMSE: {rmse:.2f}, R²: {r2:.2f}")


NameError: name 'X_train_s' is not defined

 Step 3: Claim Probability Model

In [None]:
X_class = df_encoded.drop(['TotalClaims', 'HasClaim'], axis=1)
y_class = df_encoded['HasClaim']

X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X_class, y_class, test_size=0.2, random_state=42)


Model Training & Evaluation

In [None]:
models_clf = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "RandomForestClassifier": RandomForestClassifier(n_estimators=100),
    "XGBoostClassifier": XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

for name, model in models_clf.items():
    model.fit(X_train_c, y_train_c)
    y_pred = model.predict(X_test_c)
    acc = accuracy_score(y_test_c, y_pred)
    prec = precision_score(y_test_c, y_pred)
    rec = recall_score(y_test_c, y_pred)
    f1 = f1_score(y_test_c, y_pred)
    print(f"{name} - Accuracy: {acc:.2f}, Precision: {prec:.2f}, Recall: {rec:.2f}, F1: {f1:.2f}")


 Step 4: SHAP Explainability 

In [None]:
explainer = shap.Explainer(models_reg["XGBoost"], X_test_s)
shap_values = explainer(X_test_s)
shap.summary_plot(shap_values, X_test_s)
