In [1]:
import pandas as pd
import numpy as np

Create `ClaimFrequency` (number of claims divided by exposure) to get the expected number of claims per year.

In [2]:
# Load the training data once and derive the target: claims per unit of exposure.
df = pd.read_csv('claims_train.csv')
df['ClaimFrequency'] = df['ClaimNb'] / df['Exposure']


Remove outliers by dropping rows with an exposure below 0.01. Without this filter we would see `ClaimFrequency` values as high as 732.

In [3]:
# Keep only rows with Exposure >= 0.01. The explicit .copy() makes the result an
# independent frame, so the column assignments in later cells do not write into
# a slice of the original frame (which raises SettingWithCopyWarning).
df = df[df['Exposure'] >= 0.01].copy()

Cap `ClaimFrequency` at its 99.5th percentile so we avoid extreme values such as a claim frequency of 366 per year.

In [4]:
# Clip the target at its 99.5th percentile; values above the cap are replaced by the cap.
claim_freq = df['ClaimFrequency']
cap = claim_freq.quantile(0.995)
df['ClaimFrequency'] = claim_freq.clip(upper=cap)
df.head()

Unnamed: 0,IDpol,ClaimNb,Exposure,Area,VehPower,VehAge,DrivAge,BonusMalus,VehBrand,VehGas,Density,Region,ClaimFrequency
0,2122523.0,0,0.43,D,7,18,36,95,B1,Regular,1054,R24,0.0
1,3173420.0,0,0.1,D,7,17,80,95,B2,Regular,598,R25,0.0
2,1188619.0,0,0.33,E,7,3,36,76,B6,Regular,4172,R82,0.0
3,31400.0,0,0.56,A,5,4,73,52,B13,Diesel,15,R24,0.0
4,3138755.0,0,0.27,E,8,0,37,50,B11,Diesel,3021,R53,0.0


In [5]:
# Feature columns: numeric ones are used as-is, categorical ones are one-hot encoded.
num_cols = ['VehPower', 'VehAge', 'DrivAge', 'BonusMalus', 'Density']
cat_cols = ['Area']

# Keep every dummy level (drop_first=False).
df_dummies = pd.get_dummies(df[cat_cols], drop_first=False)

# Request an explicit float matrix: mixing numeric columns with boolean dummy
# columns would otherwise make the combined array fall back to object dtype,
# which makes every NumPy comparison in the tree go through Python objects.
X_train = pd.concat([df[num_cols], df_dummies], axis=1).to_numpy(dtype=float)
y_train = df['ClaimFrequency'].to_numpy()


class decisionTree:
    """Regression tree grown greedily on the weighted mean squared error of the children.

    Each node is a plain dict. Every node stores ``'value'`` (mean target of the
    samples that reached it) and ``'is_leaf'``. Internal nodes additionally store
    ``'feature_index'``, ``'threshold'``, ``'left'`` and ``'right'``; samples with
    ``x[feature_index] <= threshold`` go to ``'left'``.

    Parameters
    ----------
    min_samples_leaf : int
        A candidate split is rejected if either child would hold fewer samples.
    min_samples_split : int
        A node holding fewer samples than this becomes a leaf.
    max_depth : int
        Nodes at this depth (root is depth 0) become leaves.
    """

    def __init__(self, min_samples_leaf=100, min_samples_split=20, max_depth=5):
        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf
        self.min_samples_split = min_samples_split
        self.tree = None  # root node dict; populated by fit()

    def fit(self, X, y):
        """Grow the tree on ``X`` (n_samples, n_features) and targets ``y``.

        Inputs are converted to float arrays so that object-dtype or mixed
        bool/int matrices are handled with native NumPy comparisons.

        Returns
        -------
        decisionTree
            ``self``, so calls can be chained.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, is empty, or its row count differs from ``len(y)``.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)
        if X.ndim != 2:
            raise ValueError(f"X must be 2-dimensional, got {X.ndim} dimension(s)")
        if X.shape[0] == 0:
            raise ValueError("cannot fit a tree on zero samples")
        if y.shape[:1] != X.shape[:1]:
            raise ValueError(
                f"X and y disagree on the number of samples: {X.shape[0]} vs {y.shape[:1]}"
            )
        self.tree = self.buildTree(X, y, 0)
        return self

    def predict(self, X):
        """Return one prediction per row of ``X`` as a NumPy array.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.tree is None:
            raise RuntimeError("decisionTree.predict() called before fit()")
        X = np.asarray(X, dtype=float)
        return np.array([self.predict_row(row, self.tree) for row in X])

    def mse(self, y):
        """Mean squared deviation of ``y`` from its own mean."""
        return ((y - y.mean()) ** 2).mean()

    def _best_split(self, X, y):
        """Exhaustively search for the (feature, threshold) with the lowest weighted child MSE.

        Candidate thresholds are the midpoints between consecutive distinct
        values of each feature. Returns ``(None, None)`` when no candidate
        satisfies ``min_samples_leaf`` on both sides.
        """
        n_samples, n_features = X.shape
        best_loss = np.inf
        best_feat = None
        best_thresh = None

        for feat_idx in range(n_features):
            values = X[:, feat_idx]
            unique_vals = np.unique(values)
            if unique_vals.size == 1:
                # Constant feature: nothing to split on.
                continue

            thresholds = (unique_vals[:-1] + unique_vals[1:]) / 2
            for t in thresholds:
                left_mask = values <= t
                right_mask = ~left_mask

                n_left = left_mask.sum()
                n_right = right_mask.sum()
                if n_left < self.min_samples_leaf or n_right < self.min_samples_leaf:
                    continue

                # Child MSEs weighted by child size.
                loss = (
                    n_left * self.mse(y[left_mask]) + n_right * self.mse(y[right_mask])
                ) / n_samples

                # Strict '<' keeps the first-found split on ties.
                if loss < best_loss:
                    best_loss = loss
                    best_thresh = t
                    best_feat = feat_idx

        return best_feat, best_thresh

    def buildTree(self, X, y, depth):
        """Recursively build and return the node dict for the samples ``(X, y)`` at ``depth``."""
        n_samples = X.shape[0]
        node = {}
        node["value"] = float(y.mean())

        # Stopping rules: depth limit, too few samples, or a pure node.
        if (depth >= self.max_depth or
                n_samples < self.min_samples_split or
                np.unique(y).size == 1):
            node['is_leaf'] = True
            return node

        best_feat, best_thresh = self._best_split(X, y)
        if best_feat is None:
            # No admissible split exists.
            node['is_leaf'] = True
            return node

        node['is_leaf'] = False
        node['feature_index'] = best_feat
        node['threshold'] = best_thresh

        left_mask = X[:, best_feat] <= best_thresh
        right_mask = ~left_mask

        node['left'] = self.buildTree(X[left_mask], y[left_mask], depth + 1)
        node['right'] = self.buildTree(X[right_mask], y[right_mask], depth + 1)

        return node

    def predict_row(self, row, node):
        """Walk from ``node`` down to a leaf for a single feature vector and return its value."""
        while not node['is_leaf']:
            feat_idx = node['feature_index']
            thresh = node['threshold']
            if row[feat_idx] <= thresh:
                node = node['left']
            else:
                node = node['right']
        return node['value']


def rmse(y_true, y_pred):
    """Return the root-mean-squared error between two array-likes."""
    residuals = np.asarray(y_true) - np.asarray(y_pred)
    return np.sqrt(np.mean(residuals ** 2))

# Hyper-parameters for this run, gathered in one place.
tree_params = {
    'max_depth': 12,
    'min_samples_split': 200,
    'min_samples_leaf': 10,
}
tree = decisionTree(**tree_params)
tree.fit(X_train, y_train)

# Scored on the same rows the tree was fitted on, so this is in-sample error.
y_train_pred = tree.predict(X_train)
train_rmse = rmse(y_train, y_train_pred)
print('Training RMSE:', train_rmse)


Training RMSE: 0.819263848833977


RMSE by area

In [28]:
# Predictions for the training rows; also used by the density-quartile breakdown.
y_pred = tree.predict(X_train)

# rmse() is defined once, next to the model; it is not re-defined here.
areas = df['Area'].unique()

print("RMSE by Area:")
for a in sorted(areas):
    # Convert the boolean Series to a plain array so that indexing the NumPy
    # arrays below is explicitly positional rather than relying on implicit coercion.
    mask = (df['Area'] == a).to_numpy()
    area_rmse = rmse(y_train[mask], y_pred[mask])
    print(f"Area {a}: {area_rmse:.4f}")

RMSE by Area:
Area A: 0.7245
Area B: 0.7704
Area C: 0.7922
Area D: 0.8614
Area E: 0.8994
Area F: 1.0782


RMSE by density quartile

In [29]:
# Bucket rows into four equal-frequency Density bins, then report RMSE per bin.
quartile_labels = ['Q1', 'Q2', 'Q3', 'Q4']
df['DensityQuartile'] = pd.qcut(df['Density'], 4, labels=quartile_labels)

print("\nRMSE by Density Quartile:")
for q in quartile_labels:
    mask = df['DensityQuartile'] == q
    quartile_rmse = rmse(y_train[mask], y_pred[mask])
    print(f"{q}: {quartile_rmse:.4f}")




RMSE by Density Quartile:
Q1: 0.7423
Q2: 0.7963
Q3: 0.8304
Q4: 0.9297
