# In [66]:
import pandas as pd
import numpy as np
import pprint 

# Load the training claims and build the regression target / feature matrix.
df = pd.read_csv('claims_train.csv')

# Drop very short exposures *before* forming the ratio so that zero or tiny
# exposures never produce inf / huge frequencies.  `.copy()` makes the filtered
# frame independent of the original, so the column assignments below do not
# trigger pandas' SettingWithCopyWarning.
df = df[df['Exposure'] >= 0.01].copy()
df['ClaimFrequency'] = df['ClaimNb'] / df['Exposure']

# Cap the target at its 99.5th percentile to limit the influence of outliers.
cap = df['ClaimFrequency'].quantile(0.995)
df['ClaimFrequency'] = df['ClaimFrequency'].clip(upper=cap)

# NOTE(review): 'Exposure' is used as a feature while also being the
# denominator of the target -- confirm this is intended.
num_cols = ['VehPower', 'VehAge', 'DrivAge', 'BonusMalus', 'Density', 'Exposure']
cat_cols = ['Area']

# One-hot encode the categorical columns as floats.  Recent pandas versions
# return boolean dummies by default, which would turn the concatenated matrix
# into a slow object-dtype array.
df_dummies = pd.get_dummies(df[cat_cols], drop_first=False, dtype=float)

X_train = pd.concat([df[num_cols], df_dummies], axis=1).to_numpy(dtype=float)
y_train = df['ClaimFrequency'].to_numpy(dtype=float)

class decisionTree:
    """Regression tree grown by greedy minimisation of weighted child MSE.

    The fitted tree is stored in ``self.tree`` as nested dictionaries.  Every
    node has ``'value'`` (mean target of the samples that reached it) and
    ``'is_leaf'``; internal nodes additionally carry ``'feature_index'``,
    ``'threshold'``, ``'left'`` and ``'right'``.  Samples with
    ``x[feature_index] <= threshold`` go left, all others go right.

    Parameters
    ----------
    min_samples_leaf : int
        A split is only considered if both children receive at least this
        many samples.
    min_samples_split : int
        Nodes with fewer samples than this are not split.
    max_depth : int
        Nodes at this depth (root is depth 0) are not split.
    """

    def __init__(self, min_samples_leaf=100, min_samples_split=20, max_depth=5):
        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf
        self.min_samples_split = min_samples_split
        self.tree = None

    def fit(self, X, y):
        """Grow the tree on ``X`` (n_samples, n_features) and targets ``y``.

        Inputs are converted to float arrays, so boolean / object-dtype
        matrices are accepted as long as every entry is numeric.

        Returns
        -------
        decisionTree
            ``self``, to allow call chaining.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, ``y`` is not 1-D (or a single column), the
            number of rows differs, or there are no samples.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)
        if y.ndim == 2 and y.shape[1] == 1:
            y = y[:, 0]
        if X.ndim != 2:
            raise ValueError(f"X must be 2-D, got {X.ndim}-D input")
        if y.ndim != 1:
            raise ValueError(f"y must be 1-D, got {y.ndim}-D input")
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                f"X and y have different numbers of samples: "
                f"{X.shape[0]} != {y.shape[0]}"
            )
        if y.size == 0:
            raise ValueError("cannot fit a tree on zero samples")
        self.tree = self.buildTree(X, y, 0)
        return self

    def predict(self, X):
        """Return an array with one predicted value per row of ``X``.

        Raises
        ------
        RuntimeError
            If called before :meth:`fit`.
        """
        if self.tree is None:
            raise RuntimeError("decisionTree.predict() called before fit()")
        X = np.asarray(X, dtype=float)
        return np.array([self.predict_row(row, self.tree) for row in X])

    def mse(self, y):
        """Return the mean squared deviation of ``y`` from its own mean."""
        return ((y - y.mean()) ** 2).mean()

    def _best_split(self, X, y):
        """Find the (feature, threshold) pair with the lowest weighted child MSE.

        For each feature the samples are sorted once and prefix sums of the
        targets are used to evaluate every candidate threshold (midpoints
        between consecutive distinct values) without rescanning the data.

        Returns ``(None, None)`` when no split satisfies ``min_samples_leaf``.
        """
        n_samples, n_features = X.shape
        best_loss = np.inf
        best_feat = None
        best_thresh = None

        # Sum of squared errors is invariant under shifting y; centring keeps
        # the "sum of squares minus squared sum" identity numerically tame.
        y_centered = y - y.mean()

        for feat_idx in range(n_features):
            order = np.argsort(X[:, feat_idx], kind="stable")
            x_sorted = X[order, feat_idx]

            # Index i is a boundary when x_sorted[i] and x_sorted[i + 1] differ.
            boundaries = np.flatnonzero(x_sorted[1:] != x_sorted[:-1])
            if boundaries.size == 0:
                continue

            upper = x_sorted[boundaries + 1]
            thresholds = (x_sorted[boundaries] + upper) / 2
            n_left = boundaries + 1
            n_right = n_samples - n_left

            # `thresholds < upper` guarantees that `x <= threshold` reproduces
            # exactly the sorted prefix; it also rejects NaN / +inf midpoints.
            valid = (
                (n_left >= self.min_samples_leaf)
                & (n_right >= self.min_samples_leaf)
                & (thresholds < upper)
            )
            if not valid.any():
                continue

            boundaries = boundaries[valid]
            thresholds = thresholds[valid]
            n_left = n_left[valid]
            n_right = n_right[valid]

            y_sorted = y_centered[order]
            prefix_sum = np.cumsum(y_sorted)
            prefix_sq = np.cumsum(y_sorted ** 2)

            sum_left = prefix_sum[boundaries]
            sq_left = prefix_sq[boundaries]
            sum_right = prefix_sum[-1] - sum_left
            sq_right = prefix_sq[-1] - sq_left

            # n_left * mse_left + n_right * mse_right, expressed via sums.
            sse = (sq_left - sum_left ** 2 / n_left) + (
                sq_right - sum_right ** 2 / n_right
            )
            loss = sse / n_samples

            k = int(np.argmin(loss))
            # Strict '<' keeps the earliest feature / lowest threshold on ties.
            if loss[k] < best_loss:
                best_loss = loss[k]
                best_feat = feat_idx
                best_thresh = float(thresholds[k])

        return best_feat, best_thresh

    def buildTree(self, X, y, depth):
        """Recursively build and return the node dict for samples ``(X, y)``.

        ``depth`` is the depth of the node being built (root is 0).  A node
        becomes a leaf when ``max_depth`` is reached, when it has fewer than
        ``min_samples_split`` samples, when all targets are equal, or when no
        admissible split exists.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)
        n_samples = X.shape[0]

        node = {}
        node["value"] = float(y.mean())

        if (depth >= self.max_depth or
                n_samples < self.min_samples_split or
                np.unique(y).size == 1):
            node['is_leaf'] = True
            return node

        best_feat, best_thresh = self._best_split(X, y)
        if best_feat is None:
            node['is_leaf'] = True
            return node

        node['is_leaf'] = False
        node['feature_index'] = best_feat
        node['threshold'] = best_thresh

        left_mask = X[:, best_feat] <= best_thresh
        right_mask = ~left_mask

        node['left'] = self.buildTree(X[left_mask], y[left_mask], depth + 1)
        node['right'] = self.buildTree(X[right_mask], y[right_mask], depth + 1)

        return node

    def predict_row(self, row, node):
        """Walk from ``node`` down to a leaf for one sample and return its value."""
        while not node['is_leaf']:
            feat_idx = node['feature_index']
            thresh = node['threshold']
            if row[feat_idx] <= thresh:
                node = node['left']
            else:
                node = node['right']
        return node['value']


def rmse(y_true, y_pred):
    """Return the root-mean-square error between ``y_true`` and ``y_pred``.

    Both arguments are converted with ``np.asarray``; the error is taken
    element-wise and averaged over all elements.
    """
    residuals = np.asarray(y_true) - np.asarray(y_pred)
    mean_squared_error = np.mean(np.square(residuals))
    return np.sqrt(mean_squared_error)

# Grow a tree of depth up to 12; a node needs at least 200 samples to be
# split and every child must keep at least 10 samples.
tree = decisionTree(min_samples_leaf=10, min_samples_split=200, max_depth=12)
tree.fit(X_train, y_train)

# Score the fitted tree on the same data it was trained on (in-sample error).
y_train_pred = tree.predict(X_train)
print("Training RMSE:", rmse(y_true=y_train, y_pred=y_train_pred))


# Output:
# Training RMSE: 0.8137979212454955
