In [16]:
import pandas as pd
import numpy as np

# Load the raw claims table.
df = pd.read_csv('claims_train.csv')

# Keep only rows with Exposure >= 0.01 *before* dividing by it, and take an
# explicit copy: assigning columns onto a filtered slice is what triggers
# pandas' SettingWithCopyWarning (the write may land on a temporary).
df = df[df['Exposure'] >= 0.01].copy()

# Target: claims per unit of exposure, capped at its 99.5th percentile so a
# few extreme ratios do not dominate a squared-error fit.
df['ClaimFrequency'] = df['ClaimNb'] / df['Exposure']
cap = df['ClaimFrequency'].quantile(0.995)
df['ClaimFrequency'] = df['ClaimFrequency'].clip(upper=cap)

num_cols = ['VehPower', 'VehAge', 'DrivAge', 'BonusMalus', 'Density', 'Exposure']
cat_cols = ['Area']

# One-hot encode the categorical columns.  dtype=float keeps the indicators
# numeric; recent pandas versions default to bool, which would turn the
# concatenated matrix below into a slow object-dtype array.
df_dummies = pd.get_dummies(df[cat_cols], drop_first=False, dtype=float)

# Plain float arrays for the hand-written tree.
X_train = pd.concat([df[num_cols], df_dummies], axis=1).to_numpy(dtype=float)
y_train = df['ClaimFrequency'].to_numpy(dtype=float)

class decisionTree:
    """Single-split regression tree (decision stump) fitted by minimising MSE.

    ``fit`` tries every feature and every midpoint between consecutive
    distinct values of that feature, and keeps the split with the lowest
    sample-weighted mean squared error.  The result is stored in
    ``self.tree`` as a nested dict:

    * leaf:  ``{"is_leaf": True, "value": <mean target>}``
    * split: ``{"value": ..., "is_leaf": False, "feature_index": ...,
      "threshold": ..., "left": <node>, "right": <node>}``

    Rows with ``feature <= threshold`` go to ``left``, all others to
    ``right``.
    """

    def __init__(self):
        # Root node dict; stays None until fit() has been called.
        self.tree = None

    def mse(self, y):
        """Return the mean squared deviation of ``y`` from its own mean."""
        return ((y - y.mean()) ** 2).mean()

    def _best_split(self, X, y):
        """Return ``(feature_index, threshold)`` of the lowest-loss split.

        Returns ``(None, None)`` when no feature has two distinct values that
        separate the rows into two non-empty groups.
        """
        n_samples = X.shape[0]
        best_loss = np.inf
        best_feat = None
        best_thresh = None

        for feat_idx, values in enumerate(X.T):
            unique_vals = np.unique(values)
            if unique_vals.size == 1:
                continue  # constant column: nothing to split on

            # Candidate thresholds are midpoints between neighbouring values.
            thresholds = (unique_vals[:-1] + unique_vals[1:]) / 2.0

            for t in thresholds:
                left_mask = values <= t
                n_left = int(left_mask.sum())
                n_right = n_samples - n_left

                # Can happen with NaNs, or when a midpoint rounds onto the
                # largest value of the column.
                if n_left == 0 or n_right == 0:
                    continue

                loss = (
                    n_left * self.mse(y[left_mask])
                    + n_right * self.mse(y[~left_mask])
                ) / n_samples

                # Strict '<' keeps the first best candidate on ties.
                if loss < best_loss:
                    best_loss = loss
                    best_feat = feat_idx
                    best_thresh = float(t)

        return best_feat, best_thresh

    def fit(self, X, y):
        """Fit the stump and store it in ``self.tree``.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Numeric feature matrix.  Converted to ``float`` so that mixed
            bool/number (object-dtype) input is handled with native numeric
            operations.
        y : array-like with ``n_samples`` entries along its first axis
            Regression targets.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, contains no rows, or ``y`` does not have one
            entry per row of ``X``.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)

        if X.ndim != 2:
            raise ValueError(
                f"X must be 2-D (n_samples, n_features), got {X.ndim}-D input"
            )
        n_samples = X.shape[0]
        if n_samples == 0:
            raise ValueError("cannot fit a tree on zero samples")
        if y.shape[:1] != (n_samples,):
            raise ValueError(
                f"X has {n_samples} rows but y has shape {y.shape}"
            )

        # "value" is inserted first so the dict keeps its original key order.
        root = {"value": float(y.mean())}

        best_feat, best_thresh = self._best_split(X, y)

        if best_feat is None:
            root["is_leaf"] = True
            self.tree = root
            return

        root["is_leaf"] = False
        root["feature_index"] = best_feat
        root["threshold"] = best_thresh

        left_mask = X[:, best_feat] <= best_thresh

        root["left"] = {
            "is_leaf": True,
            "value": float(y[left_mask].mean()),
        }
        root["right"] = {
            "is_leaf": True,
            "value": float(y[~left_mask].mean()),
        }

        self.tree = root

    def predict_row(self, row, node):
        """Return the prediction for one sample, starting at ``node``.

        Follows ``left``/``right`` links until a leaf is reached, so it is
        also correct when a child node is itself a split.
        """
        while not node["is_leaf"]:
            if row[node["feature_index"]] <= node["threshold"]:
                node = node["left"]
            else:
                node = node["right"]
        return node["value"]

    def _predict_node(self, X, node):
        """Vectorised prediction for all rows of ``X`` below ``node``."""
        if node["is_leaf"]:
            return np.full(X.shape[0], node["value"], dtype=float)

        goes_left = X[:, node["feature_index"]] <= node["threshold"]
        out = np.empty(X.shape[0], dtype=float)
        out[goes_left] = self._predict_node(X[goes_left], node["left"])
        out[~goes_left] = self._predict_node(X[~goes_left], node["right"])
        return out

    def predict(self, X):
        """Return a float array with one prediction per row of ``X``.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.tree is None:
            raise RuntimeError("decisionTree.predict() called before fit()")

        X = np.asarray(X, dtype=float)
        if len(X) == 0:
            return np.empty(0, dtype=float)
        return self._predict_node(X, self.tree)


def rmse(y_true, y_pred):
    """Return the root mean squared error between ``y_true`` and ``y_pred``.

    Both arguments are converted with ``np.asarray``; the squared element-wise
    differences are averaged over all elements and the square root of that
    average is returned.
    """
    residuals = np.asarray(y_true) - np.asarray(y_pred)
    mean_sq_error = np.mean(np.square(residuals))
    return np.sqrt(mean_sq_error)

# Fit the tree on the training matrix, then predict on those same rows.
tree = decisionTree()
tree.fit(X=X_train, y=y_train)
y_train_pred = tree.predict(X=X_train)

# Show the learned tree dict, followed by the in-sample error.
print(tree.tree)
train_rmse = rmse(y_true=y_train, y_pred=y_train_pred)
print("Training RMSE:", train_rmse)


{'value': 0.13825320390506804, 'is_leaf': False, 'feature_index': 1, 'threshold': 0.5, 'left': {'is_leaf': True, 'value': 0.4363146882778197}, 'right': {'is_leaf': True, 'value': 0.11089006911462748}}
Training RMSE: 0.8311512580908458
