## 0, Load Dependencies

In [3]:
import numpy as np
import pandas as pd
import plotly.graph_objs as go


## 1, Generate data
I generated a synthetic dataset with 100 samples and two features, where the target variable was the product of the two features plus some noise

In [10]:
def generate_data(n_samples):
    X = np.random.normal(size=(n_samples, 2))
    y = X[:, 0] * X[:, 1] + np.random.normal(scale=0.5, size=n_samples)
    return X, y
X, y = generate_data(100)
[X,y]

[array([[-0.63738713,  1.18901653],
        [ 1.42050425, -0.57074629],
        [-0.83235557,  0.47141556],
        [-0.55222304,  0.63293182],
        [ 0.20292302, -1.51574411],
        [ 1.5475052 ,  1.79587767],
        [-0.61278869, -0.38770156],
        [ 0.28586539,  0.33445679],
        [ 0.65854427,  2.01020454],
        [-0.17694723, -0.79829724],
        [-1.37931923, -0.73093004],
        [-0.03312697,  1.79455786],
        [-0.5176113 ,  0.22378795],
        [-0.0164229 ,  1.18839327],
        [ 2.52693243, -0.53086877],
        [-0.48943944,  1.04416088],
        [ 0.68189149,  1.84670733],
        [ 0.58392819, -0.35929209],
        [ 0.59065483,  1.10870358],
        [ 0.82048218,  0.50727403],
        [ 1.06667469,  1.16929559],
        [ 1.38215899,  0.64870989],
        [-0.16711808,  0.14671369],
        [ 1.20650897, -0.81693567],
        [ 0.36867331, -0.39333881],
        [ 0.02874482,  1.27845186],
        [ 0.19109907,  0.04643655],
        [-1.35985614,  0.746

## 2, Modeling and structure the DT

In [5]:

class Node:
    def __init__(self, depth, max_depth):
        self.is_leaf = False
        self.value = None
        self.feature_index = None
        self.threshold = None
        self.left_child = None
        self.right_child = None
        self.depth = depth
        self.max_depth = max_depth
    
    def make_leaf(self, y):
        self.is_leaf = True
        self.value = np.mean(y)
    
    def split(self, X, y):
        n_samples, n_features = X.shape
        best_feature_index, best_threshold, best_mse = None, None, float("inf")
        for feature_index in range(n_features):
            for threshold in np.unique(X[:, feature_index]):
                left_indices = X[:, feature_index] < threshold
                right_indices = X[:, feature_index] >= threshold
                if np.sum(left_indices) == 0 or np.sum(right_indices) == 0:
                    continue
                left_y, right_y = y[left_indices], y[right_indices]
                mse = np.mean((left_y - np.mean(left_y)) ** 2) + np.mean((right_y - np.mean(right_y)) ** 2)
                if mse < best_mse:
                    best_feature_index, best_threshold, best_mse = feature_index, threshold, mse
        if best_feature_index is not None:
            self.feature_index = best_feature_index
            self.threshold = best_threshold
            self.left_child = Node(self.depth + 1, self.max_depth)
            self.right_child = Node(self.depth + 1, self.max_depth)
            left_indices = X[:, best_feature_index] < best_threshold
            right_indices = X[:, best_feature_index] >= best_threshold
            self.left_child.split(X[left_indices], y[left_indices])
            self.right_child.split(X[right_indices], y[right_indices])
        else:
            self.make_leaf(y)
    
    def predict(self, x):
        if self.is_leaf:
            return self.value
        if x[self.feature_index] < self.threshold:
            return self.left_child.predict(x)
        else:
            return self.right_child.predict(x)

class RegressionTree:
    def __init__(self, max_depth=3):
        self.root = Node(0, max_depth)
    
    def fit(self, X, y):
        self.root.split(X, y)
    
    def predict(self, X):
        return np.array([self.root.predict(x) for x in X])



**Regression tree** is a decision tree-based model that is used for regression problems. It works by recursively splitting the input space into smaller and smaller regions and predicting the mean of the target variable in each region. The splitting process is guided by the features of the dataset, and each split is chosen to minimize the variance of the target variable in the resulting regions.

## 3, Fitting the Model with the DT that has max depth of 3

In [6]:
model = RegressionTree(max_depth=3)
model.fit(X, y)

In [11]:
y_pred = model.predict(X)
y_pred

array([ 0.55200972, -0.91936471, -0.70600112,  0.42477908,  1.03651339,
       -2.38778228,  1.2113479 ,  0.33482422,  0.73871618,  0.14580008,
        1.19306347, -2.38778228,  0.2414448 , -0.42542122,  0.62133583,
        0.2414448 , -2.38778228,  0.64808065,  0.64808065, -1.06163724,
       -0.04288516, -0.91936471,  0.08117911, -0.91936471, -0.15806771,
       -0.42542122, -0.26834058,  1.19306347,  0.84805792, -0.58020947,
        0.75626373,  0.08117911, -0.36501166,  1.29660955,  0.34559269,
       -0.69455782, -0.19082853,  0.42477908, -0.91936471,  0.73871618,
       -1.88195792, -0.22157537,  0.48358366, -0.69455782, -1.50123484,
       -0.36501166,  1.93563305, -1.15342618,  0.08117911,  0.08117911,
       -1.50123484, -0.42542122, -0.15806771,  0.64808065, -0.36501166,
       -0.26834058, -0.42542122, -2.38778228,  0.73095326, -0.69455782,
       -0.35696109, -2.8034864 , -1.88195792, -2.48486597, -1.06163724,
       -0.70600112,  1.27343641, -0.91936471,  0.08117911, -2.80

## 4, Evaluate the Model

In [8]:
mse = np.mean((y - y_pred) ** 2)
print("Mean squared error:", mse)


Mean squared error: 0.0


In [9]:
scatter = go.Scatter(x=X[:, 0], y=X[:, 1], mode='markers', marker=dict(color=y_pred, colorscale='Viridis', showscale=True))
layout = go.Layout(title="Regression Tree", xaxis=dict(title="X1"), yaxis=dict(title="X2"))
fig = go.Figure(data=[scatter], layout=layout)
fig.show()

The scatter plot shows the input data points, with their colors indicating their predicted values according to the regression tree.