In [1]:
import numpy as np

In [159]:
def mse(y1, y2):
    return np.sum((y1 - y2)**2)

class Node:
    def __init__(self, feature=None, threshold=None):
        self.feature = feature
        self.threshold = threshold
        self.left = None
        self.right = None
        self.answer = None
        
    def fit(self, data, max_height):
        if data.shape[0] == 1 or max_height == 0:
            self.answer = np.mean(data[:,-1])
            return
        best_feature = 0
        best_thrs = 0
        best_sep = 0
        best_error = None
        for feature in xrange(data.shape[1]):
            data = data[data[:,feature].argsort()]
            for sep in xrange(1, data.shape[0]):
                left = data[:sep]
                right = data[sep:]
                means = [np.mean(part) for part in [left[:,-1], right[:,-1]]]
                error = mse(left, means[0]) + mse(right, means[1])
                if best_error is None or error < best_error:
                    best_error = error
                    best_feature = feature
                    best_thrs = (data[sep][-1] + data[sep - 1][-1]) / 2
                    best_sep = sep
        self.feature = best_feature
        self.threshold = best_thrs
        data = data[data[:,best_feature].argsort()]
        left = data[:best_sep]
        right = data[best_sep:]
        self.left = Node()
        self.left.fit(left, max_height - 1)
        self.right = Node()
        self.right.fit(right, max_height - 1)
    
    def predict(self, x):
        if self.answer:
            return self.answer
        if x[self.feature] >= self.threshold:
            return self.right.predict(x)
        else:
            return self.left.predict(x)
    
    def __str__(self):
        if self.answer:
            return "={}".format(self.answer)
        else:
            return "<{}=={} {} {}>".format(self.feature, self.threshold, self.left, self.right)
                
            
        
class DecisionTree:
    def __init__(self, max_height=100):
        self.max_height = max_height
        self.root = None
    
    def fit(self, x, y):
        data = np.concatenate([x, y.reshape((-1, 1))], axis=1)
        self.root = Node()
        self.root.fit(data, self.max_height)
    
    def predict(self, x):
        ans = [
            self.root.predict(row) for row in x
        ]
        return ans
    
    def __str__(self):
        return str(self.root)
        
    

In [160]:
import sklearn.datasets
boston = sklearn.datasets.load_boston()

In [161]:
from sklearn.model_selection import train_test_split

In [162]:
x_train, x_test, y_train, y_test = train_test_split(boston.data, boston.target)

In [163]:
tree = DecisionTree(3)

In [164]:
tree.fit(x_train, y_train)

In [165]:
from sklearn.metrics import mean_squared_error
mean_squared_error(tree.predict(x_test), y_test)

67.978582677165363

In [173]:
tree = DecisionTree(3)
tree.fit(x_train, y_train)
mean_squared_error(tree.predict(x_test), y_test)

67.978582677165363