In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rc
import matplotlib
import numpy as np
from scipy.optimize import minimize_scalar
from sklearn.metrics import mean_squared_error, mean_absolute_error
#rc('text', usetex=True)
# Plot styling for the whole notebook: Computer Modern ('cm') glyphs for
# math text with a serif roman face, and size-13 tick labels on both axes.
matplotlib.rcParams.update({
    'mathtext.fontset': 'cm',
    'mathtext.rm': 'serif',
    'xtick.labelsize': 13,
    'ytick.labelsize': 13,
})

# Named hex colors shared by the notebook's figures.
bookcolors = dict(
    crimson='#a50026',
    red='#d73027',
    redorange='#f46d43',
    orange='#fdae61',
    yellow='#fee090',
    sky='#e0f3f8',
    babyblue='#abd9e9',
    lightblue='#74add1',
    blue='#4575b4',
    purple='#313695',
)

def draw_vector(ax, x, y, dx, dy, yrange, head_halfwidth=4, head_frac=0.03):
    """
    Draw a thin red arrow from (x, y) to (x+dx, y+dy) on ``ax``.

    The arrow is three line segments: the shaft, plus two short strokes
    that meet at the tip and form the arrow head.

    Parameters
    ----------
    ax : object with a matplotlib-style ``plot(xs, ys, **kwargs)`` method
    x, y : coordinates of the arrow's tail
    dx, dy : displacement from tail to tip
    yrange : vertical extent used to scale the head; the head's height is
        ``yrange * head_frac``
    head_halfwidth : horizontal distance, in the same units as ``x``, from
        the tip to each outer end of the head (default 4, the original
        hard-coded value)
    head_frac : fraction of ``yrange`` used as the head height (default
        0.03, the original hard-coded value)

    Returns
    -------
    None. The segments are added to ``ax``.
    """
    tip_x = x + dx
    tip_y = y + dy
    # Compute a new value instead of `yrange *= ...`: the augmented form
    # operates in place and would modify an array passed by the caller.
    head_height = yrange * head_frac
    # The head's outer ends sit below the tip when dy >= 0 and above it
    # when dy < 0.
    offset = -head_height if dy >= 0 else head_height

    ax.plot([x, tip_x], [y, tip_y], c='r', linewidth=.8)
    ax.plot([tip_x - head_halfwidth, tip_x], [tip_y + offset, tip_y], c='r', linewidth=.8)
    ax.plot([tip_x, tip_x + head_halfwidth], [tip_y, tip_y + offset], c='r', linewidth=.8)
    
def data():
    """
    Return the five-row toy dataset as a DataFrame with columns ``sqfeet``
    and ``rent``, ordered by ascending ``sqfeet``. The original row labels
    are kept, so the index is not 0..4 after sorting.
    """
    apartments = pd.DataFrame({
        "sqfeet": [700, 950, 800, 900, 750],
        "rent": [1125, 1350, 1135, 1300, 1150],
    })
    return apartments.sort_values(by='sqfeet')


In [29]:
class Stub:
    """
    Two-leaf regression "stub" with a caller-supplied split point.

    Observations whose feature value is below ``split`` fall in the left
    leaf; all others (``>= split``) fall in the right leaf. A prediction is
    a summary statistic of the residuals in the leaf that the query point
    lands in: the mean (``l2predict``) or the median (``l1predict``).
    """

    def __init__(self, X, residual, split):
        """
        Partition ``residual`` into left/right leaves by comparing ``X``
        against ``split``.

        ``X`` and ``residual`` must support elementwise comparison and
        boolean-mask indexing (e.g. NumPy arrays or pandas Series).
        """
        self.X = X
        self.residual = residual
        self.split = split
        goes_left = self.X < self.split
        goes_right = self.X >= self.split
        self.left = self.residual[goes_left]
        self.right = self.residual[goes_right]

    def _leaf_value(self, x, summarize):
        """Summarize both leaves with ``summarize`` and return the one ``x`` falls in."""
        left_value = summarize(self.left)
        right_value = summarize(self.right)
        if x < self.split:
            return left_value
        return right_value

    def l2predict(self, x):
        """Mean residual of the leaf containing ``x``."""
        return self._leaf_value(x, np.mean)

    def l1predict(self, x):
        """Median residual of the leaf containing ``x``."""
        return self._leaf_value(x, np.median)

In [3]:
# Build the toy apartment dataset (sorted by sqfeet) and display it.
# A later cell passes this same frame to boost(), which adds columns to it.
df = data()
df

Unnamed: 0,sqfeet,rent
0,700,1125
4,750,1150
2,800,1135
3,900,1300
1,950,1350


## Train tree using sign vector

In [17]:
def boost(df, xcol, ycol, splits, eta, stages):
    """
    Run ``stages`` rounds of stub-based boosting, recording every
    intermediate quantity as a new column of ``df``.

    The initial model ``F0`` is the median of ``df[ycol]``. For each stage
    s = 1..stages the following columns are added (or overwritten):

    - ``res{s}``   : residual ``df[ycol] - F{s-1}``
    - ``sign{s}``  : sign of that residual
    - ``delta{s}`` : median residual of the stub leaf each row falls in,
                     using the split point ``splits[s]`` on ``df[xcol]``
    - ``F{s}``     : ``F{s-1} + eta * delta{s}``

    Parameters
    ----------
    df : pandas.DataFrame, modified in place
    xcol : name of the feature column
    ycol : name of the target column
    splits : sequence of split points indexed by stage number, so index 0
        is unused and the sequence needs at least ``stages + 1`` entries
    eta : learning rate applied to each stage's ``delta``
    stages : number of boosting stages to run

    Returns
    -------
    None. Results are written to ``df``; error metrics are not computed here.
    """
    df['F0'] = df[ycol].median()

    # Iterate over the `stages` argument, not a notebook-level global.
    for s in range(1, stages + 1):
        df[f'res{s}'] = df[ycol] - df[f'F{s-1}']
        df[f'sign{s}'] = np.sign(df[f'res{s}'])
        # Partition on the caller-specified feature column.
        t = Stub(df[xcol], df[f'res{s}'], splits[s])
        df[f'delta{s}'] = [t.l1predict(x) for x in df[xcol]]
        df[f'F{s}'] = df[f'F{s-1}'] + eta * df[f'delta{s}']


In [33]:
# Hyperparameters for the worked example.
# M: number of boosting stages; also sets how many F columns the metrics
# cell below evaluates (F0..FM).
M = 3
# eta: learning rate applied to each stage's delta.
eta = 0.7
# One split point per stage, indexed by stage number, so index 0 is a
# placeholder (stages start at 1).
splits = [None,850, 925, 725] # manually pick them
# boost() adds the per-stage columns to df in place; the frame is then displayed.
boost(df, 'sqfeet', 'rent', splits, eta, M)
df

Unnamed: 0,sqfeet,rent,F0,res1,sign1,delta1,F1,res2,sign2,delta2,F2,res3,sign3,delta3,F3
0,700,1125,1150.0,-25.0,-1.0,-15.0,1139.5,-14.5,-1.0,3.0,1141.6,-16.6,-1.0,-16.6,1129.98
4,750,1150,1150.0,0.0,0.0,-15.0,1139.5,10.5,1.0,3.0,1141.6,8.4,1.0,15.825,1152.6775
2,800,1135,1150.0,-15.0,-1.0,-15.0,1139.5,-4.5,-1.0,3.0,1141.6,-6.6,-1.0,15.825,1152.6775
3,900,1300,1150.0,150.0,1.0,175.0,1272.5,27.5,1.0,3.0,1274.6,25.4,1.0,15.825,1285.6775
1,950,1350,1150.0,200.0,1.0,175.0,1272.5,77.5,1.0,77.5,1326.75,23.25,1.0,15.825,1337.8275


In [35]:
# Mean squared / mean absolute error of every stage's prediction column
# F0..FM against the actual rent.
mse = [mean_squared_error(df['rent'], df[f'F{s}']) for s in range(M + 1)]
mae = [mean_absolute_error(df['rent'], df[f'F{s}']) for s in range(M + 1)]
print(mse, mae)

[12670.0, 1420.65, 315.08050000000037, 139.55343499999958] [78.0, 26.9, 16.05, 10.365999999999985]
