# Data preparation

In [1]:
!pip install -Uqq opendatasets

In [2]:
import opendatasets as od
import pandas as pd
from pathlib import Path
import numpy as np
from fastai.tabular.all import *
from sklearn.tree import DecisionTreeRegressor

In [6]:
# Download the Kaggle "bluebook-for-bulldozers" competition data. As the output
# below shows, opendatasets prompts for Kaggle credentials and extracts the
# archive into ./bluebook-for-bulldozers.
od.download("https://www.kaggle.com/c/bluebook-for-bulldozers/data")

Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username: nonscop
Your Kaggle Key: ··········
Downloading bluebook-for-bulldozers.zip to ./bluebook-for-bulldozers


100%|██████████| 48.4M/48.4M [00:02<00:00, 22.1MB/s]



Extracting archive ./bluebook-for-bulldozers/bluebook-for-bulldozers.zip to ./bluebook-for-bulldozers


In [13]:
# Folder the downloaded archive was extracted into.
path = Path("bluebook-for-bulldozers")

In [17]:
# List the extracted files to see which CSVs are available.
for item in path.iterdir():
    print(item)

bluebook-for-bulldozers/TrainAndValid.csv
bluebook-for-bulldozers/Train.7z
bluebook-for-bulldozers/Machine_Appendix.csv
bluebook-for-bulldozers/Data Dictionary.xlsx
bluebook-for-bulldozers/Train.zip
bluebook-for-bulldozers/median_benchmark.csv
bluebook-for-bulldozers/random_forest_benchmark_test.csv
bluebook-for-bulldozers/ValidSolution.csv
bluebook-for-bulldozers/Test.csv
bluebook-for-bulldozers/TrainAndValid.zip
bluebook-for-bulldozers/Valid.csv
bluebook-for-bulldozers/Valid.zip
bluebook-for-bulldozers/TrainAndValid.7z
bluebook-for-bulldozers/Valid.7z


In [21]:
# Load the combined training + validation file. low_memory=False makes pandas
# parse the file in one go instead of in chunks, avoiding mixed-dtype inference.
df = pd.read_csv(path/'TrainAndValid.csv', low_memory=False)
# Show the available columns.
df.columns

Index(['SalesID', 'SalePrice', 'MachineID', 'ModelID', 'datasource',
       'auctioneerID', 'YearMade', 'MachineHoursCurrentMeter', 'UsageBand',
       'saledate', 'fiModelDesc', 'fiBaseModel', 'fiSecondaryDesc',
       'fiModelSeries', 'fiModelDescriptor', 'ProductSize',
       'fiProductClassDesc', 'state', 'ProductGroup', 'ProductGroupDesc',
       'Drive_System', 'Enclosure', 'Forks', 'Pad_Type', 'Ride_Control',
       'Stick', 'Transmission', 'Turbocharged', 'Blade_Extension',
       'Blade_Width', 'Enclosure_Type', 'Engine_Horsepower', 'Hydraulics',
       'Pushblock', 'Ripper', 'Scarifier', 'Tip_Control', 'Tire_Size',
       'Coupler', 'Coupler_System', 'Grouser_Tracks', 'Hydraulics_Flow',
       'Track_Type', 'Undercarriage_Pad_Width', 'Stick_Length', 'Thumb',
       'Pattern_Changer', 'Grouser_Type', 'Backhoe_Mounting', 'Blade_Type',
       'Travel_Controls', 'Differential_Type', 'Steering_Controls'],
      dtype='object')

In [22]:
# Inspect the distinct ProductSize values before giving them an explicit order.
df['ProductSize'].unique()

array([nan, 'Medium', 'Small', 'Large / Medium', 'Mini', 'Large',
       'Compact'], dtype=object)

In [24]:
# ProductSize is ordinal: turn it into an ordered categorical whose levels
# follow the order listed in `sizes` (values not listed become missing).
sizes = 'Large','Large / Medium','Medium','Small','Mini','Compact'
df['ProductSize'] = df['ProductSize'].astype(
    pd.CategoricalDtype(categories=sizes, ordered=True)
)

In [27]:
# The target is modelled on a log scale: SalePrice is replaced by log(SalePrice).
# NOTE: the column is overwritten in place, so this cell is not idempotent --
# running it a second time takes the log of the log. Reload `df` before re-running.
dep_var = 'SalePrice'
df[dep_var] = np.log(df[dep_var])

In [29]:
# Expand 'saledate' into date-part feature columns (saleYear, saleMonth, ...,
# saleElapsed -- the full list is printed by a later cell).
df = add_datepart(df, 'saledate')

  df[date_field] = pd.to_datetime(df[date_field], infer_datetime_format=True)


In [30]:
# Load the test set and give it the same date-part expansion as the training frame.
df_test = pd.read_csv(path/'Test.csv', low_memory=False)
df_test = add_datepart(df_test, 'saledate')
# Show which date-derived columns now exist in the training frame.
' '.join(o for o in df.columns if o.startswith('sale'))

  df[date_field] = pd.to_datetime(df[date_field], infer_datetime_format=True)


'saleYear saleMonth saleWeek saleDay saleDayofweek saleDayofyear saleIs_month_end saleIs_month_start saleIs_quarter_end saleIs_quarter_start saleIs_year_end saleIs_year_start saleElapsed'

In [31]:
# Preprocessing steps handed to TabularPandas in the next cell.
procs = [Categorify, FillMissing]

In [32]:
# Time-based split: a row is a training row when it was sold before 2011 or in
# a month before October; the remaining rows (saleYear >= 2011 and
# saleMonth >= 10) form the validation set.
cond = (df.saleYear<2011) | (df.saleMonth<10)
# Positional row numbers of the two subsets.
train_idx = np.where( cond)[0]
valid_idx = np.where(~cond)[0]

splits = (list(train_idx),list(valid_idx))
# Separate continuous from categorical columns; the second argument (1) is
# presumably fastai's `max_card` cardinality cut-off -- confirm against the docs.
cont,cat = cont_cat_split(df, 1, dep_var=dep_var)
# Build the preprocessed tabular dataset with the split defined above.
to = TabularPandas(df, procs, cat, cont, y_names=dep_var, splits=splits)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  to[n].fillna(self.na_dict[n], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  to[n].fillna(self.na_dict[n], inplace=True)


In [45]:
# Columns that cont_cat_split classified as continuous.
cont

['SalesID',
 'MachineID',
 'ModelID',
 'datasource',
 'auctioneerID',
 'YearMade',
 'MachineHoursCurrentMeter',
 'saleYear',
 'saleMonth',
 'saleWeek',
 'saleDay',
 'saleDayofweek',
 'saleDayofyear',
 'saleElapsed']

# Regression decision tree implementation from scratch

In [119]:
# No visible cell ever writes 'to.pkl', so loading it unconditionally fails on a
# fresh checkout. Persist the TabularPandas object built above the first time
# through, then load it; an existing pickle is reused as before.
to_path = Path('to.pkl')
if not to_path.exists():
    save_pickle(to_path, to)
to = load_pickle(to_path)

In [120]:
# Feature frames and targets of the training and validation splits.
xs,y = to.train.xs,to.train.y
valid_xs,valid_y = to.valid.xs,to.valid.y

In [122]:
def r_mse(pred, y):
    """Root mean squared error between `pred` and `y`, rounded to 6 decimals.

    Both arguments must support element-wise subtraction and expose `.mean()`
    (e.g. NumPy arrays or pandas Series).
    """
    squared_errors = (pred - y) ** 2
    rmse = math.sqrt(squared_errors.mean())
    return round(rmse, 6)
def m_rmse(m, xs, y):
    """RMSE (via `r_mse`) of model `m`'s predictions for `xs` against `y`."""
    predictions = m.predict(xs)
    return r_mse(predictions, y)

Results of scikit-learn's `DecisionTreeRegressor`, for comparison:

In [169]:
# Baseline: scikit-learn tree fitted on the first 1000 training rows only,
# then scored (RMSE) on the full training set and on the validation set.
m = DecisionTreeRegressor(min_samples_leaf=25, max_leaf_nodes=16)
m.fit(to.train.xs.head(1000), to.train.y.head(1000))
m_rmse(m, xs, y), m_rmse(m, valid_xs, valid_y)

(0.547378, 0.513194)

In [170]:
class Node:
    """One node of a regression tree.

    A node holds the feature rows that reached it together with the matching
    targets. It is a leaf (predicting the mean target) until `split` finds a
    threshold split with positive variance reduction.
    """

    def __init__(self, columns, xs, y, min_samples_leaf):
        """Create a leaf node.

        Args:
            columns: column labels of the training data (stored only).
            xs: DataFrame with the feature rows belonging to this node.
            y: targets; indexed with `xs.index` to select this node's rows, so
                it may cover more rows than `xs`.
            min_samples_leaf: minimum number of rows each child must receive
                for a split to be accepted.
        """
        self.columns = columns
        self.xs = xs
        self.y = y[xs.index]
        self.avg = self.y.mean()  # value predicted while this node is a leaf
        self.min_samples_leaf = min_samples_leaf
        self.split_col = None     # column used by the split; None marks a leaf
        self.threshold = None     # rows with value <= threshold go left
        self.childl = None
        self.childr = None

    def gain(self, xl, xr, column):
        """Variance reduction obtained by splitting this node into `xl` / `xr`.

        Args:
            xl, xr: pandas objects holding the rows sent left / right; only
                their length and `.index` are used.
            column: unused, kept for interface compatibility.

        Returns:
            Variance of this node's targets minus the size-weighted variances
            of the two children's targets.
        """
        pl = len(xl) / len(self.xs)
        pr = len(xr) / len(self.xs)
        return np.var(self.y) - pr * np.var(self.y[xr.index]) - pl * np.var(self.y[xl.index])

    def split(self):
        """Split this node on the (column, threshold) pair with the highest gain.

        Every distinct value of every column is tried as a threshold. A
        candidate is accepted only if both children get at least
        `min_samples_leaf` rows and its gain is strictly positive and strictly
        better than the best seen so far (so the first best candidate wins ties).

        Returns:
            `[left_child, right_child]` on success. When no acceptable split
            exists the node stays a leaf and the four-element sentinel
            `[0, 0, 0, 0]` is returned; callers in this notebook detect failure
            through the list's length, so the sentinel is kept as is.
        """
        min_rows = self.min_samples_leaf
        best_score = 0
        best = None  # (column, threshold) of the best accepted candidate

        for column in self.xs.columns:
            values = self.xs[column]
            for thresh in values.unique():
                goes_left = values <= thresh
                goes_right = values > thresh
                # Reject undersized children before paying for the variance
                # computations (this also skips empty partitions).
                if goes_left.sum() < min_rows or goes_right.sum() < min_rows:
                    continue
                # gain() only needs row labels and counts, so pass slices of
                # this one column instead of copying the whole DataFrame twice
                # per candidate.
                score = self.gain(values[goes_left], values[goes_right], column)
                if score > best_score:
                    best_score = score
                    best = (column, thresh)

        if best is None:
            return [0,0,0,0]

        self.split_col, self.threshold = best

        # Materialise the two row subsets once, for the winning split only.
        values = self.xs[self.split_col]
        xl = self.xs[values <= self.threshold]
        xr = self.xs[values > self.threshold]
        self.childl = Node(self.columns, xl, self.y, self.min_samples_leaf)
        self.childr = Node(self.columns, xr, self.y, self.min_samples_leaf)
        return [self.childl, self.childr]

    def predict_row(self, x):
        """Predict the target for a single row `x` (indexable by column label).

        Leaves return their mean target; inner nodes forward the row to the
        left child when `x[split_col] <= threshold` and to the right child
        otherwise.
        """
        if self.split_col is None:
            return self.avg
        if x[self.split_col] <= self.threshold:
            return self.childl.predict_row(x)
        return self.childr.predict_row(x)



In [171]:
class RegressionTree:
    """Regression tree built from `Node` objects, grown breadth-first.

    Args:
        min_samples_leaf: minimum number of rows every leaf must keep.
        max_leaf_nodes: growth stops once the tree has this many leaves.
    """

    def __init__(self, min_samples_leaf=1, max_leaf_nodes=float('inf')):
        self.min_samples_leaf = min_samples_leaf
        self.max_leaf_nodes = max_leaf_nodes
        self.root = None
        self.num_of_leaves = 1

    def fit(self, xs, y):
        """Grow the tree on features `xs` (DataFrame) and targets `y`.

        Leaves are visited in breadth-first order. A leaf that cannot be split
        simply stays a leaf and growth continues with the remaining ones, until
        either `max_leaf_nodes` is reached or no leaf is left to try.

        Returns:
            self, so calls can be chained.
        """
        self.root = Node(xs.columns, xs, y, self.min_samples_leaf)
        self.num_of_leaves = 1  # reset so that refitting starts from scratch
        pending = [self.root]

        while pending and self.num_of_leaves < self.max_leaf_nodes:
            leaf = pending.pop(0)
            children = leaf.split()
            # Node.split returns two children on success and a longer sentinel
            # list when no valid split exists; only a real split adds a leaf.
            if len(children) != 2:
                continue
            self.num_of_leaves += 1
            pending.extend(children)

        return self

    def predict(self, xs):
        """Predict targets for every row of DataFrame `xs`.

        Rows are routed through the tree in bulk (one vectorised comparison per
        inner node) using the same rule as `Node.predict_row`: left when
        `value <= threshold`, right otherwise. `fit` must have been called.

        Returns:
            1-D float NumPy array with one prediction per row, in row order.
        """
        preds = np.empty(len(xs), dtype=float)
        todo = [(self.root, np.arange(len(xs)))]  # (node, positional row numbers)

        while todo:
            node, rows = todo.pop()
            if node.split_col is None:
                preds[rows] = node.avg
                continue
            goes_left = xs[node.split_col].to_numpy()[rows] <= node.threshold
            todo.append((node.childl, rows[goes_left]))
            todo.append((node.childr, rows[~goes_left]))

        return preds


In [174]:
# Same settings and the same 1000-row training subset as the scikit-learn
# baseline, scored (RMSE) on the full training set and on the validation set.
m = RegressionTree(min_samples_leaf=25, max_leaf_nodes=16)
m.fit(to.train.xs.head(1000), to.train.y.head(1000))
m_rmse(m, xs, y), m_rmse(m, valid_xs, valid_y)

(0.55087, 0.546175)

A reasonably good result. The main drawback of my tree is how slowly it trains and predicts, which makes it practically useless for larger datasets. At the time of writing, I don't know how to make it more efficient in Python.