# Machine learning - test harnesses and first algo

In [1]:
import pandas as pd
import pylab
import seaborn as sns
import numpy as np
import datetime
import copy
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.tree import DecisionTreeRegressor

In [2]:
import utils
# Load the dataset through the project's helper module.
df = utils.get_data()
# Cell output: the length of the loaded dataset.
len(df)

348

In [3]:
# Preview the first rows to see which columns are available.
df.head()

Unnamed: 0,month,temp_2,temp_1,average,actual,forecast_noaa,forecast_acc,forecast_under,friend,week_Fri,week_Mon,week_Sat,week_Sun,week_Thurs,week_Tues,week_Wed,unix_days
0,1,45,45,45.6,45,43,50,44,29,1,0,0,0,0,0,0,735963
1,1,44,45,45.7,44,41,50,44,61,0,0,1,0,0,0,0,735964
2,1,45,44,45.8,41,43,46,47,56,0,0,0,1,0,0,0,735965
3,1,44,41,45.9,40,44,48,46,53,0,1,0,0,0,0,0,735966
4,1,41,40,46.0,44,46,46,46,41,0,0,0,0,0,1,0,735967


### Build test harnesses

Build a test harness for the toy problem of predicting the `actual` column. Cross-validation and regression scoring are wrapped in a single function in `utils`: it shuffles X and y, then performs cross-validation with negative mean absolute error scoring.

In [4]:
# Columns used as model inputs. Order matters: "average" sits at index 3,
# which the UseAverageOnly baseline reads via X[:, 3].
feature_columns = [
    "month",
    "temp_2",
    "temp_1",
    "average",
    "week_Fri",
    "week_Mon",
    "week_Sat",
    "week_Sun",
    "week_Thurs",
    "week_Tues",
    "week_Wed",
    "unix_days",
]

# Feature matrix and target vector as plain NumPy arrays.
X = np.array(df[feature_columns])
y = np.array(df["actual"])

# Sanity-check the dimensions of both arrays.
for arr in (X, y):
    print(arr.shape)

(348, 12)
(348,)


In [5]:
class UseAverageOnly(BaseEstimator, TransformerMixin):
    """Baseline estimator that "predicts" by echoing one feature column.

    Nothing is learned from the data: ``predict`` simply returns the values
    already present in column ``average_col`` of ``X``.

    Parameters
    ----------
    average_col : int, default=3
        Positional index (0-based) of the column in ``X`` that is returned
        as the prediction. The default of 3 is the fourth column.

    NOTE(review): this class only implements ``predict`` (no ``transform``),
    so ``TransformerMixin`` looks unnecessary here — confirm before changing
    the base classes.
    """

    def __init__(self, average_col=3):
        # Stored under the same name as the argument so that BaseEstimator's
        # get_params / set_params (and therefore cloning) can see it.
        self.average_col = average_col

    def fit(self, X, y=None):
        """Do nothing and return ``self``; there is nothing to train."""
        return self

    def predict(self, X, y=None):
        """Return a copy of column ``average_col`` of ``X`` as the prediction.

        ``X`` must be 2-D; it is passed through ``np.asarray`` first, so any
        array-like input is accepted. ``y`` is ignored.
        """
        # Copy so that callers cannot mutate X through the returned slice.
        return np.asarray(X)[:, self.average_col].copy()
        

In [6]:
# Score the "echo the average column" baseline with the shared harness.
# `utils` is already imported by the data-loading cell, so it is not
# re-imported here.
clf = UseAverageOnly()
scores = utils.get_cross_val_scores(clf, X, y, 5)
# Show the individual scores together with their mean.
print((scores, np.mean(scores)))

(array([-4.94285714, -5.81857143, -4.76      , -3.84637681, -5.28695652]), -4.930952380952382)


The baseline has a mean absolute error of around 5 degrees Fahrenheit.

### Decision Tree
Start with the simplest tree-based model, a single decision tree, and use it for regression.

In [7]:
# Single decision tree with out-of-the-box hyperparameters.
decision_tree_regr = DecisionTreeRegressor()
scores = utils.get_cross_val_scores(decision_tree_regr, X, y, 5)

# Report the individual scores alongside their mean.
mean_score = np.mean(scores)
print((scores, mean_score))

(array([-4.45714286, -4.78571429, -5.34285714, -4.56521739, -4.69565217]), -4.769316770186335)


The out-of-the-box hyperparameters do not fare much better than the baseline above. Reducing the tree complexity (which presumably reduces overfitting) seems to give mean absolute errors of around 4 degrees Fahrenheit. Tweaking either `min_samples_leaf` or `max_leaf_nodes` has a similar effect, although `min_samples_leaf` seems to work a little better.

In [8]:
# Limit tree complexity via the min_samples_leaf hyperparameter.
decision_tree_regr = DecisionTreeRegressor(min_samples_leaf=15)
scores = utils.get_cross_val_scores(decision_tree_regr, X, y, 5)

# Report the individual scores alongside their mean.
mean_score = np.mean(scores)
print((scores, mean_score))

(array([-4.17103725, -3.70105337, -4.4557847 , -3.56995331, -3.82159773]), -3.9438852703839187)


In [9]:
# Limit tree complexity via the max_leaf_nodes hyperparameter.
decision_tree_regr = DecisionTreeRegressor(max_leaf_nodes=14)
scores = utils.get_cross_val_scores(decision_tree_regr, X, y, 5)

# Report the individual scores alongside their mean.
mean_score = np.mean(scores)
print((scores, mean_score))

(array([-3.60017311, -3.98603489, -4.12141484, -4.21285926, -4.57421195]), -4.098938809000941)


A similar effect on the mean-abs-error can also be achieved by switching on minimal cost-complexity pruning.

In [10]:
# Enable minimal cost-complexity pruning via ccp_alpha.
decision_tree_regr = DecisionTreeRegressor(ccp_alpha=0.4)
scores = utils.get_cross_val_scores(decision_tree_regr, X, y, 5)

# Report the individual scores alongside their mean.
mean_score = np.mean(scores)
print((scores, mean_score))

(array([-4.25716457, -3.86460855, -3.85724279, -4.34808314, -4.20478356]), -4.106376521335809)
