# Decision Trees Parameters, Cross Validation and Hyperparameter search

In [1]:
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

In [2]:
# Load the Boston housing data and wrap it in two DataFrames: one holding the
# 13 input features and one holding the target column (named 'labels').
# NOTE: scikit-learn warns that this dataset has an ethical problem and
# discourages its use; see the alternatives suggested in that warning.
boston = load_boston()
features = pd.DataFrame(boston.data, columns=boston.feature_names)
labels = pd.DataFrame(boston.target, columns=['labels'])
# Show the first rows of both frames as a quick sanity check.
display(features.head(), labels.head())


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np

        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_ho

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


Unnamed: 0,labels
0,24.0
1,21.6
2,34.7
3,33.4
4,36.2


In [3]:
# Hold out 25% of the rows as a test set; the fixed seed keeps the split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.25,
    random_state=1,
)

In [4]:
# Baseline: a tree limited to a depth of 5, trained on the training split.
regr = DecisionTreeRegressor(max_depth=5)
# Keep whatever fit() returns under the name `model` as well.
model = regr.fit(X_train, y_train)

In [5]:
# Compare R2 on the data the tree was trained on against R2 on unseen data.
train_r2 = regr.score(X_train, y_train)
test_r2 = regr.score(X_test, y_test)
print("Train data R2 was: {:.2f} ".format(train_r2))
print("Test data R2 was: {:.2f}".format(test_r2))

Train data R2 was: 0.92 
Test data R2 was: 0.88


In [37]:
# A deep tree with no leaf/split regularisation that considers 6 features per split.
# random_state is fixed because max_features is smaller than the number of
# columns, so the features examined at each split are drawn at random: without
# a seed the scores below change on every run.
regr = DecisionTreeRegressor(max_depth=10,
                             criterion='squared_error',
                             min_samples_split=2,
                             min_samples_leaf=1,
                             max_features=6,
                             random_state=1)
regr.fit(X_train, y_train)
# A large gap between the two scores is the sign of overfitting.
print("Train data R2 was: {:.2f}".format(regr.score(X_train, y_train)))
print("Test data R2 was: {:.2f}".format(regr.score(X_test, y_test)))


Train data R2 was: 1.00
Test data R2 was: 0.70


In [38]:
# Same depth limit, but with stronger regularisation: at least 10 samples to
# split a node, at least 10 samples per leaf, and only 3 features per split.
# random_state is fixed because max_features is smaller than the number of
# columns, so the features examined at each split are drawn at random: without
# a seed the scores below change on every run.
regr = DecisionTreeRegressor(max_depth=10,
                             criterion='squared_error',
                             min_samples_split=10,
                             min_samples_leaf=10,
                             max_features=3,
                             random_state=1)
regr.fit(X_train, y_train)
print("Train data R2 was: {:.2f}".format(regr.score(X_train, y_train)))
print("test data R2 was: {:.2f}".format(regr.score(X_test, y_test)))

Train data R2 was: 0.79
test data R2 was: 0.77


## Cross validation

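Here we're going to split the train set into several subsets called "folds". For each fold, we train one model using all the folds except that one, and then evaluate the model on the fold that was left out. This gives one score per fold, which we can average.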

In [39]:
# First configuration to cross-validate: deep tree, no leaf/split regularisation,
# 6 features per split. The estimator is only created here, not fitted.
# random_state is fixed so the random feature subsets (max_features=6) are the
# same on every run, which keeps the cross-validation scores reproducible.
regr = DecisionTreeRegressor(max_depth=10,
                             criterion='squared_error',
                             min_samples_split=2,
                             min_samples_leaf=1,
                             max_features=6,
                             random_state=1)

In [40]:
from sklearn.model_selection import cross_validate

# Cross-validation is always run on the TRAIN set only; cv is the number of folds (K).
results = cross_validate(estimator=regr, X=X_train, y=y_train, cv=5)

In [41]:
# Display the raw cross-validation output: a dict with 'fit_time', 'score_time'
# and 'test_score' arrays holding one value per fold.
results

{'fit_time': array([0.01045799, 0.00796342, 0.00590253, 0.00556731, 0.00518703]),
 'score_time': array([0.00416994, 0.00335979, 0.00370741, 0.00636744, 0.00445271]),
 'test_score': array([0.77530302, 0.69406479, 0.70102384, 0.60747976, 0.64000578])}

In [42]:
# Summarise the per-fold scores: the mean says how good the model is on
# average, the standard deviation how much it varies from fold to fold.
fold_scores = results['test_score']
print(fold_scores)
print("The average R2 over the folds is: {:.2f}".format(fold_scores.mean()))
print("The standard deviation of R2 over the folds is: {:.2f}".format(fold_scores.std()))

[0.77530302 0.69406479 0.70102384 0.60747976 0.64000578]
The average R2 over the folds is: 0.68
The standard deviation of R2 over the folds is: 0.06


In [43]:
# Second configuration to cross-validate: stronger regularisation (10 samples
# to split, 10 samples per leaf) and only 3 features per split.
# random_state is fixed so the random feature subsets (max_features=3) are the
# same on every run, which keeps the scores and importances reproducible.
regr = DecisionTreeRegressor(max_depth=10,
                             criterion='squared_error',
                             min_samples_split=10,
                             min_samples_leaf=10,
                             max_features=3,
                             random_state=1)

In [52]:
# Feature importances can only be read from a fitted model, so fit first.
regr.fit(X_train, y_train)
# Pair every column name with its importance.
[(name, importance) for name, importance in zip(X_train.columns, regr.feature_importances_)]

[('CRIM', 0.01779334590490734),
 ('ZN', 0.005886287461304785),
 ('INDUS', 0.00251311181804456),
 ('CHAS', 0.0),
 ('NOX', 0.318442879013907),
 ('RM', 0.4631028338617469),
 ('AGE', 0.008850610622008152),
 ('DIS', 0.001743103088802231),
 ('RAD', 0.016564111126135882),
 ('TAX', 0.0012650516461269214),
 ('PTRATIO', 0.011091838690299147),
 ('B', 0.002282199450594624),
 ('LSTAT', 0.15046462731612245)]

In [44]:
# Cross-validate the second configuration on the train set with 5 folds,
# then report the per-fold R2 together with its mean and spread.
results = cross_validate(estimator=regr, X=X_train, y=y_train, cv=5)
fold_scores = results['test_score']
print(fold_scores)
print("The average R2 over the folds is: {:.2f}".format(fold_scores.mean()))
print("The standard deviation of R2 over the folds is: {:.2f}".format(fold_scores.std()))

[0.6351758  0.8035518  0.62138127 0.51499315 0.49812769]
The average R2 over the folds is: 0.61
The standard deviation of R2 over the folds is: 0.11


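So, according to cross-validation, the first configuration works better here: its average R2 over the folds is higher (0.68 vs 0.61) and it varies less from fold to fold (standard deviation of 0.06 vs 0.11).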

## Hyperparameter search

As we can see, Decision Trees have many hyperparameters to adjust. How can we find the best ones?

There are two possible strategies:

* Grid Search ( a collection of pre-defined hyperparameters is tested )
* Random Search ( random combinations drawn from a pre-defined range of hyperparameters are tested )

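The first approach is more systematic (every combination is tried) but can be much slower. The second one only tries a fixed number of random combinations, so it is faster and can cover wider ranges, although it may miss the best combination.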

Let's use each.

### Grid Search

In [53]:
from sklearn.model_selection import GridSearchCV

# Candidate values that the grid search will combine exhaustively.
max_depth_choices = [3, 10, None]                          # possible values of max_depth
criterion_choices = ['squared_error', 'absolute_error']    # possible optimization metrics
min_samples_split_choices = [2, 10]                        # possible values of min_samples_split
min_samples_leaf_choices = [2, 10]                         # possible values of min_samples_leaf

In [54]:
# Build the grid: a dictionary mapping each hyperparameter to its candidate values.
# The keyword names below become the dictionary keys, so they have to match the
# hyperparameter names in the documentation of the model.
grid = dict(
    max_depth=max_depth_choices,
    criterion=criterion_choices,
    min_samples_split=min_samples_split_choices,
    min_samples_leaf=min_samples_leaf_choices,
)

In [55]:
# Set up the grid search object.
#   estimator  -> the model to optimize
#   param_grid -> the dictionary of hyperparameters to optimize
#   cv = 5     -> number of cross validation folds <------ CV is REALLY important in grid search. Why?
# Cost: 3 * 2 * 2 * 2 = 24 combinations -> 24 * 5 model fits
model = DecisionTreeRegressor()
grid_search = GridSearchCV(model, grid, cv=5)

In [56]:
# Fit the grid search to the training data. Only the train split is passed in;
# the test split is not used by the search.
grid_search.fit(X_train, y_train)

In [57]:
# And the winner is... the hyperparameter combination that the grid search
# selected as best.
grid_search.best_params_

{'criterion': 'squared_error',
 'max_depth': None,
 'min_samples_leaf': 2,
 'min_samples_split': 10}

In [58]:
# In grid search you are more likely to get really good results in your
# training set, even with CV.
best_grid_r2 = grid_search.best_score_
print("The best R2 for the best hyperparameters is {:.2f}".format(best_grid_r2))

The best R2 for the best hyperparameters is 0.79


### Random Search

In [59]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values from which the random search draws its combinations.
max_depth_choices = list(range(3, 11)) + [None]           # max_depth: 3..10 plus None
criterion_choices = ['squared_error', 'absolute_error']   # possible optimization metrics
min_samples_split_choices = list(range(2, 11))            # min_samples_split: 2..10
min_samples_leaf_choices = list(range(2, 11))             # min_samples_leaf: 2..10
max_features_choices = list(range(2, 7))                  # max_features used in the Decision Tree: 2..6

# Mapping from hyperparameter name to its candidate values.
random_grid = dict(
    max_depth=max_depth_choices,
    criterion=criterion_choices,
    min_samples_split=min_samples_split_choices,
    min_samples_leaf=min_samples_leaf_choices,
    max_features=max_features_choices,
)

In [60]:
# Trying an exhaustive grid search over the larger grid
# means building 9 * 2 * 9 * 9 * 5 * 5 = 36450 models
# (9 * 2 * 9 * 9 * 5 combinations, each fitted on 5 folds).

# random_state is fixed because max_features is part of the grid: the features
# considered at each split are drawn at random, so without a seed the winning
# combination and its score can change between runs.
model = DecisionTreeRegressor(random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=random_grid, cv=5)
grid_search.fit(X_train, y_train)

In [64]:
# Best combination selected by the exhaustive grid search that was run over random_grid.
grid_search.best_params_

{'criterion': 'squared_error',
 'max_depth': 8,
 'max_features': 6,
 'min_samples_leaf': 3,
 'min_samples_split': 10}

In [61]:
# This score comes from grid_search (the exhaustive search over random_grid),
# not from the randomized search, so the message says "grid search".
print("The best R2 according to the grid search is {:.2f}".format(grid_search.best_score_))

The best R2 according to the random search is 0.80


In [62]:
# And now more realistic: instead of trying every combination, sample only
# n_iter=25 of them at random and cross-validate each one on 5 folds.
# random_state is fixed on the tree (random feature subsets via max_features)
# and on the search (which combinations are sampled) so the result is reproducible.
model = DecisionTreeRegressor(random_state=1)
random_search = RandomizedSearchCV(estimator=model,
                                   param_distributions=random_grid,
                                   n_iter=25,
                                   cv=5,
                                   n_jobs=-3,  # number_processors - 2, whatever the machine has
                                   random_state=1)

In [63]:
# Run the randomized search on the training data only.
random_search.fit(X_train,y_train)

In [65]:
# Best combination among the ones the randomized search sampled.
random_search.best_params_

{'min_samples_split': 4,
 'min_samples_leaf': 5,
 'max_features': 6,
 'max_depth': 4,
 'criterion': 'squared_error'}

In [66]:
# Score of the best combination the randomized search found.
best_random_r2 = random_search.best_score_
print("The best R2 according to the random search is {:.2f}".format(best_random_r2))

The best R2 according to the random search is 0.71
