# Decision Tree Regression & Cross-Validation Practical Implementation

In [51]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [52]:
data_url = "http://lib.stat.cmu.edu/datasets/boston"
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)

In [53]:
df = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
df


array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 1.5300e+01, 3.9690e+02,
        4.9800e+00],
       [2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9690e+02,
        9.1400e+00],
       [2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9283e+02,
        4.0300e+00],
       ...,
       [6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
        5.6400e+00],
       [1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9345e+02,
        6.4800e+00],
       [4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
        7.8800e+00]])

In [54]:
target = raw_df.values[1::2, 2]
target

array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15. ,
       18.9, 21.7, 20.4, 18.2, 19.9, 23.1, 17.5, 20.2, 18.2, 13.6, 19.6,
       15.2, 14.5, 15.6, 13.9, 16.6, 14.8, 18.4, 21. , 12.7, 14.5, 13.2,
       13.1, 13.5, 18.9, 20. , 21. , 24.7, 30.8, 34.9, 26.6, 25.3, 24.7,
       21.2, 19.3, 20. , 16.6, 14.4, 19.4, 19.7, 20.5, 25. , 23.4, 18.9,
       35.4, 24.7, 31.6, 23.3, 19.6, 18.7, 16. , 22.2, 25. , 33. , 23.5,
       19.4, 22. , 17.4, 20.9, 24.2, 21.7, 22.8, 23.4, 24.1, 21.4, 20. ,
       20.8, 21.2, 20.3, 28. , 23.9, 24.8, 22.9, 23.9, 26.6, 22.5, 22.2,
       23.6, 28.7, 22.6, 22. , 22.9, 25. , 20.6, 28.4, 21.4, 38.7, 43.8,
       33.2, 27.5, 26.5, 18.6, 19.3, 20.1, 19.5, 19.5, 20.4, 19.8, 19.4,
       21.7, 22.8, 18.8, 18.7, 18.5, 18.3, 21.2, 19.2, 20.4, 19.3, 22. ,
       20.3, 20.5, 17.3, 18.8, 21.4, 15.7, 16.2, 18. , 14.3, 19.2, 19.6,
       23. , 18.4, 15.6, 18.1, 17.4, 17.1, 13.3, 17.8, 14. , 14.4, 13.4,
       15.6, 11.8, 13.8, 15.6, 14.6, 17.8, 15.4, 21

In [55]:
column_names = [
    "CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", 
    "AGE", "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT"
]

In [56]:
boston_df = pd.DataFrame(df, columns=column_names)
boston_df

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48


In [57]:
# independent features
X = boston_df
# dependent features
y = target

In [58]:
X

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48


In [59]:
y

array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15. ,
       18.9, 21.7, 20.4, 18.2, 19.9, 23.1, 17.5, 20.2, 18.2, 13.6, 19.6,
       15.2, 14.5, 15.6, 13.9, 16.6, 14.8, 18.4, 21. , 12.7, 14.5, 13.2,
       13.1, 13.5, 18.9, 20. , 21. , 24.7, 30.8, 34.9, 26.6, 25.3, 24.7,
       21.2, 19.3, 20. , 16.6, 14.4, 19.4, 19.7, 20.5, 25. , 23.4, 18.9,
       35.4, 24.7, 31.6, 23.3, 19.6, 18.7, 16. , 22.2, 25. , 33. , 23.5,
       19.4, 22. , 17.4, 20.9, 24.2, 21.7, 22.8, 23.4, 24.1, 21.4, 20. ,
       20.8, 21.2, 20.3, 28. , 23.9, 24.8, 22.9, 23.9, 26.6, 22.5, 22.2,
       23.6, 28.7, 22.6, 22. , 22.9, 25. , 20.6, 28.4, 21.4, 38.7, 43.8,
       33.2, 27.5, 26.5, 18.6, 19.3, 20.1, 19.5, 19.5, 20.4, 19.8, 19.4,
       21.7, 22.8, 18.8, 18.7, 18.5, 18.3, 21.2, 19.2, 20.4, 19.3, 22. ,
       20.3, 20.5, 17.3, 18.8, 21.4, 15.7, 16.2, 18. , 14.3, 19.2, 19.6,
       23. , 18.4, 15.6, 18.1, 17.4, 17.1, 13.3, 17.8, 14. , 14.4, 13.4,
       15.6, 11.8, 13.8, 15.6, 14.6, 17.8, 15.4, 21

In [60]:
# train test split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42)

In [61]:
from sklearn.tree import DecisionTreeRegressor

regressor = DecisionTreeRegressor()

In [62]:
regressor.fit(X_train, y_train)

In [63]:
y_predict = regressor.predict(X_test)
y_predict

array([23.9, 32. , 13.3, 24.1, 19.4, 18.5, 22.5, 14.9, 22.7, 21.2, 27.1,
       27.1,  5. , 22.2, 18.7, 25. , 15.2,  7.4, 50. , 14.6, 24.7, 23.1,
       13.6, 22.8, 12.7, 15.2, 21.7, 13.4, 19.4, 20. , 18.8, 23.1, 10.4,
       21.9, 16.7, 15.6, 33.1, 18.7, 20.4, 24.1, 20. , 30.1, 50. , 17.1,
       19.8, 12.7, 13.6, 24.1, 20. , 32. , 23.4, 33.4, 21.7, 30.1, 43.1,
       19.9, 18. , 26.6, 20.5, 22.5, 24.5, 32.9, 29.4, 18.2, 27.9, 13.6,
       15.4, 23. , 27.9, 13.4, 22.6, 28.7, 11.8, 23.5, 21.4,  5. , 19.8,
       42.3, 13.3,  8.1, 21.7, 11.7, 18.6, 11.8, 20.3, 28.4, 14.9, 23.1,
       23.1, 18. , 23.3,  5. , 19.2, 19.3, 23.3, 19.2, 50. , 11.9, 16.7,
       13.9, 17.5, 28.1, 14.6, 20.4, 21.1, 10.2, 20.4, 24.8, 17.5, 21.9,
       11.8, 14.5, 22.2, 29.6, 31.7, 13.5, 50. , 14.8, 21.2, 23.5, 16.2,
       24.8,  5.6, 20.3, 24.7, 23.1, 23.3, 37.2, 21.4, 46. , 15.2, 25. ,
       18.2, 18.4, 14.6, 22.3, 17.5, 31.1, 29.8, 16. , 22.5, 23.5, 18.7,
       16.6,  5. , 19.9, 15.6,  8.5, 13.6, 44.8, 14

In [64]:
from sklearn.metrics import r2_score

score = r2_score(y_predict, y_test)
score

0.8528621796925602

# Hyper parameter Tunning

In [87]:
parameters = {
    'criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
    'splitter': ['best', 'random'],
    'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
    'max_features': ['auto', 'sqrt', 'log2']  # 'auto' replaced with None
}

In [88]:
regressor = DecisionTreeRegressor()

In [89]:
from sklearn.model_selection import GridSearchCV

regressor_cv = GridSearchCV(
    regressor,
    param_grid=parameters,
    cv=5,
    scoring='neg_mean_squared_error'
)

In [90]:
regressor_cv.fit(X_train, y_train)

480 fits failed out of a total of 1440.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
480 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\user\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\user\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
    ~~~~~~~~~~~~~~~~~~~~~~~~~~^^
  File "C:\Users\user\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parame

In [91]:
regressor_cv.best_params_

{'criterion': 'friedman_mse',
 'max_depth': 7,
 'max_features': 'log2',
 'splitter': 'best'}

In [92]:
y_predict = regressor_cv.predict(X_test)
y_predict

array([22.47297297, 30.89166667, 16.925     , 22.47297297, 23.2       ,
       22.47297297, 18.43529412, 16.925     , 21.16363636, 22.47297297,
       18.43529412, 18.43529412,  8.5       , 22.47297297, 18.6875    ,
       25.        , 21.4       ,  9.25      , 47.75      , 14.2       ,
       22.47297297, 22.47297297, 15.57272727, 26.49166667, 14.64      ,
       15.57272727, 22.47297297, 14.2       , 18.43529412, 18.6875    ,
       18.43529412, 22.47297297, 17.8       , 17.23333333, 15.57272727,
       14.64      , 28.7       , 20.46521739, 20.7       , 22.47297297,
       20.7       , 26.49166667, 47.75      , 18.6875    , 22.47297297,
       23.2       , 15.57272727, 22.47297297, 18.43529412, 30.89166667,
       22.47297297, 34.48235294, 18.43529412, 26.49166667, 48.7       ,
       20.46521739, 23.2       , 30.89166667, 22.47297297, 20.7       ,
       26.49166667, 34.48235294, 30.89166667, 20.46521739, 30.89166667,
       18.43529412, 15.        , 22.47297297, 30.89166667, 14.64

In [93]:
r2_score(y_predict, y_test)

0.7228295848949124