In [1]:
import numpy as np
from matplotlib import pyplot as plt
from sklearn.datasets import load_digits, load_boston
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, mean_squared_error
from DecisionTree import DecisionTree

### Classification

We will use the digits dataset from sklearn.

In [2]:
# Load the digits dataset; its .data and .target attributes are split below.
digits = load_digits()

Split the data with `train_test_split`, holding out 20% (`test_size=0.2`) as the test set.

In [3]:
# Hold out 20% of the digits for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    digits.data,
    digits.target,
    test_size=0.2,
    random_state=17,
)

Create a tree of depth 4 and fit it. It's not tuned, so the results are not that great.<br>
The default criterion is gini

In [4]:
# Untuned baseline: depth-4 tree using the default (gini) criterion.
tr = DecisionTree(max_depth=4)
tr.fit(X_train, y_train)

# Score the baseline on the held-out digits; the accuracy is the cell's output.
preds = tr.predict(X_test)
accuracy_score(y_true=y_test, y_pred=preds)

0.5666666666666667

Let's try the entropy criterion.

In [5]:
# Same depth-4 baseline, but splitting on entropy instead of gini.
tre = DecisionTree(max_depth=4, criterion='entropy')
tre.fit(X_train, y_train)

# Held-out accuracy for the entropy baseline (displayed as the cell's output).
preds1 = tre.predict(X_test)
accuracy_score(y_true=y_test, y_pred=preds1)

0.6361111111111111

Let's run a grid search over `max_depth` from 3 to 10 (`range(3, 11)`), first with the gini criterion and then with entropy.

In [6]:
# Candidate depths for the grid search: range(3, 11) yields 3 through 10 (11 is excluded).
tree_params = {'max_depth': range(3, 11)}

In [7]:
# NOTE: this rebinds `tr`, which previously held the fitted depth-4 baseline tree.
tr = DecisionTree()

# 5-fold cross-validated search over max_depth, ranked by accuracy.
tr_grid = GridSearchCV(
    estimator=tr,
    param_grid=tree_params,
    scoring='accuracy',
    cv=5,
    verbose=True,
)
tr_grid.fit(X_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:  7.7min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=DecisionTree(criterion='gini', debug=False,
                                    max_depth=inf, min_samples_split=2),
             iid='warn', n_jobs=None, param_grid={'max_depth': range(3, 11)},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=True)

In [8]:
# Best max_depth found by the search and its mean cross-validated accuracy.
tr_grid.best_params_, tr_grid.best_score_

({'max_depth': 9}, 0.8399443284620738)

In [9]:
# Predict on the held-out digits; the search was built with refit=True (see its repr),
# so predictions come from the best configuration refit on the training split.
grid_pred = tr_grid.predict(X_test)

In [10]:
# Held-out accuracy of the tuned gini tree.
accuracy_score(y_true=y_test, y_pred=grid_pred)

0.8305555555555556

Now let's try the entropy criterion.

In [11]:
# Repeat the depth search, this time with the entropy criterion.
tr1 = DecisionTree(criterion='entropy')
tr_grid1 = GridSearchCV(
    estimator=tr1,
    param_grid=tree_params,
    scoring='accuracy',
    cv=5,
    verbose=True,
)
tr_grid1.fit(X_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:  8.4min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=DecisionTree(criterion='entropy', debug=False,
                                    max_depth=inf, min_samples_split=2),
             iid='warn', n_jobs=None, param_grid={'max_depth': range(3, 11)},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=True)

In [12]:
# Best max_depth for the entropy tree and its mean cross-validated accuracy.
tr_grid1.best_params_, tr_grid1.best_score_

({'max_depth': 8}, 0.8663883089770354)

In [13]:
# Predict on the held-out digits with the refit best entropy tree (refit=True in the search repr).
grid_pred1 = tr_grid1.predict(X_test)

In [14]:
# Held-out accuracy of the tuned entropy tree.
accuracy_score(y_true=y_test, y_pred=grid_pred1)

0.8833333333333333

### Regression Tree

We'll use the Boston house-pricing dataset from sklearn for the regression problem. The split criteria used here are `variance` and `mad_median`. For scoring we'll use `mean_squared_error` on the test set and `neg_mean_squared_error` inside the grid search.

In [15]:
# Load the Boston housing dataset; its .data and .target attributes are split below.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell needs an older release — confirm the pinned version.
ds = load_boston()

In [16]:
# NOTE: rebinds X_train / X_test / y_train / y_test, replacing the digits split
# used in the classification section with the Boston housing split.
X_train, X_test, y_train, y_test = train_test_split(
    ds.data,
    ds.target,
    test_size=0.2,
    random_state=17,
)

In [17]:
# Untuned regression baseline: depth-4 tree splitting on variance.
tr_reg = DecisionTree(max_depth=4, criterion='variance')
tr_reg.fit(X_train, y_train)

# Test-set mean squared error (displayed as the cell's output).
pred_reg = tr_reg.predict(X_test)
mean_squared_error(y_true=y_test, y_pred=pred_reg)

13.626382527384985

In [18]:
# Same depth-4 baseline, splitting on the mad_median criterion instead.
tr_reg1 = DecisionTree(max_depth=4, criterion='mad_median')
tr_reg1.fit(X_train, y_train)

# Test-set mean squared error for the mad_median baseline.
pred_reg1 = tr_reg1.predict(X_test)
mean_squared_error(y_true=y_test, y_pred=pred_reg1)

40.2708660909879

Grid search over `max_depth` from 3 to 10 (`range(3, 11)`).

In [19]:
# Same candidate depths as the classification grid: range(3, 11) yields 3 through 10.
tree_params = {'max_depth': range(3, 11)}

In [20]:
# 5-fold search over max_depth for the variance tree. The scorer is the negated
# MSE, so a larger (closer to zero) score is better.
tr_reg2 = DecisionTree(criterion='variance')
tr_reg_grid = GridSearchCV(
    estimator=tr_reg2,
    param_grid=tree_params,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=True,
)
tr_reg_grid.fit(X_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:  2.4min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=DecisionTree(criterion='variance', debug=False,
                                    max_depth=inf, min_samples_split=2),
             iid='warn', n_jobs=None, param_grid={'max_depth': range(3, 11)},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='neg_mean_squared_error', verbose=True)

In [21]:
# Best max_depth and its mean cross-validated score; the score is a negated MSE
# (scoring='neg_mean_squared_error'), hence the negative value.
tr_reg_grid.best_params_, tr_reg_grid.best_score_

({'max_depth': 5}, -21.35204908672389)

In [22]:
# Predict on the held-out houses with the refit best variance tree (refit=True in the search repr).
tr_reg_preds = tr_reg_grid.predict(X_test)

In [23]:
# Test-set mean squared error of the tuned variance tree.
mean_squared_error(y_true=y_test, y_pred=tr_reg_preds)

12.778854727585674

With criterion mad_median

In [24]:
# Repeat the depth search with the mad_median criterion; still ranked by negated MSE.
tr_reg3 = DecisionTree(criterion='mad_median')
tr_reg_grid1 = GridSearchCV(
    estimator=tr_reg3,
    param_grid=tree_params,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=True,
)
tr_reg_grid1.fit(X_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:  3.8min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=DecisionTree(criterion='mad_median', debug=False,
                                    max_depth=inf, min_samples_split=2),
             iid='warn', n_jobs=None, param_grid={'max_depth': range(3, 11)},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='neg_mean_squared_error', verbose=True)

In [25]:
# Best max_depth for the mad_median tree and its mean cross-validated negated MSE.
tr_reg_grid1.best_params_, tr_reg_grid1.best_score_

({'max_depth': 10}, -46.26327320232432)

In [26]:
# Predict on the held-out houses with the refit best mad_median tree (refit=True in the search repr).
tr_reg_preds1 = tr_reg_grid1.predict(X_test)

In [27]:
# Test-set mean squared error of the tuned mad_median tree.
mean_squared_error(y_true=y_test, y_pred=tr_reg_preds1)

38.663449747070636