Train and fine-tune a Decision Tree for moons dataset.

In [2]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

In [3]:
moons = make_moons(n_samples=10000, noise=0.4, random_state=22)
X, y = moons[0], moons[1]
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [4]:
DT_clf = DecisionTreeClassifier(random_state = 22)
params = {'max_leaf_nodes': list(range(2, 100)),'min_samples_split': [2, 3, 4]}
grid_search_cv = GridSearchCV(DT_clf,param_grid = params, n_jobs = -1, verbose=1)
grid_search_cv.fit(X_train,y_train)

Fitting 3 folds for each of 294 candidates, totalling 882 fits


[Parallel(n_jobs=-1)]: Done 882 out of 882 | elapsed:    3.2s finished


GridSearchCV(cv=None, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=22,
            splitter='best'),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'max_leaf_nodes': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99], 'min_samples_split': [2, 3, 4]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None

In [5]:
grid_search_cv.best_estimator_

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=16,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=22,
            splitter='best')

In [6]:
# by default GridSearchCV train the best model on whole training dataset. so we can simply evaluate the model's accuracy

y_pred = grid_search_cv.predict(X_test)
accuracy_score(y_test,y_pred)

0.8604

Not bad! 

Let's try to grow a forest.

By generating 1,000 subsets of the training set, each containing 100 instances selected randomly. And then perform majority-vote predictions over the test set.

In [38]:
from sklearn.model_selection import ShuffleSplit
n_trees = 1000
n_instances = 100

rs = ShuffleSplit(n_splits=n_trees, train_size=n_instances, random_state=22)
mini_set = []
for train_index, test_index in rs.split(X):
    X_mini_train = X[train_index]
    y_mini_train = y[train_index]
    mini_set.append((X_mini_train,y_mini_train))



In [39]:
from sklearn.base import clone
forest = [clone(grid_search_cv.best_estimator_) for _ in range(n_trees)]

scores = []

for tree, data in zip(forest,mini_set):
    X_mini_train = data[0]
    y_mini_train = data[1]
    
    tree.fit(X_mini_train,y_mini_train)
    
    y_mini_pred = tree.predict(X_test)
    scores.append(accuracy_score(y_test,y_mini_pred))


In [40]:
import numpy as np
np.mean(scores)

0.7978052

Try majority-vote predictions over the test set.

In [47]:
Y_pred = np.empty([n_trees, len(X_test)], dtype=np.uint8)
for tree_ind, tree in enumerate(forest):
    Y_pred[tree_ind] = tree.predict(X_test)

In [52]:
from scipy.stats import mode
y_pred, count = mode(Y_pred, axis = 0)

In [54]:
accuracy_score(y_test,y_pred.reshape([-1]))

0.856