In [1]:
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split, GridSearchCV, ShuffleSplit
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [2]:
X, y = make_moons( n_samples = 10000, noise = 0.4, random_state = 42 )
trainX, testX, trainY, testY = train_test_split(X, y, test_size = 0.17 )

In [3]:
tree = DecisionTreeClassifier()

params = { "max_depth" : [ None, *list(range(1,11)) ],
           "min_samples_split" : [2, 3, 4, 5, 10, 50, 100, 200, 500],
           "max_leaf_nodes" : [None, *list( range(2,100) )] }

gridTree = GridSearchCV( tree, params,
                         cv = 10, verbose = 1, n_jobs = 1 )

gridTree.fit(trainX, trainY)

Fitting 10 folds for each of 9801 candidates, totalling 98010 fits


[Parallel(n_jobs=1)]: Done 98010 out of 98010 | elapsed: 21.6min finished


GridSearchCV(cv=10, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': [None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'min_samples_split': [2, 3, 4, 5, 10, 50, 100, 200, 500], 'max_leaf_nodes': [None, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 4...75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [4]:
gridTree.best_score_

0.8590191589348114

In [5]:
gridTree.best_params_

{'max_depth': 6, 'max_leaf_nodes': 33, 'min_samples_split': 50}

In [6]:
bestTree = DecisionTreeClassifier( **gridTree.best_params_ )
bestTree.fit( trainX, trainY )

pred = bestTree.predict( testX )
accuracy_score( testY, pred )

0.84891240446796001

In [12]:
from sklearn.base import clone

rs = ShuffleSplit( n_splits = 1000, train_size = 100 )

trees = []

for ind, _ in rs.split(trainX, trainY):
    tree = DecisionTreeClassifier( **gridTree.best_params_ )
    tree.fit( trainX[ind], trainY[ind] )
    
    trees.append( tree )



In [13]:
from scipy.stats import mode
import numpy as np

preds = []

i = 0

for t in trees:
    preds.append(t.predict( testX ))
    i += 1

pred = mode( preds, axis = 0 )[0][0]

accuracy_score( testY, pred )

0.84009406231628458