## <span style="color: blue">Perform Grid Search on Decision Tree to find optimal set of hyperparameters</span>

In [224]:
from sklearn.datasets import make_moons
import numpy as np
from sklearn.model_selection import train_test_split

X, y = make_moons(n_samples=10000, noise=0.4)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [225]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier()
# define a dictionary of hyperparameters and a list of options for each one to test for the optimal solution
params = {
    'min_samples_split': [2, 3], 
    'max_leaf_nodes': list(range(2, 31)),
}
clf = GridSearchCV(tree, params)
clf.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'min_samples_split': [2, 3], 'max_leaf_nodes': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [226]:
best_tree = clf.best_estimator_
best_params = clf.best_params_
best_params

{'max_leaf_nodes': 18, 'min_samples_split': 2}

### Cross validation

In [227]:
y_pred = best_tree.predict(X_test)
accuracy_score = np.mean(y_pred == y_test)
accuracy_score

0.87

## <span style="color: blue">Grow a Random Forest</span>

### Set up training data splitter

In [228]:
from sklearn.model_selection import ShuffleSplit
splitter = ShuffleSplit(n_splits=1000, train_size=100) # test_size becomes the complement of train_size
num_splits = splitter.get_n_splits(X_train)
num_splits



1000

### Create a 1000-tree forest, each tree trained on a 100-instance subset of the data

In [229]:
vtree = np.vectorize(DecisionTreeClassifier)
forest = np.empty((1, num_splits), dtype=object)
forest[:] = DecisionTreeClassifier(
    max_leaf_nodes=best_params['max_leaf_nodes'], 
    min_samples_split=best_params['min_samples_split']
) 
accuracy_scores = np.empty((1, num_splits), dtype=np.dtype('float'))
i = 0
for train, test in splitter.split(X_train):
    X_train_sub = X[train] # where train is an array containing the indices to extract from X
    y_train_sub = y[train]
    forest[0, i].fit(X_train_sub, y_train_sub)
    X_test_sub = X[test]
    y_test_sub = y[test]
    y_pred = forest[0, i].predict(X_test_sub)
    accuracy_scores[0, i] = np.mean(y_pred == y_test_sub)
    i+=1

forest_avg_accuracy = np.mean(accuracy_scores)
forest_avg_accuracy # ~80% accuracy over 1000 trees trained on 100 subsets, which is pretty good!

0.7988112500000001

In [230]:
from scipy.stats import mode

y_pred = np.empty((num_splits, len(X_test)), dtype=np.dtype('int'))
for index, tree in enumerate(forest[0]):
    y_pred[index] = tree.predict(X_test)

y_pred_majority, n_votes = mode(y_pred, axis=0)
y_pred_majority

array([[0, 0, 0, ..., 1, 1, 0]])

In [231]:
np.mean(y_pred_majority == y_test) # accuracy score

0.839

The process of this random forests is correct and the score is reasonable, but unusual how it is lower than the first model. (???)  
However, it does show that the random forests does work better than a single tree in the forest, on average.