# Exercises
1. Depth of a Decision Tree with 1 million instances $= \log_{2}(10^{6}) \approx 20$
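2. Generally (but not always) lower, because the CART algorithm splits the subsets recursively, each time choosing the split that minimizes the weighted impurity of the two children, until it reaches the maximum depth or it cannot find a split that will reduce impurity. A single child can still be more impure than its parent, as long as the other child is pure enough to compensate.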
3. It is a good idea to decrease `max_depth`.
4. Decision Trees don't require feature scaling.
5. $\frac{n \times 10^{7}\log_{2}(10^{7})}{n \times 10^{6}\log_{2}(10^{6})} = \frac{10\log_{2}(10^{7})}{\log_{2}(10^{6})} \approx 11.667$ <br>
    Therefore, approximately 11.667 hours.
6. No, because setting `presort=True` speeds up training only when the number of instances is small (i.e., fewer than a few thousand instances).
7. 

In [7]:
from sklearn.datasets import make_moons
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.tree import DecisionTreeClassifier

# Build a noisy two-moons dataset and hold out 20% of it for testing.
X, y = make_moons(n_samples=1000, noise=0.4, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyperparameter grid: 98 values of max_leaf_nodes x 3 values of
# min_samples_split = 294 candidates, each evaluated with 3-fold CV.
params = {
    'max_leaf_nodes': list(range(2, 100)),
    'min_samples_split': [2, 3, 4],
}
grid_search_cv = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=params,
    cv=3,
    verbose=1,
)
grid_search_cv.fit(X_train, y_train)

# Display the best tree found by the search.
grid_search_cv.best_estimator_

Fitting 3 folds for each of 294 candidates, totalling 882 fits


DecisionTreeClassifier(max_leaf_nodes=4, random_state=42)

In [8]:
from sklearn.metrics import accuracy_score

# GridSearchCV delegates predict() to the best estimator it found;
# report that estimator's accuracy on the held-out test set.
y_pred = grid_search_cv.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.855

8. 

In [11]:
# a) Draw n_trees random subsets of the training set, n_instances rows each.

from sklearn.model_selection import ShuffleSplit

n_trees = 1000
n_instances = 100

# Sizing the "test" side as everything except n_instances rows leaves exactly
# n_instances rows on the "train" side of every split; only that side is kept.
rs = ShuffleSplit(n_splits=n_trees, test_size=len(X_train) - n_instances, random_state=42)

mini_sets = [
    (X_train[mini_train_index], y_train[mini_train_index])
    for mini_train_index, _ in rs.split(X_train)
]

In [15]:
# b) Train one copy of the best tree on each mini training set and score
#    every tree individually on the full test set.

from sklearn.base import clone
from sklearn.metrics import accuracy_score
import numpy as np

# clone() yields unfitted estimators that share the tuned hyperparameters.
forest = [clone(grid_search_cv.best_estimator_) for _ in range(n_trees)]

accuracy_scores = []

for tree, (X_mini_train, y_mini_train) in zip(forest, mini_sets):
    # fit() returns the estimator itself, so fitting and predicting can be chained.
    y_pred = tree.fit(X_mini_train, y_mini_train).predict(X_test)
    accuracy_scores.append(accuracy_score(y_test, y_pred))

# Average test accuracy of the individual trees.
np.mean(accuracy_scores)

0.817115

In [20]:
# c) Majority vote: for every test instance, collect the prediction of each
#    tree in the forest and keep the most frequent class.
from scipy.stats import mode

# One row per tree, one column per test instance.
Y_pred = np.empty([n_trees, len(X_test)], dtype=np.uint8)

for tree_index, tree in enumerate(forest):
    Y_pred[tree_index] = tree.predict(X_test)

# mode() along axis 0 gives, for each test instance, the most common predicted
# class and how many trees voted for it.
y_pred_majority_votes, n_votes = mode(Y_pred, axis=0)

# Depending on the SciPy version, mode() may or may not keep a leading
# length-1 axis; flattening makes the result 1-D in either case.
accuracy_score(y_test, y_pred_majority_votes.reshape([-1]))

0.85