<a href="https://colab.research.google.com/github/nfilipas/handson-ml3/blob/main/exercises/chapter6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Exercise 7

In [36]:
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

import pandas as pd

In [12]:
# Synthetic, noisy two-moons data; seeded so every run sees the same points.
X, y = make_moons(
    n_samples=10000,
    noise=0.4,
    random_state=42,
)

# Stratified hold-out split. No explicit size is passed, so scikit-learn's
# default train/test proportions apply.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)

In [42]:
# Base estimator whose hyperparameters are tuned below; seeded for repeatability.
tree_clf = DecisionTreeClassifier(random_state=42)

# Exhaustive grid: 2 criteria x 2 splitters x 18 depths x 98 leaf-node limits.
params = {
    "criterion": ["gini", "entropy"],
    "splitter": ["best", "random"],
    "max_depth": [*range(2, 20)],        # depths 2..19
    "max_leaf_nodes": [*range(2, 100)],  # leaf-node limits 2..99
}

# Every combination is scored with 3-fold cross-validation on the training set.
grid_search = GridSearchCV(estimator=tree_clf, param_grid=params, cv=3)

# Left as the trailing expression so the cell displays the fitted search object.
grid_search.fit(X_train, y_train)

In [43]:
# Tabulate the per-candidate cross-validation results, then show the three
# best-ranked parameter combinations.
cv_res = pd.DataFrame(grid_search.cv_results_)
cv_res_ranked = cv_res.sort_values(by="rank_test_score")
cv_res_ranked.head(3)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_max_leaf_nodes,param_splitter,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
1639,0.003773,0.000193,0.001647,0.000115,gini,10,37,random,"{'criterion': 'gini', 'max_depth': 10, 'max_le...",0.862,0.8544,0.8612,0.8592,0.00341,1
1645,0.003308,9.4e-05,0.001468,7e-05,gini,10,40,random,"{'criterion': 'gini', 'max_depth': 10, 'max_le...",0.862,0.8544,0.8604,0.858933,0.003271,2
1647,0.003475,0.000228,0.001548,6e-05,gini,10,41,random,"{'criterion': 'gini', 'max_depth': 10, 'max_le...",0.862,0.8544,0.8604,0.858933,0.003271,2


In [45]:
# The search above was built without overriding `refit`, so GridSearchCV has
# already retrained the winning configuration on the whole training set.
# `best_estimator_` is therefore ready to use; fitting it again would only
# repeat that work and produce the same tree (the estimator is seeded).
tree_clf_best = grid_search.best_estimator_

# Bare expression so the cell still displays the selected estimator.
tree_clf_best

In [46]:
# Accuracy of the tuned tree on the held-out test set.
y_test_pred = tree_clf_best.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(test_accuracy)

0.8636


# Exercise 8

In [101]:
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split, GridSearchCV, ShuffleSplit
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.base import clone
from scipy.stats import mode

import pandas as pd
import numpy as np

In [65]:
# Rebuild the seeded two-moons dataset for this exercise.
X, y = make_moons(
    n_samples=10000,
    noise=0.4,
    random_state=42,
)

# Stratified hold-out split; sizes are left at scikit-learn's defaults.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)

In [66]:
# Draw `n_splits` random subsets of the training set, 100 instances each.
n_splits = 1000
rs = ShuffleSplit(n_splits=n_splits, train_size=100, random_state=42)

# Only the first ("train") index array of each split is used; the second
# element is ignored.
subset_indices = [train_indices for train_indices, _ in rs.split(X_train)]
X_train_list = [X_train[idx] for idx in subset_indices]
y_train_list = [y_train[idx] for idx in subset_indices]

In [67]:
# Train one fresh copy of the tuned tree on each small training subset.
estimators = []
for X_subset, y_subset in zip(X_train_list, y_train_list):
    # `clone` yields an unfitted estimator with the same hyperparameters;
    # `fit` returns the estimator itself, so the two calls can be chained.
    estimator = clone(tree_clf_best).fit(X_subset, y_subset)
    estimators.append(estimator)

In [76]:
# Score every tree on the shared test set. The score must come from each
# element of `estimators`; using the leftover loop variable `estimator` would
# evaluate only the last-trained tree once per iteration.
accuracies = [
    accuracy_score(y_test, tree.predict(X_test))
    for tree in estimators
]

# Average accuracy of the individual small trees, rounded for display.
print(round(sum(accuracies) / len(accuracies), 2))

0.81


In [112]:
# One column of test-set predictions per tree. Stacking the prediction vectors
# sizes the matrix from the data, so it stays correct if the test-set size or
# the number of trees changes.
y_preds = np.column_stack([tree.predict(X_test) for tree in estimators])

# Majority vote across trees for each test instance. Depending on the SciPy
# version, `mode` either keeps or drops the reduced axis, so the result is
# flattened to a 1-D label vector before scoring.
y_pred = np.ravel(mode(y_preds, axis=1).mode)
print(accuracy_score(y_test, y_pred))

0.866
