# Exercises from chapter 7

## Ex. 8

In [56]:
import joblib

# Master switch for the grid searches: the `gridCV*.fit(...)` calls in the
# cells below only run when this is True.
CALCULATE = False

In [32]:
from sklearn.datasets import fetch_openml

# Fetch MNIST (OpenML dataset "mnist_784", version 1) as arrays rather than
# a DataFrame, then list the fields of the returned bunch.
mnist = fetch_openml(name="mnist_784", version=1, as_frame=False)
mnist.keys()

dict_keys(['data', 'target', 'frame', 'categories', 'feature_names', 'target_names', 'DESCR', 'details', 'url'])

In [33]:
import numpy as np
import pandas as pd

# Pull the feature matrix and the label vector out of the bunch.
X = mnist["data"]
y = mnist["target"]
y.shape

(70000,)

In [34]:
from sklearn.model_selection import train_test_split

# Hold out 10,000 samples for testing, then take another 10,000 from the
# remainder for validation. Both splits use the same fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=10000, random_state=42
)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, test_size=10000, random_state=42
)

# Show the resulting shapes on one line: train, validation, test.
print(*(split.shape for split in (X_train, X_valid, X_test)))

(50000, 784) (10000, 784) (10000, 784)


#### Decision Tree Classifier

In [59]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Search grid: 3 depth limits x 4 leaf-node limits = 12 candidates.
params = dict(
    max_depth=[20, 35, 50],
    max_leaf_nodes=[250, 350, 500, 1000],
)

tree_clf = DecisionTreeClassifier()
gridCV = GridSearchCV(
    estimator=tree_clf,
    param_grid=params,
    scoring="accuracy",
    cv=2,
    verbose=3,
)

# Only run the search when explicitly requested via CALCULATE.
if CALCULATE:
    gridCV.fit(X_train, y_train)

Fitting 2 folds for each of 12 candidates, totalling 24 fits
[CV 1/2] END ..max_depth=20, max_leaf_nodes=250;, score=0.826 total time=   3.7s
[CV 2/2] END ..max_depth=20, max_leaf_nodes=250;, score=0.834 total time=   3.7s
[CV 1/2] END ..max_depth=20, max_leaf_nodes=350;, score=0.834 total time=   3.5s
[CV 2/2] END ..max_depth=20, max_leaf_nodes=350;, score=0.845 total time=   3.6s
[CV 1/2] END ..max_depth=20, max_leaf_nodes=500;, score=0.841 total time=   4.1s
[CV 2/2] END ..max_depth=20, max_leaf_nodes=500;, score=0.854 total time=   3.9s
[CV 1/2] END .max_depth=20, max_leaf_nodes=1000;, score=0.844 total time=   4.5s
[CV 2/2] END .max_depth=20, max_leaf_nodes=1000;, score=0.854 total time=   4.1s
[CV 1/2] END ..max_depth=35, max_leaf_nodes=250;, score=0.826 total time=   3.3s
[CV 2/2] END ..max_depth=35, max_leaf_nodes=250;, score=0.834 total time=   3.9s
[CV 1/2] END ..max_depth=35, max_leaf_nodes=350;, score=0.834 total time=   3.4s
[CV 2/2] END ..max_depth=35, max_leaf_nodes=350;

In [60]:
if CALCULATE:
    # Winning tree configuration, followed by its best cross-validation score.
    print(gridCV.best_estimator_, gridCV.best_score_, sep="\n")

DecisionTreeClassifier(max_depth=20, max_leaf_nodes=1000)
0.8488


In [62]:
from sklearn.metrics import accuracy_score

# Either persist the freshly searched tree or reuse the one saved earlier.
# `gridCV.best_estimator_` only exists after `gridCV.fit(...)`, and that fit
# runs only when CALCULATE is True, so the dump branch must be the CALCULATE
# branch (same layout as the extra-trees and forest cells).
if CALCULATE:
    best_tree = gridCV.best_estimator_
    joblib.dump(best_tree, "models/14/tree.pkl")
else:
    best_tree = joblib.load("models/14/tree.pkl")

# Accuracy of the selected tree on the validation split.
y_tree = best_tree.predict(X_valid)
accuracy_score(y_valid, y_tree)


0.8731

#### Extra-Trees Classifier

In [38]:
from sklearn.ensemble import ExtraTreesClassifier

# Search grid: 2 ensemble sizes x 2 split criteria x 2 depth limits
# = 8 candidates.
params = dict(
    n_estimators=[100, 250],
    criterion=["gini", "entropy"],
    max_depth=[25, 50],
)

extra_tree_clf = ExtraTreesClassifier()

gridCV2 = GridSearchCV(
    estimator=extra_tree_clf,
    param_grid=params,
    scoring="accuracy",
    cv=2,
    verbose=3,
)

# Only run the search when explicitly requested via CALCULATE.
if CALCULATE:
    gridCV2.fit(X_train, y_train)

Fitting 2 folds for each of 8 candidates, totalling 16 fits
[CV 1/2] END criterion=gini, max_depth=25, n_estimators=100;, score=0.965 total time=  12.4s
[CV 2/2] END criterion=gini, max_depth=25, n_estimators=100;, score=0.965 total time=  11.7s
[CV 1/2] END criterion=gini, max_depth=25, n_estimators=250;, score=0.965 total time=  29.2s
[CV 2/2] END criterion=gini, max_depth=25, n_estimators=250;, score=0.967 total time=  30.8s
[CV 1/2] END criterion=gini, max_depth=50, n_estimators=100;, score=0.965 total time=  12.2s
[CV 2/2] END criterion=gini, max_depth=50, n_estimators=100;, score=0.965 total time=  12.2s
[CV 1/2] END criterion=gini, max_depth=50, n_estimators=250;, score=0.966 total time=  31.5s
[CV 2/2] END criterion=gini, max_depth=50, n_estimators=250;, score=0.968 total time= 2.2min
[CV 1/2] END criterion=entropy, max_depth=25, n_estimators=100;, score=0.963 total time=  14.8s
[CV 2/2] END criterion=entropy, max_depth=25, n_estimators=100;, score=0.964 total time=  11.7s
[CV 

In [39]:
if CALCULATE:
    # Winning extra-trees configuration, followed by its best CV score.
    print(gridCV2.best_estimator_, gridCV2.best_score_, sep="\n")

ExtraTreesClassifier(max_depth=50, n_estimators=250)
0.96712


In [61]:
if not CALCULATE:
    # No search was run: reuse the model stored by an earlier run.
    best_extra_tree = joblib.load("models/14/extra_tree.pkl")
else:
    # Keep the freshly tuned model and store it for later runs.
    best_extra_tree = gridCV2.best_estimator_
    joblib.dump(best_extra_tree, "models/14/extra_tree.pkl")

# Accuracy of the selected extra-trees model on the validation split.
y_extra = best_extra_tree.predict(X_valid)
accuracy_score(y_true=y_valid, y_pred=y_extra)

0.9731

#### Random Forest Classifier

In [41]:
from sklearn.ensemble import RandomForestClassifier

# Search grid: 2 ensemble sizes x 2 depth limits = 4 candidates.
params = {
    "n_estimators": [100, 250],
    "max_depth": [10, 25]
}

rnd_clf = RandomForestClassifier()
# State the metric explicitly so this search is configured the same way as
# the decision-tree and extra-trees searches (a classifier's default score is
# accuracy as well, so the reported numbers do not change).
gridCV3 = GridSearchCV(rnd_clf, params, scoring="accuracy", cv=2, verbose=3)

# Only run the search when explicitly requested via CALCULATE.
if CALCULATE:
    gridCV3.fit(X_train, y_train)

Fitting 2 folds for each of 4 candidates, totalling 8 fits
[CV 1/2] END ....max_depth=10, n_estimators=100;, score=0.944 total time=   9.6s
[CV 2/2] END ....max_depth=10, n_estimators=100;, score=0.943 total time=   8.9s
[CV 1/2] END ....max_depth=10, n_estimators=250;, score=0.945 total time=  22.6s
[CV 2/2] END ....max_depth=10, n_estimators=250;, score=0.945 total time=  23.5s
[CV 1/2] END ....max_depth=25, n_estimators=100;, score=0.960 total time=  11.6s
[CV 2/2] END ....max_depth=25, n_estimators=100;, score=0.962 total time=  12.2s
[CV 1/2] END ....max_depth=25, n_estimators=250;, score=0.962 total time=  29.6s
[CV 2/2] END ....max_depth=25, n_estimators=250;, score=0.964 total time=  30.2s


In [42]:
if CALCULATE:
    # Winning random-forest configuration, followed by its best CV score.
    print(gridCV3.best_estimator_, gridCV3.best_score_, sep="\n")

RandomForestClassifier(max_depth=25, n_estimators=250)
0.96266


In [43]:
if not CALCULATE:
    # No search was run: reuse the model stored by an earlier run.
    best_forest = joblib.load("models/14/forest.pkl")
else:
    # Keep the freshly tuned model and store it for later runs.
    best_forest = gridCV3.best_estimator_
    joblib.dump(best_forest, "models/14/forest.pkl")

# Accuracy of the selected random forest on the validation split.
y_forest = best_forest.predict(X_valid)
accuracy_score(y_true=y_valid, y_pred=y_forest)

0.9705

#### Voting Classifier

In [63]:
from sklearn.ensemble import VotingClassifier

# Hard-voting ensemble over the three selected models.
voting_clf = VotingClassifier(
    voting="hard",
    estimators=[
        ("forest", best_forest),
        ("extra", best_extra_tree),
        ("tree", best_tree),
    ],
)

voting_clf.fit(X_train, y_train)

In [64]:
# Accuracy of the voting ensemble on the validation split.
y_pred = voting_clf.predict(X_valid)
accuracy_score(y_true=y_valid, y_pred=y_pred)

0.9712

In [65]:
# Test-split accuracy of each individual model and of the voting ensemble,
# printed one per line in the same order and with the same labels.
for label, estimator in (
    ("Tree: ", best_tree),
    ("Extra Trees: ", best_extra_tree),
    ("Forest: ", best_forest),
    ("Voting: ", voting_clf),
):
    y_pred_test = estimator.predict(X_test)
    print(label, accuracy_score(y_test, y_pred_test))

Tree:  0.8693
Extra Trees:  0.9688
Forest:  0.9658
Voting:  0.9668


## Ex. 9

In [73]:
# Predictions of the three base models on the validation split.
y_pred_tree, y_pred_extra, y_pred_forest = (
    base_model.predict(X_valid)
    for base_model in (best_tree, best_extra_tree, best_forest)
)

# One column per base model; the validation labels are the blender's targets.
X_train_cont = np.column_stack((y_pred_tree, y_pred_extra, y_pred_forest))
y_train_cont = y_valid

In [74]:
# Blender: a random forest trained on the base models' validation-split
# predictions. The seed is fixed (same value as the data splits) so that the
# fitted blender, and the score computed from it, is repeatable across runs.
forest_cont = RandomForestClassifier(random_state=42)

forest_cont.fit(X_train_cont, y_train_cont)

In [76]:
# Predictions of the three base models on the test split.
y_pred_tree, y_pred_extra, y_pred_forest = (
    base_model.predict(X_test)
    for base_model in (best_tree, best_extra_tree, best_forest)
)

# Same column layout as the blender's training matrix.
X_test_cont = np.column_stack((y_pred_tree, y_pred_extra, y_pred_forest))

# Accuracy of the blender on the test split.
y_pred_cont = forest_cont.predict(X_test_cont)
accuracy_score(y_true=y_test, y_pred=y_pred_cont)

0.9664