# Setup

In [125]:
import sys
import os
import numpy as np
import pandas as pd
from numpy.typing import NDArray
from sklearn.model_selection import cross_val_score

RANDOM_SEED = 42

sys.path.append(os.path.abspath("."))
np.random.seed(RANDOM_SEED)

## Train + Fine Tune Decision Tree for Moons Dataset

In [126]:
from sklearn.datasets import make_moons

X_moons, y_moons = make_moons(n_samples=10_000, noise=0.4, random_state=RANDOM_SEED)

In [127]:
X_moons

array([[ 0.9402914 ,  0.12230559],
       [ 0.12454026, -0.42477546],
       [ 0.26198823,  0.50841438],
       ...,
       [-0.24177973,  0.20957199],
       [ 0.90679645,  0.54958215],
       [ 2.08837082, -0.05050728]])

In [128]:
y_moons

array([1, 0, 0, ..., 1, 0, 1])

In [129]:
from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split(X_moons, y_moons, test_size=0.2, random_state=RANDOM_SEED)

In [130]:
from sklearn.tree import DecisionTreeClassifier

# for Decision Trees, no scaling is needed!
dt_classifier = DecisionTreeClassifier(random_state=RANDOM_SEED)

Which parameters are available?

In [131]:
pd.DataFrame(dt_classifier.get_params().keys())

Unnamed: 0,0
0,ccp_alpha
1,class_weight
2,criterion
3,max_depth
4,max_features
5,max_leaf_nodes
6,min_impurity_decrease
7,min_samples_leaf
8,min_samples_split
9,min_weight_fraction_leaf


Find a good hyperparameter setup using `GridSearchCV`

In [132]:
from sklearn.model_selection import GridSearchCV


param_grid = [
    {
        'max_depth': range(1, 8 + 1),
        #'max_features': [1, 2],
        'max_leaf_nodes': range(2, 100 + 1),
        'min_samples_split': [2, 3, 4]
    }
]

grid_search = GridSearchCV(
    dt_classifier,
    param_grid,
    cv=5,
    #scoring="neg_root_mean_squared_error"
    verbose=2
)


grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 2376 candidates, totalling 11880 fits
[CV] END .max_depth=1, max_leaf_nodes=2, min_samples_split=2; total time=   0.0s
[CV] END .max_depth=1, max_leaf_nodes=2, min_samples_split=2; total time=   0.0s
[CV] END .max_depth=1, max_leaf_nodes=2, min_samples_split=2; total time=   0.0s
[CV] END .max_depth=1, max_leaf_nodes=2, min_samples_split=2; total time=   0.0s
[CV] END .max_depth=1, max_leaf_nodes=2, min_samples_split=2; total time=   0.0s
[CV] END .max_depth=1, max_leaf_nodes=2, min_samples_split=3; total time=   0.0s
[CV] END .max_depth=1, max_leaf_nodes=2, min_samples_split=3; total time=   0.0s
[CV] END .max_depth=1, max_leaf_nodes=2, min_samples_split=3; total time=   0.0s
[CV] END .max_depth=1, max_leaf_nodes=2, min_samples_split=3; total time=   0.0s
[CV] END .max_depth=1, max_leaf_nodes=2, min_samples_split=3; total time=   0.0s
[CV] END .max_depth=1, max_leaf_nodes=2, min_samples_split=4; total time=   0.0s
[CV] END .max_depth=1, max_leaf_nodes=2, mi

In [133]:
grid_search.best_score_

0.85925

In [134]:
grid_search.best_params_

{'max_depth': 7, 'max_leaf_nodes': 23, 'min_samples_split': 2}

In [135]:
grid_search.best_estimator_

dt_best = grid_search.best_estimator_
dt_best

### Accuracy?

In [136]:
cross_val_score(dt_best, X_train, y_train, cv=5, scoring="accuracy").mean()

0.85925

In [137]:
from sklearn.dummy import DummyClassifier

dummy_clf = DummyClassifier()
dummy_clf.fit(X_train, y_train)

cross_val_score(dummy_clf, X_train, y_train, cv=5, scoring="accuracy").mean()

0.501625

### Confusion Matrix?

In [138]:
from sklearn.calibration import cross_val_predict
from sklearn.metrics import confusion_matrix

y_train_pred = cross_val_predict(dt_best, X_train, y_train, cv=3)
cm = confusion_matrix(y_train, y_train_pred)
cm

array([[3400,  587],
       [ 577, 3436]])

### Percision and Recall?

In [139]:
from sklearn.metrics import precision_score, recall_score

precision_score(y_train, y_train_pred)

0.8540889883171763

In [140]:
recall_score(y_train, y_train_pred)

0.8562172937951658

In [141]:
from sklearn.metrics import f1_score

f1_score(y_train, y_train_pred)

0.8551518168242908

### Check against Test Set

In [142]:
y_test_pred = dt_best.predict(X_test)

In [143]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_test_pred)

0.8735

In [144]:
cm_test = confusion_matrix(y_test, y_test_pred)
cm_test

array([[875, 138],
       [115, 872]])

In [145]:
precision_score(y_test, y_test_pred)

0.8633663366336634

In [146]:
recall_score(y_test, y_test_pred)

0.8834853090172239

In [147]:
f1_score(y_test, y_test_pred)

0.8733099649474211

In [148]:
f1_score(y_train, y_train_pred) - f1_score(y_test, y_test_pred)

-0.018158148123130324

# Continuation 

In [149]:
from sklearn.model_selection import ShuffleSplit

N_TREES = 1_000
N_INSTANCES = 100

mini_sets: list[tuple[NDArray[np.float64], NDArray[np.float64]]] = []

rs = ShuffleSplit(n_splits=N_TREES, test_size=len(X_train) - N_INSTANCES, random_state=RANDOM_SEED)

for i, (train_index, test_index) in enumerate(rs.split(X_train)):
    X_mini_train = X_train[train_index]
    y_mini_train = y_train[train_index]

    mini_sets.append((X_mini_train, y_mini_train))


In [150]:
from sklearn import clone

accuracy_scores: list[float] = []
forest: list[DecisionTreeClassifier] = []

for i, (X_mini_train, y_mini_train) in enumerate(mini_sets):
    estimator: DecisionTreeClassifier = clone(dt_best)
    estimator.fit(X_mini_train, y_mini_train)
    forest.append(estimator)

    y_test_pred_mini = estimator.predict(X_test)
    mini_accuracy_score = accuracy_score(y_test, y_test_pred_mini)
    accuracy_scores.append(mini_accuracy_score)

    print(f"Run #{i + 1}: accuracy = {mini_accuracy_score}")

np.mean(accuracy_scores)


Run #1: accuracy = 0.8005
Run #2: accuracy = 0.836
Run #3: accuracy = 0.8
Run #4: accuracy = 0.8295
Run #5: accuracy = 0.7935
Run #6: accuracy = 0.8315
Run #7: accuracy = 0.798
Run #8: accuracy = 0.8015
Run #9: accuracy = 0.7665
Run #10: accuracy = 0.829
Run #11: accuracy = 0.794
Run #12: accuracy = 0.78
Run #13: accuracy = 0.7895
Run #14: accuracy = 0.808
Run #15: accuracy = 0.804
Run #16: accuracy = 0.846
Run #17: accuracy = 0.818
Run #18: accuracy = 0.811
Run #19: accuracy = 0.8175
Run #20: accuracy = 0.826
Run #21: accuracy = 0.797
Run #22: accuracy = 0.82
Run #23: accuracy = 0.7855
Run #24: accuracy = 0.819
Run #25: accuracy = 0.818
Run #26: accuracy = 0.7945
Run #27: accuracy = 0.843
Run #28: accuracy = 0.85
Run #29: accuracy = 0.8155
Run #30: accuracy = 0.811
Run #31: accuracy = 0.7845
Run #32: accuracy = 0.8355
Run #33: accuracy = 0.815
Run #34: accuracy = 0.77
Run #35: accuracy = 0.7785
Run #36: accuracy = 0.794
Run #37: accuracy = 0.792
Run #38: accuracy = 0.761
Run #39: accu

0.8017464999999999

In [151]:
len(X_test)

2000

In [152]:
from scipy.stats import mode

def forest_result(X_test: NDArray[np.float64], forest: list[DecisionTreeClassifier]) -> NDArray[np.int32]:
    Y_pred = np.empty([N_TREES, len(X_test)], dtype=np.uint8)

    for tree_index, tree in enumerate(forest):
        Y_pred[tree_index] = tree.predict(X_test)

    y_pred_majority_votes, n_votes = mode(Y_pred, axis=0)

    return y_pred_majority_votes


y_pred_majority_votes = forest_result(X_test, forest)
y_pred_majority_votes

array([1, 1, 0, ..., 0, 0, 0])

In [153]:
accuracy_score(y_test, y_pred_majority_votes)

0.8735