# Decision Trees Exercises

Ex. 7
Train Decision Tree model to fit the moon data.

In [29]:
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split

X, y = make_moons(n_samples=10000, noise = 0.4, random_state=42)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [30]:
from sklearn.metrics import get_scorer_names
get_scorer_names()

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'matthews_corrcoef',
 'max_error',
 'mutual_info_score',
 'neg_brier_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_absolute_percentage_error',
 'neg_mean_gamma_deviance',
 'neg_mean_poisson_deviance',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'neg_root_mean_squared_error',
 'normalized_mutual_info_score',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'rand_score',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',
 'roc_auc',
 'roc_auc_ovo',
 'roc_auc_ovo_weighted',
 'roc_auc_

In [31]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

params = {
            "max_depth": [2, 3, 5, 10, 15, 25, 100],
            "min_samples_leaf": [1, 5],
            "min_samples_split": [2, 3, 5, 10],
            "max_leaf_nodes": [2, 5, 10, 15, 20],
            "random_state": [8, 42]
        }    

tree_clf = DecisionTreeClassifier()
grid_search = GridSearchCV(tree_clf, params, cv=5, verbose=1)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 560 candidates, totalling 2800 fits


In [32]:
import numpy as np
from sklearn.metrics import accuracy_score


print(grid_search.best_params_)
print(grid_search.best_score_)

best_tree_clf = grid_search.best_estimator_

y_pred = best_tree_clf.predict(X_test)
accuracy_score(y_test, y_pred)

{'max_depth': 10, 'max_leaf_nodes': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'random_state': 8}
0.858625


0.87

Ex. 8

In [33]:
from sklearn.model_selection import ShuffleSplit

rs = ShuffleSplit(n_splits=1000, train_size=100, random_state=42)

preds = []

for train_index, _ in rs.split(X_train):
    X_new = X_train[train_index]
    y_new = y_train[train_index]

    tree_clf = DecisionTreeClassifier(**grid_search.best_params_)
    tree_clf.fit(X_new, y_new)
    
    y_pred = best_tree_clf.predict(X_test)
    preds.append(y_pred)
    
predictions = np.asarray(preds)

In [34]:
from scipy.stats import mode

y_pred = mode(predictions, axis=0, keepdims=False)[0]

In [35]:
accuracy_score(y_pred=y_pred, y_true=y_test)

0.87