In [40]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

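Here we'll practice hyperparameter tuning on a decision tree model: we first fit a tree with hand-picked hyperparameters, then tune it with grid search and with randomized search and compare the results on a held-out test set.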

## Data Creation

In [4]:
#generate a synthetic two-class dataset with scikit-learn's make_classification function
from sklearn.datasets import make_classification

#10 features in total, of which 5 are informative and 2 are redundant;
#flip_y adds label noise and random_state makes the generated data reproducible
X, y = make_classification(
    n_samples=5000,
    n_features=10,
    n_informative=5,
    n_redundant=2,
    n_clusters_per_class=2,
    class_sep=1.1,
    flip_y=0.09,
    random_state=4184,
)

In [5]:
X.shape
#5000 instances (rows), 10 features (columns)

(5000, 10)

In [7]:
#hold out 20% of the rows as a test set; the remaining 80% are used for training
from sklearn.model_selection import train_test_split

#random_state fixes the shuffle so the same rows land in each split on every run
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=42, test_size=0.2)

In [8]:
#check X_train shape - should be 4000 instances (test_size=0.2 leaves 80% of the 5000 rows for training)
X_train.shape

(4000, 10)

In [20]:
y_train.shape
#here are our training labels: a 1-D array with one label per training instance

(4000,)

## Creating Basic Decision Tree Classifier

In [11]:
#bring in scikit-learn's decision tree algorithm
from sklearn.tree import DecisionTreeClassifier

#build the classifier with a few hand-picked (not yet tuned) hyperparameter values
clf = DecisionTreeClassifier(
    criterion="entropy",
    max_depth=5,
    min_samples_split=10,
    random_state=42,
)

In [12]:
clf
#the printed representation below lists ALL the hyperparameter values for the ML algorithm
#(library defaults + the ones we've manually specified above)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=5, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=10,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=42, splitter='best')

In [13]:
#fit (train) the decision tree classifier on the training split only - the test split is kept aside for evaluation
clf.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=5, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=10,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=42, splitter='best')

In [15]:
#let's create predictions with our fitted model for the held-out test set
y_pred = clf.predict(X_test)

In [16]:
y_pred #the predicted class labels (0 or 1), one per test instance

array([0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0,
       1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0,
       0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1,

In [24]:
#let's look at our model's performance
from sklearn.metrics import confusion_matrix, classification_report, f1_score

#confusion matrix: rows are the true classes, columns are the predicted classes (both in the order 0, 1)
print(confusion_matrix(y_test, y_pred))

[[432  63]
 [189 316]]


In [25]:
#per-class precision / recall / F1 plus overall accuracy
print(classification_report(y_test, y_pred))

#observations: we are currently getting 75% accuracy; better performance predicting 0's than 1's
#(recall is 0.87 for class 0 but only 0.63 for class 1)

#also get the F1 score on its own...71% (this matches the class 1 row of the report)
print("F1 score = {:.2f}".format(f1_score(y_test, y_pred)))

              precision    recall  f1-score   support

           0       0.70      0.87      0.77       495
           1       0.83      0.63      0.71       505

    accuracy                           0.75      1000
   macro avg       0.76      0.75      0.74      1000
weighted avg       0.77      0.75      0.74      1000

F1 score = 0.71


## Using GridSearch to Tune

In [21]:
#import gridsearch
from sklearn.model_selection import GridSearchCV

In [22]:
#baseline classifier that the searches below will tune:
#only fixed settings are given here - none of the hyperparameters we set by hand earlier
#(e.g., min_samples_split, max_depth) are specified, since the search supplies them
base_clf = DecisionTreeClassifier(
    random_state=42,
    splitter="best",
    class_weight=None,
)

In [29]:
#MOST IMPORTANT PART: the candidate values grid search will try for each hyperparameter of interest
#(every combination is evaluated: 2 * 5 * 5 * 3 * 3 * 3 = 1350 candidates)
parameters_grid = dict(
    #criterion: the metric the decision tree uses to determine splits
    criterion=('gini', 'entropy'),
    #max_depth: how many levels the tree is allowed to grow (can control overfitting)
    max_depth=[2, 10, 50, 100, 200],
    #max_leaf_nodes: how many leaf nodes are allowed to be created (can control overfitting)
    max_leaf_nodes=[None, 5, 10, 50, 100],
    #min_impurity_decrease: controls when the algorithm is allowed to split a node
    min_impurity_decrease=[0, 0.1, 0.2],
    #min_samples_leaf: how many instances need to be in a leaf
    min_samples_leaf=[1, 10, 50],
    #min_samples_split: how many instances need to be in a node before it can be split again
    min_samples_split=[2, 10, 50],
)

In [32]:
#apply grid search to our baseline model
    #note that we are using F1 for scoring and a single round of 5-fold cross-validation (cv=5);
    #n_jobs=3 only sets how many candidates are fitted in parallel - it does NOT repeat the cross-validation
clf_gs = GridSearchCV(base_clf, param_grid=parameters_grid, scoring="f1", n_jobs=3, cv=5, return_train_score=True)

In [34]:
#fit the grid search on our training data: every combination in parameters_grid is scored with 5-fold cross-validation
%time clf_gs.fit(X_train, y_train) #%time command shows the amount of time that was required to run this line of code

Wall time: 34.7 s


GridSearchCV(cv=5, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=42,
                                              splitter='best'),
             iid='deprecated', n_jobs=3,
             param_grid={'criterion': ('gini', 'entropy'),
                        

In [35]:
#KEY: now we want to see what combination of hyperparameters resulted in the best performance
#this shows us the best parameters (the combination with the best cross-validated F1)
clf_gs.best_params_
#observations: looks like the tree wants to get pretty big (max_depth = 50 and max_leaf_nodes = 100)
#but note it still preferred a leaf cap of 100 over no cap (None) and min_samples_leaf = 10 over 1,
#so a bit of regularization helped

{'criterion': 'entropy',
 'max_depth': 50,
 'max_leaf_nodes': 100,
 'min_impurity_decrease': 0,
 'min_samples_leaf': 10,
 'min_samples_split': 2}

In [36]:
#now let's use this tuned model to create predictions on our test set
#(calling predict on the search object predicts with the best estimator it found)
y_pred_gs = clf_gs.predict(X_test)

In [37]:
y_pred_gs #the predicted class labels from the grid-search-tuned tree

array([0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1,
       0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1,
       0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1,
       0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1,
       1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1,

In [39]:
#evaluate our performance (same three metrics we used for the hand-picked tree)
print(confusion_matrix(y_test, y_pred_gs))
print(classification_report(y_test, y_pred_gs))
print("F1 score = {:.2f}".format(f1_score(y_test, y_pred_gs)))

#observations: Accuracy has increased from 75% to 80% and F1-score has increased from 71% to 80%!
#also appears to have balanced out performance between predicting the two classes
#(recall is now 0.82 for class 0 and 0.78 for class 1, versus 0.87 and 0.63 before tuning)

[[405  90]
 [111 394]]
              precision    recall  f1-score   support

           0       0.78      0.82      0.80       495
           1       0.81      0.78      0.80       505

    accuracy                           0.80      1000
   macro avg       0.80      0.80      0.80      1000
weighted avg       0.80      0.80      0.80      1000

F1 score = 0.80


## Using RandomSearch to Tune

In [43]:
#import randomsearch
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

In [65]:
#MOST IMPORTANT PART: Here we specify what each hyperparameter of interest is sampled from
#(a list is sampled uniformly; a scipy.stats distribution is sampled via its rvs method)
parameters_dist = {'criterion': ('gini', 'gini', 'entropy'), #listing gini twice makes it twice as likely to be selected (personal preference)
                 'max_depth': randint(2,200), #random integer from 2 to 199 (scipy's randint excludes the upper bound)
                 'max_features': [None, "sqrt"], #"sqrt" is what "auto" meant for decision trees; "auto" is deprecated and removed in newer scikit-learn
                 'max_leaf_nodes': randint(5,500), #random integer from 5 to 499
                 'min_impurity_decrease': uniform(0.0, 0.5), #uniform(loc, scale): random float between 0.0 and 0.5
                 'min_samples_leaf': randint(2,50), #random integer from 2 to 49
                 'min_samples_split': randint(2,50)} #random integer from 2 to 49

In [66]:
#apply random search to our baseline model
    #note that we are using F1 for scoring and a single round of 5-fold cross-validation (cv=5);
    #n_jobs=3 only sets how many candidates are fitted in parallel - it does NOT repeat the cross-validation
clf_rs = RandomizedSearchCV(base_clf, param_distributions=parameters_dist, scoring="f1", n_jobs=3, cv=5,
                            return_train_score=True, n_iter=5000, random_state=42)
    #n_iter controls how many random combinations our random search will try
    #random_state fixes which combinations get sampled, so rerunning the notebook repeats the same search
    #(use a different seed, or drop the argument, to see how much the outcome varies between runs)

In [67]:
#fit the random search on our training data: each sampled combination is scored with 5-fold cross-validation
%time clf_rs.fit(X_train, y_train) #%time command shows the amount of time that was required to run this line of code

Wall time: 1min 11s


RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features=None,
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    presort='deprecated',
                                                    random_state=42,
            

In [68]:
#KEY: now we want to see what combination of hyperparameters resulted in the best performance
#this shows us the best parameters (the sampled combination with the best cross-validated F1)
clf_rs.best_params_
#observations: notice the selected values are much more fine-grained (not limited to the nice round numbers we provided in gridsearch)

{'criterion': 'entropy',
 'max_depth': 104,
 'max_features': None,
 'max_leaf_nodes': 221,
 'min_impurity_decrease': 0.0013991318543813414,
 'min_samples_leaf': 4,
 'min_samples_split': 7}

In [69]:
#now let's use this tuned model to create predictions on our test set
#(calling predict on the search object predicts with the best estimator it found)
y_pred_rs = clf_rs.predict(X_test)

In [70]:
y_pred_rs #the predicted class labels from the random-search-tuned tree

array([0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1,
       0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1,
       1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1,

In [71]:
#evaluate our performance (same three metrics we used for the earlier models)
print(confusion_matrix(y_test, y_pred_rs))
print(classification_report(y_test, y_pred_rs))
print("F1 score = {:.2f}".format(f1_score(y_test, y_pred_rs)))

#observations: Accuracy has increased from 75% to 79% and F1-score has increased from 71% to 78%!
#also appears to have balanced out performance between predicting the two classes
#overall - appears as though grid search resulted in better accuracy in this specific scenario
#(80% vs 79% accuracy and 0.80 vs 0.78 F1 - a small gap measured on a single 1000-row test set)
#(note that results from random search are subject to change...due to its randomness)

[[407  88]
 [124 381]]
              precision    recall  f1-score   support

           0       0.77      0.82      0.79       495
           1       0.81      0.75      0.78       505

    accuracy                           0.79      1000
   macro avg       0.79      0.79      0.79      1000
weighted avg       0.79      0.79      0.79      1000

F1 score = 0.78
