In [18]:
import numpy as np
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from numpy.random import randint

# Load the breast-cancer dataset shipped with scikit-learn, then pull the
# feature matrix and the target vector out of the returned container.
data = datasets.load_breast_cancer()
X, y = data.data, data.target

# Grid Search

For now, we won't split the data into train and test, because we just want to show how GridSearchCV works.

In [19]:
# Hyperparameter grid for the search.
# np.logspace takes base-10 *exponents* for start and stop, so this yields
# 15 log-spaced candidates for C, from 1e-5 up to 1e8.
c_space = np.logspace(start=-5, stop=8, num=15)
param_grid = dict(C=c_space)

In [20]:
# We instantiate the logreg classifier.
# The solver is pinned to 'liblinear' explicitly instead of relying on the library default.
# NOTE(review): presumably chosen because it suits a small dataset like this one - confirm.
logreg = LogisticRegression(solver='liblinear')

In [21]:
# Build the grid-search wrapper: every candidate in param_grid will be scored
# on the logreg estimator using 5-fold cross-validation.
logreg_cv = GridSearchCV(estimator=logreg, param_grid=param_grid, cv=5)

In [22]:
# We fit it to the data: this runs the 5-fold cross-validated search over every
# candidate C in param_grid, on the full dataset (no train/test split was made).
logreg_cv.fit(X, y)

# What are the results?

# Report the winning hyperparameters and their mean cross-validated score,
# one message per line.
print(
    'Tuned Logistic Regression Parameters: {}'.format(logreg_cv.best_params_),
    'Best score is {}'.format(logreg_cv.best_score_),
    sep='\n',
)

Tuned Logistic Regression Parameters: {'C': 100000000.0}
Best score is 0.961335676625659


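Is it overfitted? Note that the selected C (1e8) is the largest value in our grid, i.e. the weakest regularization we tried. As stated before, we don't care about that just now. We just want to see how GridSearchCV works.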

GridSearchCV can be computationally expensive, especially if we are searching over a large hyperparameter space and dealing with multiple hyperparameters.

A solution to this is to use RandomizedSearchCV, in which not all hyperparameter values are tried out. Instead, a fixed number of hyperparameter settings is sampled from specified probability distributions (or from lists of candidate values).

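When both search the same finite set of candidate values, RandomizedSearchCV can never outperform GridSearchCV, because the grid search evaluates every combination while the randomized search evaluates only a sample of them. Instead, it is valuable because it saves on computation time.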

Let's take a look.

# RandomizedSearchCV

We will try it out with a new model: the Decision Tree. Decision trees have many parameters that can be tuned, such as max_features, max_depth and min_samples_leaf. This makes it an ideal use case for RandomizedSearchCV.

We are going to use RandomizedSearchCV to find the optimal hyperparameters.

In [26]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV

In [32]:
# Seed shared by the tree and the search, so that re-running this cell
# reproduces the same sampled candidates and the same fitted trees.
SEED = 42

# Set up the parameters and the values to sample from.
# Every entry is a list of discrete candidates: 2 x 2 x 2 x 2 = 16 combinations in total.
param_dist = {
    'max_depth': [3, None],
    'max_features': [3, 6],
    'min_samples_leaf': [3, 4],
    'criterion': ['gini', 'entropy'],
}

# Instantiate a Decision Tree classifier.
# Because max_features restricts each split to a subset of the features, the
# tree itself is stochastic and needs its own seed.
tree = DecisionTreeClassifier(random_state=SEED)

# Instantiate the RandomizedSearchCV object.
# n_iter is left at its default, so only a random sample of the 16 combinations
# is evaluated; random_state fixes which ones are drawn.
tree_cv = RandomizedSearchCV(tree, param_dist, cv=5, random_state=SEED)

# Fit it to the data (full dataset, 5-fold cross-validation per candidate)
tree_cv.fit(X, y)

# Print the tuned parameters and score
print('Tuned Decision Tree Parameters: {}'.format(tree_cv.best_params_))
print('Best score is {}'.format(tree_cv.best_score_))

Tuned Decision Tree Parameters: {'min_samples_leaf': 4, 'max_features': 6, 'max_depth': None, 'criterion': 'entropy'}
Best score is 0.9490333919156415


