# Python Programming: Grid Search

## Example

In [None]:
## Example 1
# ---
# Perform hyperparameter tuning then predict the quality of wine using Grid Search. 
# ---
# Dataset url = http://bit.ly/TuningDataset
# ---
# OUR CODE GOES BELOW 

In [None]:
# Importing the required libraries
# ---
#
import pandas as pd
import numpy as np

In [None]:
# Loading the wine dataset
# ---
# The file is semicolon-delimited, so the separator is given explicitly.
dataset = pd.read_csv(
    filepath_or_buffer="http://bit.ly/TuningDataset",
    sep=';',
)

In [None]:
# Peeking at the first rows of the dataset
# ---
# head() shows five rows when no count is given; the count is spelled out here.
dataset.head(n=5)

In [None]:
# Separating the features from the target
# ---
# Positional slicing: the first eleven columns form the feature matrix and
# the twelfth column is the target (presumably the wine quality score —
# confirm against the dataset preview).
X, y = (
    dataset.iloc[:, :11].values,
    dataset.iloc[:, 11].values,
)

In [None]:
# Splitting the data into training and test sets
# ---
# train_test_split shuffles the rows before splitting. Fixing random_state
# makes the split (and every accuracy computed after it) repeatable between runs.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [None]:
# Standardising the features
# ---
# The scaler is fitted on the training split only; the statistics it learned
# are then applied to both the training and the test split.
from sklearn.preprocessing import StandardScaler
feature_scaler = StandardScaler().fit(X_train)
X_train = feature_scaler.transform(X_train)
X_test = feature_scaler.transform(X_test)

In [None]:
# Creating the model to be cross validated
# ---
# A random forest classifier with 300 trees and a fixed seed.
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(
    random_state=0,
    n_estimators=300,
)

In [None]:
# Training and Cross Validation
# ---
# Next, to implement cross validation, the cross_val_score function
# of the sklearn.model_selection module can be used.
# cross_val_score returns the score (accuracy, for this classifier) for each fold.
# Values for 4 parameters are passed to the cross_val_score function.
# The first parameter is estimator, which specifies
# the algorithm that you want to use for cross validation.
# The second and third parameters, X and y, contain the X_train and y_train data, i.e. features and labels.
# Finally, the number of folds is passed to the cv parameter, as shown in the following code.
# ---
# 
from sklearn.model_selection import cross_val_score
all_accuracies = cross_val_score(
    classifier,   # estimator to evaluate
    X_train,      # features
    y=y_train,    # labels
    cv=5,         # number of folds
)

In [None]:
# Printing the accuracies returned for the five folds
# by cross_val_score, followed by their mean
# ---
# The per-fold scores show the spread across folds; the mean is the single
# figure compared against the grid search result later on.
print(all_accuracies)
print(all_accuracies.mean())

In [None]:
# Step 1: Hyperparameters: Getting Started with Grid Search
# ---
# We create a dictionary of all the parameters and their corresponding 
# set of values that you want to test for best performance. 
# The name of the dictionary items corresponds to the parameter name 
# and the value corresponds to the list of values for the parameter.
# The grid_param dictionary below has three parameters: n_estimators, criterion, and bootstrap.
# The parameter values that we want to try out are passed in the list.
# For instance, in the script below we want to find which value of n_estimators
# (out of 100, 300, 500, 800, and 1000) provides the highest accuracy.
# Similarly, we want to find which value results in the
# highest performance for the criterion parameter: "gini" or "entropy"?
# The Grid Search algorithm basically tries all possible combinations
# of parameter values and returns the combination with the highest accuracy.
# For instance, in this case the algorithm will check 20 combinations (5 x 2 x 2 = 20).
# ---
# 
# Candidate values per hyperparameter, keyed by the estimator's argument name.
grid_param = dict(
    n_estimators=[100, 300, 500, 800, 1000],
    criterion=['gini', 'entropy'],
    bootstrap=[True, False],
)

In [None]:
# Step 2: Instantiating GridSearchCV object
# ---
# Once the parameter dictionary is created, the next step 
# is to create an instance of the GridSearchCV class. 
# We need to pass values for the estimator parameter, 
# which basically is the algorithm that you want to execute. 
# The param_grid parameter takes the parameter dictionary 
# that we just created as parameter, the scoring parameter 
# takes the performance metrics, the cv parameter corresponds 
# to number of folds, which is 5 in our case, and finally 
# the n_jobs parameter refers to the number of CPUs that we want to use for execution.
# A value of -1 for the n_jobs parameter means that all available CPUs are used.
# ---
# 
from sklearn.model_selection import GridSearchCV
gd_sr = GridSearchCV(
    classifier,          # estimator whose hyperparameters are searched
    grid_param,          # candidate values for each hyperparameter
    cv=5,                # five folds
    scoring='accuracy',
    n_jobs=-1,
)

In [None]:
# Step 3: Calling the fit method
# ---
# Once the GridSearchCV object is created, we call its fit method
# and pass it the training features and labels, as shown in the following code.
# The method might take quite some time to execute.
# This is the drawback - GridSearchCV will go through all the intermediate 
# combinations of hyperparameters which makes grid search computationally very expensive.
# ---
# 
# Run the search on the training features (X) and labels (y).
gd_sr.fit(X=X_train, y=y_train)

In [None]:
# Step 4: Checking the parameters that return the highest accuracy
# --- 
# To do so, we print the best_params_ attribute of the GridSearchCV object (gd_sr), as shown below:
# ---
# 
# best_params_ is read from the fitted search object and kept in
# best_parameters before being printed.
best_parameters = gd_sr.best_params_
print(best_parameters)

# The result reported for this example was that the highest accuracy is achieved
# when n_estimators is 300, bootstrap is True and criterion is "gini"
# (the split is random, so your output may differ).
# If the chosen n_estimators is the highest value allowed by the grid (1000),
# it would be a good idea to add larger numbers of estimators
# and see if performance increases further.

In [None]:
# Step 5: Finding the obtained accuracy
# ---
# The last and final step of Grid Search algorithm is 
# to find the accuracy obtained using the best parameters. 
# Previously we had a mean accuracy of 64.22%.
# To find the best accuracy achieved, we execute the following code:
# ---
# 
# best_score_ is read from the fitted search object and kept in
# best_result before being printed.
best_result = gd_sr.best_score_
print(best_result)

# The accuracy achieved is: 0.6505 or 65.05% which is only slightly better than 64.22%.
# To improve this further, it would be good to test values for other parameters 
# of Random Forest algorithm, such as max_features, max_depth, max_leaf_nodes, etc. 
# to see if the accuracy further improves or not.

## <font color="green">Challenges</font>

In [None]:
## Challenge 1
# ---
# Question: Create a model that classifies the incomes of persons using the
# following dataset, then tune its hyperparameters.
# ---
# Dataset url = http://bit.ly/HyperParameterTuningDataset
# ---
# OUR CODE GOES BELOW
#