In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [2]:
import os
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score
from sklearn.neighbors import KNeighborsClassifier

In [4]:
pd.options.display.max_rows = None
pd.options.display.max_columns = None

In [3]:
credit = pd.read_csv('./uci_credit_card.csv')

In [6]:
X = credit.loc[:, 'LIMIT_BAL':'PAY_AMT6']
X = pd.get_dummies(X, columns=['SEX', 'EDUCATION', 'MARRIAGE'], drop_first=True)
y = credit['default.payment.next.month']

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.3,
                                                    stratify=y,
                                                    random_state=1111)

# Introducing Grid Search

## Build Grid Search functions

In [8]:
# Create the function
def gbm_grid_search(learn_rate, max_depth):

    # Create the model
    model = GradientBoostingClassifier(learning_rate=learn_rate, max_depth=max_depth)
    
    # Use the model to make predictions
    predictions = model.fit(X_train, y_train).predict(X_test)
    
    # Return the hyperparameters and score
    return([learn_rate, max_depth, accuracy_score(y_test, predictions)])

In [9]:
# Create the relevant lists
results_list = []
learn_rate_list = [0.01, 0.1, 0.5]
max_depth_list = [2, 4, 6]

# Create the for loop
for learn_rate in learn_rate_list:
    for max_depth in max_depth_list:
        results_list.append(gbm_grid_search(learn_rate, max_depth))

# Print the results
print(results_list)   

[[0.01, 2, 0.819], [0.01, 4, 0.8183333333333334], [0.01, 6, 0.8153333333333334], [0.1, 2, 0.823], [0.1, 4, 0.8227777777777778], [0.1, 6, 0.8201111111111111], [0.5, 2, 0.8211111111111111], [0.5, 4, 0.8086666666666666], [0.5, 6, 0.7922222222222223]]


# Grid Search with `sklearn`

In [14]:
# Create a Random Forest Classifier with specified criterion
rf_class = RandomForestClassifier(criterion='entropy')

# Create the parameter grid
param_grid = {'max_depth': [2, 4, 8, 15], 
              'max_features': ['auto', 'sqrt']} 

# Create a GridSearchCV object
grid_rf_class = GridSearchCV(estimator=rf_class,
                             param_grid=param_grid,
                             scoring='roc_auc',
                             n_jobs=-1,
                             cv=5,
                             refit=True, 
                             return_train_score=True)
print(grid_rf_class)

GridSearchCV(cv=5, estimator=RandomForestClassifier(criterion='entropy'),
             n_jobs=-1,
             param_grid={'max_depth': [2, 4, 8, 15],
                         'max_features': ['auto', 'sqrt']},
             return_train_score=True, scoring='roc_auc')


# Understanding a Grid Search output

## Exploring the grid search results

In [15]:
grid_rf_class.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(criterion='entropy'),
             n_jobs=-1,
             param_grid={'max_depth': [2, 4, 8, 15],
                         'max_features': ['auto', 'sqrt']},
             return_train_score=True, scoring='roc_auc')

In [17]:
# Read the cv_results property into a dataframe & print it out
cv_results_df = pd.DataFrame(grid_rf_class.cv_results_)

# Extract and print the column with a dictionary of hyperparameters used
column = cv_results_df.loc[:, ['params']]
print(column)

# Extract and print the row that had the best mean test score
best_row = cv_results_df[cv_results_df['rank_test_score'] == 1 ]
print(best_row['mean_test_score'])

                                      params
0   {'max_depth': 2, 'max_features': 'auto'}
1   {'max_depth': 2, 'max_features': 'sqrt'}
2   {'max_depth': 4, 'max_features': 'auto'}
3   {'max_depth': 4, 'max_features': 'sqrt'}
4   {'max_depth': 8, 'max_features': 'auto'}
5   {'max_depth': 8, 'max_features': 'sqrt'}
6  {'max_depth': 15, 'max_features': 'auto'}
7  {'max_depth': 15, 'max_features': 'sqrt'}
7    0.778559
Name: mean_test_score, dtype: float64


## Analyzing the best results

In [22]:
# Print out the ROC_AUC score from the best-performing square
best_score = grid_rf_class.best_score_
print(f'Best score: {best_score}')

# Create a variable from the row related to the best-performing square
cv_results_df = pd.DataFrame(grid_rf_class.cv_results_)
best_row = cv_results_df.loc[cv_results_df['rank_test_score']==1, :]

# Get the max_depth parameter from the best-performing square and print
best_n_estimators = grid_rf_class.best_params_["max_depth"]
print(f'Max depth of best estimator: {best_n_estimators}')

Best score: 0.7785585497237524
Max depth of best estimator: 15


## Using the best results

In [24]:
# See what type of object the best_estimator_ property is
print(type(grid_rf_class.best_estimator_))

# Create an array of predictions directly using the best_estimator_ property
predictions = grid_rf_class.best_estimator_.predict(X_test)

# Take a look to confirm it worked, this should be an array of 1's and 0's
print(predictions[0:5])

# Now create a confusion matrix 
print("Confusion Matrix \n", confusion_matrix(y_test, predictions))

# Get the ROC-AUC score
predictions_proba = grid_rf_class.best_estimator_.predict_proba(X_test)[:,1]
print("ROC-AUC Score \n", roc_auc_score(y_test, predictions_proba))

<class 'sklearn.ensemble._forest.RandomForestClassifier'>
[0 0 0 0 0]
Confusion Matrix 
 [[6670  339]
 [1272  719]]
ROC-AUC Score 
 0.7811400051838351
