In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# apply seaborn's default theme so later matplotlib figures share one style
sns.set()

In [11]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, mean_squared_error, precision_score

In [12]:
# datasets used
# sports.csv is read relative to the notebook's working directory; the
# "Selecting your final model" section below splits it into features and
# the `win` target.
sports = pd.read_csv('./sports.csv')

# Introduction to hyperparameter tuning

## Creating hyperparameters

In [4]:
# Candidate values for three random-forest hyperparameters; a later cell
# picks one value from each list.
max_depth = [4, 8, 12]
min_samples_split = [2, 5, 10]
max_features = [4, 6, 8, 10]

# A regressor built with no arguments, so get_params() lists every tunable
# hyperparameter together with its default value.
rfr = RandomForestRegressor()
print(rfr.get_params())

{'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'mse', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


## Running a model using ranges

In [6]:
# fill in rfr using the list of hyperparams
# A seeded RandomState makes the draw repeatable on Restart & Run All
# (the global np.random state is unseeded in this notebook). int() turns the
# NumPy integer scalars returned by choice() into plain Python ints so the
# printed parameters read as ordinary numbers.
rng = np.random.RandomState(1111)
rfr = RandomForestRegressor(n_estimators=100,
                            max_depth=int(rng.choice(max_depth)),
                            min_samples_split=int(rng.choice(min_samples_split)),
                            max_features=int(rng.choice(max_features)))

# Print out the parameters
print(rfr.get_params())

{'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'mse', 'max_depth': 12, 'max_features': 4, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


# `RandomizedSearchCV`

## Preparing for RandomizedSearch

In [9]:
# create a dict of param distributions
# (each list is the set of values RandomizedSearchCV may sample from)
param_dist = {'max_depth': [2, 4, 6, 8],
              'max_features': [2, 4, 6, 8, 10],
              'min_samples_split': [2, 4, 8, 16]}

# create a random forest regressor
# (fixed random_state so repeated fits of the same configuration agree)
rfr = RandomForestRegressor(n_estimators=10,
                            random_state=1111)

# create a scorer
# MSE is a loss, so lower is better. make_scorer assumes higher is better by
# default, which would make the search select the *worst* model.
# greater_is_better=False makes the scorer return the negated MSE, so
# maximizing the score minimizes the error.
scorer = make_scorer(mean_squared_error, greater_is_better=False)

## Implementing `RandomizedSearchCV`

In [10]:
# build a random search using the created objs
# n_iter=10 samples ten hyperparameter combinations from param_dist and
# cv=5 evaluates each with 5-fold cross-validation. random_state pins which
# combinations are sampled so the search is reproducible, matching the
# seed used for the estimator.
random_search = RandomizedSearchCV(estimator=rfr,
                                   param_distributions=param_dist,
                                   n_iter=10,
                                   cv=5,
                                   scoring=scorer,
                                   random_state=1111)

# Selecting your final model

In [13]:
# report (n_rows, n_columns), then preview the first rows as the cell output
print(sports.shape)
sports.head()

(958, 28)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,18,19,20,21,22,23,24,25,26,win
0,0,0,1,0,0,1,0,0,1,0,...,0,0,1,0,1,0,0,1,0,1
1,0,0,1,0,0,1,0,0,1,0,...,0,1,0,0,0,1,0,1,0,1
2,0,0,1,0,0,1,0,0,1,0,...,0,1,0,0,1,0,0,0,1,1
3,0,0,1,0,0,1,0,0,1,0,...,0,1,0,1,0,0,1,0,0,1
4,0,0,1,0,0,1,0,0,1,0,...,1,0,0,0,1,0,1,0,0,1


In [14]:
# `win` is the target; every remaining column of `sports` is a feature.
y = sports['win']
X = sports.drop(columns='win')

In [27]:
# create a precision scorer
precision = make_scorer(precision_score) # we want to focus on model precision

# finalize the random search
rfc = RandomForestClassifier(random_state=1111)
param_dist = {'max_depth': range(2, 12, 2),          # 2, 4, 6, 8, 10
              'min_samples_split': range(2, 12, 2),  # 2, 4, 6, 8, 10
              'n_estimators': [10, 25, 50]}

# 10 sampled configurations, each scored by 5-fold cross-validated precision;
# random_state fixes which configurations are sampled.
rs = RandomizedSearchCV(estimator=rfc,
                        param_distributions=param_dist,
                        scoring=precision,
                        cv=5,
                        n_iter=10,
                        random_state=1111)
rs.fit(X, y)

# The search is scored with precision, so label the numbers as precision
# (they are not accuracy). mean_test_score holds one cross-validated mean per
# sampled configuration; best_score_ is the largest of them.
print(f"precision for each run:\n{rs.cv_results_['mean_test_score']}\n")
print(f'best precision: {rs.best_score_:.1%}')

accuracy for each run:
[0.87614978 0.75561877 0.67740077 0.89141614 0.87024051 0.85772772
 0.68244199 0.82867397 0.88717239 0.91980724]

best accuracy:  92.0%
