In [None]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

In [None]:
# Load scikit-learn's bundled digits dataset and pull out the feature
# matrix (X) and the label vector (y).
digits = datasets.load_digits()
X, y = digits.data, digits.target

In [None]:
# Hold out 30% of the samples for testing; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

Random Forest Classifier

In [None]:
# Baseline: a random forest with default hyperparameters.
# random_state is pinned so the baseline score is reproducible across kernel
# restarts. The hyperparameter searches further down receive this estimator
# and work on clones of it, so they inherit the same seed.
randomForest = RandomForestClassifier(random_state=42)
randomForest.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [None]:
# Score the baseline forest on the held-out test set.
y_pred = randomForest.predict(X_test)

# Weighted F1: per-class F1 averaged by class support.
F1_score = f1_score(y_true=y_test, y_pred=y_pred, average='weighted')

# Shown as a percentage with two decimal places.
print("F1 Score: ", '%.2f' % (100 * F1_score), "%")

F1 Score:  97.59 %


GridSearchCV on RandomForestClassifier

In [None]:
# Exhaustive search: every combination below (2 * 3 * 5 * 8 = 240 candidates)
# is evaluated with 5-fold cross-validation on the training set.
grid_params = dict(
    bootstrap=[True, False],
    max_depth=[1, 2, 3],
    n_estimators=[10, 50, 100, 200, 500],
    max_features=[1, 2, 3, 4, 5, 6, 7, 8],
)

# No `scoring` is given, so candidates are ranked with the estimator's own
# score method.
gs = GridSearchCV(estimator=randomForest, param_grid=grid_params, cv=5)

# fit() returns the fitted search object itself.
gs_results = gs.fit(X_train, y_train)

In [None]:
# Mean cross-validated score of the best parameter combination.
gs_results.best_score_

0.9029469423891735

In [None]:
# Estimator configured with the best parameters found by the search.
gs_results.best_estimator_

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=3, max_features=3,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [None]:
# Parameter setting that achieved the best cross-validated score.
gs_results.best_params_

{'bootstrap': True, 'max_depth': 3, 'max_features': 3, 'n_estimators': 500}

In [None]:
# Retrain a fresh forest with the parameters the grid search selected.
# Reading them from best_params_ (rather than copying the values by hand)
# keeps this cell in sync with the search when a re-run picks a different
# winner. Requires the random-forest grid-search cell to have been run last,
# because `gs_results` is reused for other searches later in the notebook.
randomForest_f1 = RandomForestClassifier(**gs_results.best_params_)
randomForest_f1.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=3, max_features=3,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [None]:
# Predict labels for the held-out test set with the retrained forest.
y_pred = randomForest_f1.predict(X_test)

In [None]:
# Weighted F1: per-class F1 averaged by class support.
F1_score = f1_score(y_test, y_pred, average='weighted')

In [None]:
# Report the score as a percentage with two decimal places.
print("F1 Score: ", '%.2f'% (F1_score*100),"%")

F1 Score:  91.00 %


XGBClassifier

In [None]:
# NOTE(review): this import sits mid-notebook; the other imports are in the
# first cell.
from xgboost import XGBClassifier

# Baseline XGBoost classifier with default hyperparameters.
# NOTE(review): the variable shares its name with the `xgboost` package; a
# later `import xgboost` would rebind it to the module.
xgboost = XGBClassifier()
xgboost.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [None]:
# Score the baseline XGBoost model on the held-out test set.
y_pred = xgboost.predict(X_test)

# Weighted F1: per-class F1 averaged by class support.
F1_score = f1_score(y_true=y_test, y_pred=y_pred, average='weighted')

# Shown as a percentage with two decimal places.
print("F1 Score: ", '%.2f' % (100 * F1_score), "%")

F1 Score:  96.49 %


GridSearchCV on XGBClassifier

In [None]:
# Exhaustive search: every combination below (3 * 3 * 5 = 45 candidates) is
# evaluated with 5-fold cross-validation on the training set.
grid_params = dict(
    learning_rate=[0.1, 0.2, 1],
    max_depth=[1, 2, 3],
    n_estimators=[10, 50, 100, 200, 500],
)

# No `scoring` is given, so candidates are ranked with the estimator's own
# score method.
gs = GridSearchCV(estimator=xgboost, param_grid=grid_params, cv=5)

# fit() returns the fitted search object itself.
gs_results = gs.fit(X_train, y_train)

In [None]:
# Mean cross-validated score of the best parameter combination.
gs_results.best_score_

0.9546638841459558

In [None]:
# Estimator configured with the best parameters found by the search.
gs_results.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=2,
              min_child_weight=1, missing=None, n_estimators=500, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [None]:
# Parameter setting that achieved the best cross-validated score.
gs_results.best_params_

{'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 500}

In [None]:
# Retrain a fresh XGBoost model with the parameters the grid search selected.
# Reading them from best_params_ (rather than copying the values by hand)
# keeps this cell in sync with the search if its outcome changes on a re-run.
# Requires the XGBoost grid-search cell to have been run last, because
# `gs_results` is shared with the random-forest grid search.
xgboost_f1 = XGBClassifier(**gs_results.best_params_)
xgboost_f1.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=2,
              min_child_weight=1, missing=None, n_estimators=500, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [None]:
# Predict labels for the held-out test set with the retrained XGBoost model.
y_pred = xgboost_f1.predict(X_test)

In [None]:
# Weighted F1: per-class F1 averaged by class support.
F1_score = f1_score(y_test, y_pred, average='weighted')

In [None]:
# Report the score as a percentage with two decimal places.
print("F1 Score: ", '%.2f'% (F1_score*100),"%")

F1 Score:  97.41 %


RandomizedSearchCV on RandomForestClassifier

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Same candidate values as the random-forest grid search (240 combinations).
randomized_params = dict(
    bootstrap=[True, False],
    max_depth=[1, 2, 3],
    n_estimators=[10, 50, 100, 200, 500],
    max_features=[1, 2, 3, 4, 5, 6, 7, 8],
)

# Only n_iter=25 parameter settings are sampled and cross-validated;
# random_state fixes which ones are drawn.
rs = RandomizedSearchCV(
    estimator=randomForest,
    param_distributions=randomized_params,
    n_iter=25,
    scoring='accuracy',
    cv=5,
    random_state=10,
)

# fit() returns the fitted search object itself.
rs_results = rs.fit(X_train, y_train)

In [None]:
# Mean cross-validated accuracy of the best sampled parameter setting.
rs_results.best_score_

0.8926136722949473

In [None]:
# Estimator configured with the best parameters found by the search.
rs_results.best_estimator_

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=3, max_features=4,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [None]:
# Sampled parameter setting that achieved the best cross-validated accuracy.
rs_results.best_params_

{'bootstrap': True, 'max_depth': 3, 'max_features': 4, 'n_estimators': 200}

In [None]:
# Retrain a fresh forest with the parameters the randomized search selected.
# Reading them from best_params_ (rather than copying the values by hand)
# keeps this cell in sync with the search when a re-run picks a different
# winner. Requires the random-forest randomized-search cell to have been run
# last, because `rs_results` is reused for the XGBoost search later on.
randomForest_f1 = RandomForestClassifier(**rs_results.best_params_)
randomForest_f1.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=3, max_features=4,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [None]:
# Predict labels for the held-out test set with the retrained forest.
y_pred = randomForest_f1.predict(X_test)

In [None]:
# Weighted F1: per-class F1 averaged by class support.
F1_score = f1_score(y_true=y_test, y_pred=y_pred, average='weighted')

# Shown as a percentage with two decimal places.
print("F1 Score: ", '%.2f' % (100 * F1_score), "%")

F1 Score:  90.47 %


RandomizedSearchCV on XGBClassifier

In [None]:
# Same candidate values as the XGBoost grid search (45 combinations).
randomized_params = dict(
    learning_rate=[0.1, 0.2, 1],
    max_depth=[1, 2, 3],
    n_estimators=[10, 50, 100, 200, 500],
)

# Only n_iter=25 parameter settings are sampled and cross-validated;
# random_state fixes which ones are drawn.
rs = RandomizedSearchCV(
    estimator=xgboost,
    param_distributions=randomized_params,
    n_iter=25,
    scoring='accuracy',
    cv=5,
    random_state=10,
)

# fit() returns the fitted search object itself.
rs_results = rs.fit(X_train, y_train)

In [None]:
# Mean cross-validated accuracy of the best sampled parameter setting.
rs_results.best_score_

0.9538607474862456

In [None]:
# Estimator configured with the best parameters found by the search.
rs_results.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.2, max_delta_step=0, max_depth=2,
              min_child_weight=1, missing=None, n_estimators=200, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [None]:
# Sampled parameter setting that achieved the best cross-validated accuracy.
rs_results.best_params_

{'learning_rate': 0.2, 'max_depth': 2, 'n_estimators': 200}

In [None]:
# Retrain a fresh XGBoost model with the parameters the randomized search
# selected. Reading them from best_params_ (rather than copying the values by
# hand) keeps this cell in sync with the search if its outcome changes.
# Requires the XGBoost randomized-search cell to have been run last, because
# `rs_results` is shared with the random-forest randomized search.
xgboost_f1 = XGBClassifier(**rs_results.best_params_)
xgboost_f1.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.2, max_delta_step=0, max_depth=2,
              min_child_weight=1, missing=None, n_estimators=200, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [None]:
# Predict labels for the held-out test set with the retrained XGBoost model.
y_pred = xgboost_f1.predict(X_test)

In [None]:
# Weighted F1: per-class F1 averaged by class support.
F1_score = f1_score(y_true=y_test, y_pred=y_pred, average='weighted')

# Shown as a percentage with two decimal places.
print("F1 Score: ", '%.2f' % (100 * F1_score), "%")

F1 Score:  96.67 %




*   The randomized search and the grid search explore exactly the same space of parameters. The result in parameter settings is quite similar, while the run time for randomized search is drastically lower.
*   The performance may be slightly worse for the randomized search; this is likely due to a noise effect and would not be expected to carry over to a held-out test set.





*   Grid search is exhaustive and will find the best-scoring combination within the specified grid, based on the training data — however, it does have some flaws: (1) it is time-consuming, depending on the size of your dataset and the number of hyperparameters; (2) it could lead to overfitting of the training set, leading to a less viable model in the long run.
*   Randomized search evaluates a random sample of hyperparameter combinations, which reduces the danger of overfitting and is likely to provide more accurate long-term results — especially when only a small number of the hyperparameters are significant.

