In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default theme to all subsequent matplotlib plots
sns.set()

In [60]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from itertools import product
import random
from hyperopt import hp, fmin, tpe
from tpot import TPOTClassifier

In [3]:
# Remove pandas' display limits so frames are rendered without row or column truncation
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [4]:
# Load the credit card CSV that sits next to the notebook
credit_csv_path = './uci_credit_card.csv'
credit = pd.read_csv(credit_csv_path)

In [5]:
# Features: every column from LIMIT_BAL through PAY_AMT6 (label slice, both ends
# inclusive), with the listed categorical columns one-hot encoded and the first
# level of each dropped
categorical_cols = ['SEX', 'EDUCATION', 'MARRIAGE']
X = pd.get_dummies(credit.loc[:, 'LIMIT_BAL':'PAY_AMT6'],
                   columns=categorical_cols,
                   drop_first=True)

# Target: the default flag for the following month
y = credit['default.payment.next.month']

# Hold out 30% for testing, stratified on the target, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    stratify=y,
    random_state=1111,
)

# Informed Search: Coarse to Fine

## Visualizing Coarse to Fine

## Coarse to Fine iterations

Narrow down the initial parameter grid by running a random search, then adjust the ranges by inspecting the accuracies plotted against each individual hyperparameter.

# Informed Search: Bayesian Statistics

## Bayes Rule in Python

In [46]:
# Known probabilities (p_unhappy_close is used below as P(unhappy | close))
p_unhappy = 0.15
p_unhappy_close = 0.35

# Probability someone will close
p_close = 0.07

# Bayes' rule: P(close | unhappy) = P(unhappy | close) * P(close) / P(unhappy)
p_unhappy_and_close = p_unhappy_close * p_close
p_close_unhappy = p_unhappy_and_close / p_unhappy
print(p_close_unhappy)

0.16333333333333336


## Bayesian hyperparameter tuning with `hyperopt`

In [58]:
# Search space handed to hyperopt: one sampling distribution per tuned hyperparameter.
# max_depth is drawn with quniform(2, 10, 2) and arrives as a float (e.g. 6.0),
# so the objective has to cast it to int.
space = dict(
    max_depth=hp.quniform('max_depth', 2, 10, 2),
    learning_rate=hp.uniform('learning_rate', 0.001, 0.9),
)

# set up objective function
def objective(params):
    """Return the loss to minimise for one sampled hyperparameter configuration.

    Parameters
    ----------
    params : dict
        Sampled point with the keys ``'max_depth'`` and ``'learning_rate'``.
        ``max_depth`` may be a float and is cast to ``int`` here.

    Returns
    -------
    float
        ``1 - mean accuracy`` of a 2-fold cross-validation of a
        ``GradientBoostingClassifier`` on ``X_train`` / ``y_train``.
    """
    # Keep only the two tuned hyperparameters; the estimator needs an integer depth
    params = {'max_depth': int(params['max_depth']),
              'learning_rate': params['learning_rate']}
    # Fixed seed so the same params always give the same loss; without it the
    # search result is not repeatable even though fmin itself is seeded.
    gbm_clf = GradientBoostingClassifier(n_estimators=100,
                                         random_state=42,
                                         **params)
    mean_cv_accuracy = cross_val_score(gbm_clf, X_train, y_train,
                                       scoring='accuracy',
                                       cv=2,
                                       n_jobs=-1).mean()
    # The optimiser minimises, so turn accuracy into a loss
    loss = 1 - mean_cv_accuracy

    return loss

In [59]:
# Run the TPE search: minimise `objective` over `space` for 20 evaluations.
# NOTE(review): rstate is a legacy numpy RandomState; confirm the installed
# hyperopt release still accepts it.
search_rng = np.random.RandomState(42)
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=20,
            rstate=search_rng)
print(best)

100%|███████████████████████████████████████████████| 20/20 [04:11<00:00, 12.55s/trial, best loss: 0.18399999999999994]
{'learning_rate': 0.0128515490384306, 'max_depth': 6.0}


# Informed Search: Genetic Algorithms

## Genetic hyperparameter tuning with `tpot`

In [61]:
# Configure a small genetic search: 3 generations, a population of 4 and
# 3 offspring per generation, scored by 2-fold CV accuracy
tpot_settings = dict(generations=3,
                     population_size=4,
                     offspring_size=3,
                     scoring='accuracy',
                     cv=2,
                     verbosity=2,
                     random_state=2)
tpot_clf = TPOTClassifier(**tpot_settings)

# fit (kept as the final expression so the fitted estimator is displayed)
tpot_clf.fit(X_train, y_train)

HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=13.0, style=ProgressStyle(des…


Generation 1 - Current best internal CV score: 0.8190952380952381
Generation 2 - Current best internal CV score: 0.8190952380952381
Generation 3 - Current best internal CV score: 0.8190952380952381
Best pipeline: DecisionTreeClassifier(input_matrix, criterion=gini, max_depth=3, min_samples_leaf=10, min_samples_split=9)


TPOTClassifier(cv=2, generations=3,
               log_file=<ipykernel.iostream.OutStream object at 0x000001EB2E6CD4C8>,
               offspring_size=3, population_size=4, random_state=2,
               scoring='accuracy', verbosity=2)

In [64]:
# Score the fitted TPOT pipeline on the held-out test split
test_acc = tpot_clf.score(X_test, y_test)
print(f'test acc: {test_acc: .1%}')

test acc:  82.2%


# Analyzing `tpot`'s stability

In [65]:
# TPOT classifier with a small search budget; the seed is the value being varied
tpot_clf = TPOTClassifier(random_state=42,  # change random state
                          generations=3,
                          population_size=4,
                          offspring_size=3,
                          scoring='accuracy',
                          cv=2,
                          verbosity=2)

# Run the genetic search on the training split
tpot_clf.fit(X_train, y_train)

# Score the selected pipeline on the held-out test split
test_score = tpot_clf.score(X_test, y_test)
print(test_score)

HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=13.0, style=ProgressStyle(des…


Generation 1 - Current best internal CV score: 0.8193809523809523
Generation 2 - Current best internal CV score: 0.8193809523809523
Generation 3 - Current best internal CV score: 0.8197142857142857
Best pipeline: XGBClassifier(PolynomialFeatures(input_matrix, degree=2, include_bias=False, interaction_only=False), learning_rate=0.001, max_depth=9, min_child_weight=7, n_estimators=100, nthread=1, subsample=0.45)
0.824


In [66]:
# TPOT classifier with a small search budget; the seed is the value being varied
seed = 122  # change random state
tpot_clf = TPOTClassifier(generations=3,
                          population_size=4,
                          offspring_size=3,
                          scoring='accuracy',
                          cv=2,
                          verbosity=2,
                          random_state=seed)

# Run the genetic search on the training split
tpot_clf.fit(X_train, y_train)

# Score the selected pipeline on the held-out test split
test_acc = tpot_clf.score(X_test, y_test)
print(f'test acc: {test_acc: .1%}')

HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=13.0, style=ProgressStyle(des…


Generation 1 - Current best internal CV score: 0.7785238095238095
Generation 2 - Current best internal CV score: 0.7787619047619048
Generation 3 - Current best internal CV score: 0.815
Best pipeline: ExtraTreesClassifier(input_matrix, bootstrap=False, criterion=entropy, max_features=0.2, min_samples_leaf=4, min_samples_split=4, n_estimators=100)
test acc:  81.7%
