In [14]:
import numpy as np
import pandas as pd
import pyspark

In [2]:
from sklearn.datasets import fetch_openml
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import check_random_state

In [3]:
from hyperopt import fmin, hp, tpe
from hyperopt import SparkTrials, STATUS_OK

In [4]:
import pickle
import time
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

def objective(x):
    """Toy Hyperopt objective: the loss is ``x ** 2``.

    A result dictionary is returned instead of a bare number so that
    additional information can be reported next to the loss.

    :param x: the sampled value to evaluate
    :return: dict with the mandatory ``'loss'`` and ``'status'`` entries
        plus the extra ``'eval_time'``, ``'other_stuff'`` and
        ``'attachments'`` entries
    """
    result = {'loss': x ** 2, 'status': STATUS_OK}
    # -- other results can be stored as additional keys
    result['eval_time'] = time.time()
    result['other_stuff'] = {'type': None, 'value': [0, 1, 2]}
    # -- attachments are handled differently
    result['attachments'] = {'time_module': pickle.dumps(time.time)}
    return result
# Collect the result of every evaluation in a Trials object.
trials = Trials()

# Minimize `objective` over x sampled uniformly from [-10, 10] with TPE.
best = fmin(
    fn=objective,
    space=hp.uniform('x', -10, 10),
    algo=tpe.suggest,
    max_evals=100,
    trials=trials,
    verbose=1,
)

print(best)

100%|██████████| 100/100 [00:00<00:00, 314.74trial/s, best loss: 2.263767105238341e-05]
{'x': -0.004757906162629041}


In [5]:
# Load MNIST data; the following cells shuffle, split, and standardize it.
X, y = fetch_openml('mnist_784', version=1, return_X_y=True)
# Depending on the scikit-learn version, `fetch_openml` may hand back a pandas
# DataFrame/Series instead of NumPy arrays. Convert explicitly so that the
# positional indexing (`X[permutation]`) and `X.reshape(...)` used in the
# following cells work either way.
X, y = np.asarray(X), np.asarray(y)

In [6]:
# Seeded random state (seed 0) so the shuffle order is repeatable.
random_state = check_random_state(0)
# Random ordering of the sample indices, one index per row of X.
permutation = random_state.permutation(len(X))

In [7]:
# Shuffle samples and labels with the same permutation so they stay aligned.
X = X[permutation]
y = y[permutation]
# Flatten every sample into a 1-D feature vector (no-op if X is already 2-D).
X = X.reshape((X.shape[0], -1))

# Draw train/test subsets. The fixed seed keeps the split -- and therefore
# every loss reported during tuning -- repeatable across runs; without it the
# split changed on each execution even though the shuffle above is seeded.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=5000, test_size=10000, random_state=0)

# Fit the scaler on the training data only, then apply the same
# transformation to the test data.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [8]:
# First, set up the scikit-learn workflow, wrapped within a function.

def train(params):
    """
    Objective function handed to Hyperopt for tuning a logistic regression.

    Fits a ``LogisticRegression`` on ``X_train``/``y_train`` using the given
    hyperparameter settings and scores it on ``X_test``/``y_test``.

    :param params: map specifying the hyperparameter settings to test; the
        keys read are ``'regParam'`` (regularization, whose inverse is passed
        as ``C``) and ``'penalty'`` (the penalty type, L1 vs L2)
    :return: dict with ``'loss'`` (the negated score, so that minimizing the
        loss maximizes the score) and ``'status'`` (``STATUS_OK``)
    """
    # The two tuned hyperparameters.
    reg_strength = float(params['regParam'])
    penalty_kind = params['penalty']

    # Turn up tolerance for faster convergence.
    model = LogisticRegression(
        C=1.0 / reg_strength,
        multi_class='multinomial',
        penalty=penalty_kind,
        solver='saga',
        tol=0.1,
    )
    model.fit(X_train, y_train)
    test_score = model.score(X_test, y_test)

    return {'loss': -test_score, 'status': STATUS_OK}

In [9]:
# Next, define the Hyperopt search space: one entry per hyperparameter
# that `train` reads from `params`.
search_space = dict(
    # Penalty type: a categorical choice between L1 and L2.
    penalty=hp.choice('penalty', ['l1', 'l2']),
    # Regularization: log-uniform, with bounds -10.0 and 0 given in log space.
    regParam=hp.loguniform('regParam', -10.0, 0),
)

# Search algorithm for Hyperopt to use:
# Tree of Parzen Estimators, a Bayesian method.
algo = tpe.suggest

In [None]:
# Calling `fmin` without an explicit `trials` argument runs Hyperopt
# locally, i.e. only on the driver machine.
best_hyperparameters = fmin(train, search_space, algo=algo, max_evals=32)
best_hyperparameters

In [10]:
# Passing a `SparkTrials` instance as `trials` distributes the tuning
# across our Spark cluster.
spark_trials = SparkTrials()
best_hyperparameters = fmin(
    train,
    search_space,
    algo=algo,
    max_evals=32,
    trials=spark_trials,
)
# Note: for `hp.choice` parameters the returned dict holds the index of the
# chosen option (for 'penalty', an index into ['l1', 'l2']), not the option.
best_hyperparameters

100%|██████████| 32/32 [02:19<00:00,  4.36s/trial, best loss: -0.8717]


Total Trials: 32: 32 succeeded, 0 failed, 0 cancelled.


{'penalty': 1, 'regParam': 0.17379454650982248}

In [None]:
# We can distribute tuning across our Spark cluster
# by calling `fmin` with a `SparkTrials` instance.

# Start from a fresh `SparkTrials`: `max_evals` counts the trials already
# stored in the trials object, so reusing the instance from the previous run
# (which already holds 32 finished trials) would leave `fmin` with no
# remaining budget and no new trials would be evaluated.
spark_trials = SparkTrials()
best_hyperparameters = fmin(
  fn=train,
  space=search_space,
  algo=algo,
  trials=spark_trials,
  max_evals=32)
best_hyperparameters