In [1]:
from azureml.core import Workspace

# Obtain the Azure ML workspace handle via Workspace.from_config(); the bare
# `ws` on the last line makes the notebook display it as the cell output.
ws = Workspace.from_config()
ws

Workspace.create(name='AzureML', subscription_id='6560575d-fa06-4e7d-95fb-f962e74efd7a', resource_group='cody-eastus-rg')

In [2]:
import mlflow

# Point MLflow at the tracking URI reported by the workspace, then select the
# experiment name used for the runs started later in this notebook.
mlflow.set_tracking_uri(ws.get_mlflow_tracking_uri())
mlflow.set_experiment("lightgbm-mlflow-optuna-dask")

In [3]:
# imports
import os
import time
import mlflow
import argparse

import pandas as pd
import lightgbm as lgb
import matplotlib.pyplot as plt

from sklearn.metrics import log_loss, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

  and should_run_async(code)


In [4]:
# define functions
def preprocess_data(df):
    """Separate features from the ``species`` target and split train/test.

    Args:
        df: DataFrame containing a ``species`` column; every other column is
            treated as a feature.

    Returns:
        A 5-tuple ``(X_train, X_test, y_train, y_test, enc)`` where the ``y``
        values are the label-encoded species and ``enc`` is the fitted
        ``LabelEncoder``.
    """
    # Encode the target labels as integers.
    enc = LabelEncoder()
    targets = enc.fit_transform(df["species"])

    # Everything except the target column is a feature.
    features = df.drop(columns=["species"])

    # Fixed 80/20 split with a fixed seed.
    split = train_test_split(features, targets, test_size=0.2, random_state=42)

    return (*split, enc)


def train_model(params, num_boost_round, X_train, X_test, y_train, y_test):
    """Train a LightGBM model and measure how long the training took.

    Args:
        params: LightGBM parameter dict passed to ``lgb.train``.
        num_boost_round: Number of boosting rounds passed to ``lgb.train``.
        X_train, y_train: Training features and labels.
        X_test, y_test: Held-out features and labels, registered as the
            validation set named ``"test"``.

    Returns:
        ``(model, seconds)``: the object returned by ``lgb.train`` and the
        elapsed time in seconds, covering dataset construction and training.
    """
    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # (or go negative) if the system wall clock is adjusted mid-training.
    started = time.perf_counter()
    train_data = lgb.Dataset(X_train, label=y_train)
    test_data = lgb.Dataset(X_test, label=y_test)
    model = lgb.train(
        params,
        train_data,
        num_boost_round=num_boost_round,
        valid_sets=[test_data],
        valid_names=["test"],
    )
    elapsed = time.perf_counter() - started

    return model, elapsed


def evaluate_model(model, X_test, y_test):
    """Score a multi-class model on held-out data.

    Args:
        model: Trained model whose ``predict`` returns a 2-D array of class
            probabilities (one column per class).
        X_test: Held-out features.
        y_test: Held-out integer class labels; assumed to be encoded as
            ``0 .. n_classes - 1`` so they line up with the probability
            columns (this is what ``preprocess_data`` produces).

    Returns:
        ``(loss, acc)``: the log loss and the accuracy on the held-out data.
    """
    y_proba = model.predict(X_test)
    y_pred = y_proba.argmax(axis=1)

    # Give log_loss the full label set explicitly. Without it the labels are
    # inferred from y_test, which raises when the test split happens to miss
    # a class that the probability matrix still has a column for.
    class_labels = list(range(y_proba.shape[1]))
    loss = log_loss(y_test, y_proba, labels=class_labels)
    acc = accuracy_score(y_test, y_pred)

    return loss, acc

  and should_run_async(code)


In [5]:
# define an objective for optuna to optimize 
def objective(trial):
    """Optuna objective: train and evaluate one LightGBM configuration.

    Each call opens an MLflow run, samples hyperparameters from ``trial``,
    trains on the iris CSV and logs parameters, training time, loss and
    accuracy to MLflow.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        The held-out log loss (the value the study minimizes), or ``None`` if
        anything inside the run raised. Optuna records a ``None`` result as a
        failed trial and carries on with the remaining trials.
    """
    import logging  # local import: keeps this cell self-contained

    try:
        # NOTE(review): the tracking URI / experiment are configured once at
        # notebook level. When trials run in separate worker processes that
        # configuration may not be visible there -- confirm, and configure
        # MLflow here if runs end up in the wrong place.
        with mlflow.start_run():
            # enable autologging
            mlflow.lightgbm.autolog()

            # generate parameters
            # NOTE(review): both `num_boost_round` (argument to lgb.train) and
            # `num_iterations` (params key) are sampled. LightGBM documents
            # them as aliases, so presumably only one takes effect and the
            # other is a dead search dimension -- confirm and drop one.
            num_boost_round = trial.suggest_int("num_boost_round", 1, 100)
            params = {
                "objective": "multiclass",
                "num_class": 3,
                "boosting": trial.suggest_categorical(
                    "boosting", ["gbdt", "dart", "goss"]
                ),
                "num_iterations": trial.suggest_int("num_iterations", 10, 100),
                "num_leaves": trial.suggest_int("num_leaves", 15, 63),
                # Lower bound is 1e-4 (previously spelled 10e-5, same value).
                "learning_rate": trial.suggest_loguniform("learning_rate", 1e-4, 0.1),
                "metric": "multi_logloss",
                "verbose": 0,
            }
            mlflow.log_param("num_boost_round", num_boost_round)
            mlflow.log_params(params)

            # read in dataset
            df = pd.read_csv("../data/iris/iris.csv")

            # preprocess data
            X_train, X_test, y_train, y_test, enc = preprocess_data(df)

            # train model
            model, train_time = train_model(
                params, num_boost_round, X_train, X_test, y_train, y_test
            )
            mlflow.log_metric("training_time", train_time)

            # evaluate model
            loss, acc = evaluate_model(model, X_test, y_test)
            mlflow.log_metrics({"loss": loss, "accuracy": acc})

            return loss
    except Exception:
        # Best-effort: a failing trial must not abort the whole study, so the
        # trial is still reported as failed via None. Unlike a bare `except:`
        # this lets KeyboardInterrupt/SystemExit propagate, and the traceback
        # is logged so the cause of the failure is not lost.
        logging.getLogger(__name__).exception(
            "Trial %s failed", getattr(trial, "number", "?")
        )
        return None

In [6]:
%%time 

import optuna

# Baseline run: a study named "test" with no storage argument, minimizing the
# loss returned by `objective` over 8 trials.
# n_jobs=-1 requests parallel trial execution (per Optuna, one job per CPU).
study = optuna.create_study(direction="minimize", study_name="test")
study.optimize(objective, n_trials=8, n_jobs=-1)

[I 2020-10-08 10:13:14,511] A new study created in memory with name: test
[W 2020-10-08 10:13:16,585] Trial 5 failed, because the returned value from the objective function cannot be cast to float. Returned value is: None
[W 2020-10-08 10:13:16,704] Trial 6 failed, because the returned value from the objective function cannot be cast to float. Returned value is: None
[W 2020-10-08 10:13:16,748] Trial 4 failed, because the returned value from the objective function cannot be cast to float. Returned value is: None
[W 2020-10-08 10:13:16,900] Trial 7 failed, because the returned value from the objective function cannot be cast to float. Returned value is: None
[W 2020-10-08 10:13:16,918] Trial 0 failed, because the returned value from the objective function cannot be cast to float. Returned value is: None
[W 2020-10-08 10:13:16,933] Trial 1 failed, because the returned value from the objective function cannot be cast to float. Returned value is: None
[W 2020-10-08 10:13:16,937] Trial 3 fa

In [7]:
%%time 

import joblib
import optuna
import dask_optuna
from dask.distributed import Client

# Create a Dask client with default arguments and print its summary and the
# URL of its dashboard.
c = Client()
print(c)
print(c.dashboard_link)

# Build a second study ("test-dask") that uses a TPE sampler and Dask-backed
# storage. This rebinds `study`, replacing the study from the previous cell.
sampler = optuna.samplers.TPESampler()
storage = dask_optuna.DaskStorage()
study = optuna.create_study(
    direction="minimize", study_name="test-dask", sampler=sampler, storage=storage
)
# Run 2000 trials with joblib's "dask" backend active for the duration of the
# optimize call, so the parallel jobs requested by n_jobs=-1 go through it.
with joblib.parallel_backend("dask"):
    study.optimize(objective, n_trials=2000, n_jobs=-1)

  and should_run_async(code)
<Client: 'tcp://127.0.0.1:54821' processes=4 threads=8, memory=34.36 GB>
http://127.0.0.1:8787/status
[I 2020-10-08 10:14:27,685] A new study created in memory with name: test-dask
CPU times: user 4min 7s, sys: 6.99 s, total: 4min 14s
Wall time: 5min 2s


In [8]:
# Display the sampled parameters of the study's best trial.
study.best_params

  and should_run_async(code)


{'num_boost_round': 85,
 'boosting': 'gbdt',
 'num_iterations': 100,
 'num_leaves': 38,
 'learning_rate': 0.09998570960648047}

In [9]:
# Display the best objective value; the objective returns the loss and the
# study direction is "minimize", so this is the lowest loss found.
study.best_value

0.017809971521718523

In [10]:
# Plot the optimization history of the study.
optuna.visualization.plot_optimization_history(study)

In [11]:
# Plot the hyperparameter importances for the study.
optuna.visualization.plot_param_importances(study)


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [12]:
# Plot the study's trials as a parallel-coordinate chart.
optuna.visualization.plot_parallel_coordinate(study)


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [13]:
# Export the study's trials to a DataFrame and preview the first rows; failed
# trials appear with an empty `value` and state FAIL.
df = study.trials_dataframe()
df.head()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_boosting,params_learning_rate,params_num_boost_round,params_num_iterations,params_num_leaves,system_attrs_fail_reason,state
0,0,,2020-10-08 10:14:31.458988,2020-10-08 10:14:32.707171,0 days 00:00:01.248183,goss,0.052488,18.0,10.0,16.0,"Trial 0 failed, because the returned value fro...",FAIL
1,1,0.031207,2020-10-08 10:14:31.464492,2020-10-08 10:14:38.519388,0 days 00:00:07.054896,gbdt,0.081303,15.0,97.0,29.0,,COMPLETE
2,2,,2020-10-08 10:14:31.486953,2020-10-08 10:14:32.732484,0 days 00:00:01.245531,dart,0.012765,81.0,29.0,34.0,"Trial 2 failed, because the returned value fro...",FAIL
3,3,0.12013,2020-10-08 10:14:31.487577,2020-10-08 10:14:36.504030,0 days 00:00:05.016453,dart,0.06977,51.0,36.0,44.0,,COMPLETE
4,4,,2020-10-08 10:14:31.498119,2020-10-08 10:14:32.684220,0 days 00:00:01.186101,goss,0.001022,36.0,92.0,58.0,"Trial 4 failed, because the returned value fro...",FAIL
