In [1]:
from azureml.core import Workspace

# Load the Azure ML workspace via Workspace.from_config(); the handle is reused
# below to obtain the MLflow tracking URI.
ws = Workspace.from_config()
# Bare expression so the notebook shows the workspace as the cell output.
ws

Workspace.create(name='AzureML', subscription_id='6560575d-fa06-4e7d-95fb-f962e74efd7a', resource_group='cody-eastus-rg')

In [2]:
import mlflow

# Send MLflow tracking data to the URI reported by the Azure ML workspace.
mlflow.set_tracking_uri(ws.get_mlflow_tracking_uri())
# Select the experiment that later runs are recorded under; the recorded cell
# output shows it being created because it did not exist yet.
mlflow.set_experiment("lightgbm-mlflow-optuna-aml-coiled")

INFO: 'lightgbm-mlflow-optuna-aml-coiled' does not exist. Creating a new experiment


In [3]:
# imports
import os
import time
import mlflow
import argparse

import pandas as pd
import lightgbm as lgb
import matplotlib.pyplot as plt

from sklearn.metrics import log_loss, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

  and should_run_async(code)


In [4]:
# define functions
def preprocess_data(df, target="species", test_size=0.2, random_state=42):
    """Split a labelled frame into train/test features and encoded labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature columns plus the ``target`` column.
    target : str, default "species"
        Name of the label column. It is removed from the features and
        encoded with a ``LabelEncoder``.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int or None, default 42
        Forwarded to ``train_test_split`` so the split is repeatable.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, enc)`` where ``enc`` is the
        fitted ``LabelEncoder`` (needed to map predictions back to the
        original label values).
    """
    X = df.drop(columns=[target])
    y = df[target]

    # Encode the labels once, before splitting, so train and test share the
    # same label -> integer mapping.
    enc = LabelEncoder()
    y = enc.fit_transform(y)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    return X_train, X_test, y_train, y_test, enc


def train_model(params, num_boost_round, X_train, X_test, y_train, y_test):
    """Train a LightGBM model and report how long training took.

    Parameters
    ----------
    params : dict
        LightGBM training parameters, passed to ``lgb.train`` unchanged.
    num_boost_round : int
        Passed to ``lgb.train`` as ``num_boost_round``.
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Held-out features and labels, registered as the validation set
        named ``"test"``.

    Returns
    -------
    tuple
        ``(model, elapsed_seconds)``. The elapsed time covers dataset
        construction as well as the ``lgb.train`` call.
    """
    # perf_counter() is monotonic and high-resolution, so the measured
    # interval cannot be skewed by system clock adjustments (time.time() can).
    started = time.perf_counter()
    train_data = lgb.Dataset(X_train, label=y_train)
    test_data = lgb.Dataset(X_test, label=y_test)
    model = lgb.train(
        params,
        train_data,
        num_boost_round=num_boost_round,
        valid_sets=[test_data],
        valid_names=["test"],
    )
    elapsed = time.perf_counter() - started

    return model, elapsed


def evaluate_model(model, X_test, y_test):
    """Score a multiclass model on held-out data.

    Parameters
    ----------
    model
        Object whose ``predict(X_test)`` returns a 2-D array of per-class
        probabilities (one column per class).
    X_test
        Held-out features.
    y_test
        Held-out labels. They are assumed to be integer class indices
        ``0..n_classes-1`` matching the probability columns, which is what
        ``preprocess_data`` produces with its ``LabelEncoder``.

    Returns
    -------
    tuple
        ``(loss, acc)``: the log loss of the predicted probabilities and the
        accuracy of the arg-max predictions.
    """
    y_proba = model.predict(X_test)
    y_pred = y_proba.argmax(axis=1)

    # Pass the full label set explicitly. Otherwise log_loss infers it from
    # y_test and raises when a test split is missing a class that still has
    # a probability column.
    class_labels = list(range(y_proba.shape[1]))
    loss = log_loss(y_test, y_proba, labels=class_labels)
    acc = accuracy_score(y_test, y_pred)

    return loss, acc

  and should_run_async(code)


In [5]:
# define an objective for optuna to optimize 
def objective(trial):
    """Optuna objective: train and evaluate one LightGBM model in an MLflow run.

    Samples hyperparameters from ``trial``, logs them to MLflow, trains on
    the iris CSV, and logs the training time, loss and accuracy.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float or None
        The held-out log loss (the value Optuna minimizes). ``None`` is
        returned when the trial raised an ordinary exception; Optuna then
        records the trial as failed and the study carries on.
    """
    # Local import so this cell does not depend on a change to the import cell.
    import logging

    try:
        # start mlflow run
        with mlflow.start_run():
            # enable autologging
            mlflow.lightgbm.autolog()

            # generate parameters
            # NOTE(review): LightGBM treats `num_iterations` as an alias of
            # `num_boost_round`, and a value given in `params` is used instead
            # of the keyword argument. The sampled `num_boost_round` therefore
            # likely has no effect on training -- confirm and drop one.
            num_boost_round = trial.suggest_int("num_boost_round", 1, 100)
            params = {
                "objective": "multiclass",
                "num_class": 3,
                "boosting":  trial.suggest_categorical(
                    "boosting", ["gbdt", "dart", "goss"]
                ),
                "num_iterations": trial.suggest_int("num_iterations", 10, 100),
                "num_leaves": trial.suggest_int("num_leaves", 15, 63),
                #"num_threads": trial.suggest_categorical("num_threads", [1, 2, 4]),
                # note: 10e-5 is 1e-4, not 1e-5
                "learning_rate": trial.suggest_loguniform("learning_rate", 10e-5, .1),
                "metric": "multi_logloss",
                #"seed": trial.suggest_categorical("seed", [1, 3, 5, 7, 11, 13, 42]),
                "verbose": 0,
            }
            mlflow.log_param("num_boost_round", num_boost_round)
            mlflow.log_params(params)

            # read in dataset
            df = pd.read_csv("../data/iris/iris.csv")

            # preprocess data
            X_train, X_test, y_train, y_test, enc = preprocess_data(df)

            # train model
            model, train_time = train_model(
                params, num_boost_round, X_train, X_test, y_train, y_test
            )
            mlflow.log_metric("training_time", train_time)

            # evaluate model
            loss, acc = evaluate_model(model, X_test, y_test)
            mlflow.log_metrics({"loss": loss, "accuracy": acc})

            return loss
    except Exception:
        # Still best-effort: a failing trial must not abort the whole study.
        # Unlike a bare `except:`, this lets KeyboardInterrupt/SystemExit
        # through and records the traceback so failures can be diagnosed.
        logging.getLogger(__name__).exception(
            "Trial %s failed; returning None so Optuna marks it as failed",
            getattr(trial, "number", "?"),
        )
        return None

In [6]:
%%time 

import optuna

# Create a study named "test" that minimizes the value returned by `objective`
# (the recorded output shows it is kept in memory).
study = optuna.create_study(direction="minimize", study_name="test")
# Run 16 trials in parallel within this process.
# NOTE(review): the recorded output shows several trials failing because
# `objective` returned None -- the underlying exception is hidden inside it.
study.optimize(objective, n_trials=16, n_jobs=-1)

[I 2020-10-05 16:22:11,036] A new study created in memory with name: test
[W 2020-10-05 16:22:13,091] Trial 5 failed, because the returned value from the objective function cannot be cast to float. Returned value is: None
[W 2020-10-05 16:22:13,096] Trial 8 failed, because the returned value from the objective function cannot be cast to float. Returned value is: None
[W 2020-10-05 16:22:13,101] Trial 9 failed, because the returned value from the objective function cannot be cast to float. Returned value is: None
[W 2020-10-05 16:22:13,105] Trial 10 failed, because the returned value from the objective function cannot be cast to float. Returned value is: None
[W 2020-10-05 16:22:13,110] Trial 11 failed, because the returned value from the objective function cannot be cast to float. Returned value is: None
[W 2020-10-05 16:22:13,112] Trial 2 failed, because the returned value from the objective function cannot be cast to float. Returned value is: None
[W 2020-10-05 16:22:13,115] Trial 12

In [7]:
%%time 

import joblib
import optuna
import dask_optuna

from dask.distributed import Client

# Start a Dask client; the recorded output shows it attached to a local
# cluster on 127.0.0.1 with 4 worker processes.
# NOTE(review): the client is never closed in the cells visible here.
c = Client()
print(c)
print(c.dashboard_link)

# TPE sampler plus a Dask-backed storage for the study.
# presumably DaskStorage lets trials running on the workers share study
# state -- confirm against the dask-optuna docs.
sampler = optuna.samplers.TPESampler()
storage = dask_optuna.DaskStorage()
study = optuna.create_study(
    direction="minimize", study_name="test-dask", sampler=sampler, storage=storage
)
# Route Optuna's joblib-based parallelism through the Dask backend so the
# 2000 trials are dispatched via the client created above.
with joblib.parallel_backend("dask"):
    study.optimize(objective, n_trials=2000, n_jobs=-1)

  and should_run_async(code)
&lt;Client: &#39;tcp://127.0.0.1:53262&#39; processes=4 threads=8, memory=34.36 GB&gt;
http://127.0.0.1:8787/status
[I 2020-10-05 16:23:14,467] A new study created in memory with name: test-dask
CPU times: user 3min 13s, sys: 4.16 s, total: 3min 17s
Wall time: 3min 30s


In [8]:
# Hyperparameters sampled by the best (lowest objective value) trial.
study.best_params

  and should_run_async(code)


{'num_boost_round': 98,
 'boosting': 'gbdt',
 'num_iterations': 100,
 'num_leaves': 61,
 'learning_rate': 0.09962117068042924}

In [9]:
# Objective value of the best trial, i.e. the lowest loss returned by `objective`.
study.best_value

0.01880866644078725

In [10]:
# Plot how the objective value evolved over the trials of the study.
optuna.visualization.plot_optimization_history(study)

In [11]:
# Plot Optuna's estimate of how much each hyperparameter influenced the objective.
optuna.visualization.plot_param_importances(study)


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [12]:
# Parallel-coordinate view of the sampled hyperparameters against the objective value.
optuna.visualization.plot_parallel_coordinate(study)


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [13]:
# Export every trial (params, timings, state, failure reason) as a DataFrame.
# The recorded output shows FAIL rows with an empty `value` for trials where
# `objective` returned None.
df = study.trials_dataframe()
# Preview only the first rows rather than the full trial table.
df.head()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_boosting,params_learning_rate,params_num_boost_round,params_num_iterations,params_num_leaves,system_attrs_fail_reason,state
0,0,,2020-10-05 16:23:17.249697,2020-10-05 16:23:17.964861,0 days 00:00:00.715164,dart,0.000807,8.0,64.0,36.0,"Trial 0 failed, because the returned value fro...",FAIL
1,1,0.992057,2020-10-05 16:23:17.253145,2020-10-05 16:23:21.767566,0 days 00:00:04.514421,gbdt,0.000974,95.0,67.0,36.0,,COMPLETE
2,2,,2020-10-05 16:23:17.281393,2020-10-05 16:23:18.014320,0 days 00:00:00.732927,gbdt,0.038392,71.0,78.0,52.0,"Trial 2 failed, because the returned value fro...",FAIL
3,3,0.644618,2020-10-05 16:23:17.281561,2020-10-05 16:23:21.374396,0 days 00:00:04.092835,gbdt,0.005933,99.0,61.0,31.0,,COMPLETE
4,4,0.589192,2020-10-05 16:23:17.290011,2020-10-05 16:23:20.724239,0 days 00:00:03.434228,dart,0.014556,83.0,39.0,41.0,,COMPLETE
