### MLFlow and Optuna
In this Notebook we show how to use **Optuna** for hyper-parameters optimization and how to track the results of experiments inside **MLFlow**

* env used: generalml_p37_gpu_v1
* works also with CPU
* requires: **pip install mlflow**

In [1]:
import os

import pandas as pd
import numpy as np

import optuna
import mlflow

import xgboost as xgb 

# using a myconfig.py file I avoid to show passwords in the NB
from myconfig import config

In [2]:
# used ony as an example
from sklearn.datasets import load_breast_cancer

In [3]:
# prepare the dataset

# as dataset for the example of ML training we're suing Wisconsin Breast Cancer dataset
# data can be loaded from sklearn
cancer = load_breast_cancer()

# load in Pandas DataFrame
df = pd.DataFrame(
    np.c_[cancer["data"], cancer["target"]],
    columns=np.append(cancer["feature_names"], ["target"]),
)

TARGET = "target"

# let's choose only some of the column
FEATURES = ["mean radius", "mean concavity", "mean symmetry", "mean fractal dimension"]

X = df[FEATURES].values
y = df[TARGET].values

dtrain = xgb.DMatrix(X, label=y)

In [4]:
#
# Here we load the configuration to connect to MLFlow Tracking Server
#

# also the tracking server uri is in the myconfig.py file
TRACK_SERVER_URI = config["TRACK_SERVER_URI"]

# the key for succesfull auth is to set these two variables
# see documentation in https://www.mlflow.org/docs/latest/tracking.html
os.environ['MLFLOW_TRACKING_USERNAME'] = config["MLFLOW_TRACKING_USERNAME"]
os.environ['MLFLOW_TRACKING_PASSWORD'] = config['MLFLOW_TRACKING_PASSWORD']

In [12]:
# initialize and set experiment ID
mlflow.set_tracking_uri(TRACK_SERVER_URI)

EXP_NAME = "exp-xgb-9"

exp_id = mlflow.set_experiment(EXP_NAME)

2022/02/17 11:05:17 INFO mlflow.tracking.fluent: Experiment with name 'exp-xgb-9' does not exist. Creating a new experiment.


In [13]:
#
# Here we define what we do using Optuna
#
def objective(trial):
    with mlflow.start_run():
        
        # tuning on max_depth, n_estimators for the example
        param = {
            "verbosity": 0,
            "objective": "binary:logistic",
            "eval_metric": "error",
            "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear"]),
            "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
            "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        }
        
        if param["booster"] == "gbtree" or param["booster"] == "dart":
            param["max_depth"] = trial.suggest_int("max_depth", 1, 9)
            param["eta"] = trial.suggest_float("eta", 1e-8, 1.0, log=True)
            param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
            param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])
        if param["booster"] == "dart":
            param["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
            param["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
            param["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
            param["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)
        
        # logging to MLFlow
        mlflow.log_params(param)
        
        pruning_callback = optuna.integration.XGBoostPruningCallback(trial, "test-error")
        
        history = xgb.cv(param, dtrain, nfold=5, num_boost_round=80, callbacks=[pruning_callback])
        
        mean_error = history["test-error-mean"].values[-1]
        
        # compute accuracy
        acc = round(1. - mean_error, 4)
        
         # logging to MLFlow
        mlflow.log_metric("acc", acc)
        
        mlflow.end_run()
    
    return acc

In [14]:
#
# execute the study
#
pruner = optuna.pruners.MedianPruner(n_warmup_steps=5)

study = optuna.create_study(study_name=EXP_NAME, pruner=pruner, direction="maximize")

study.optimize(objective, n_trials=40)

[32m[I 2022-02-17 11:05:19,260][0m A new study created in memory with name: exp-xgb-9[0m
[32m[I 2022-02-17 11:05:44,681][0m Trial 0 finished with value: 0.8542 and parameters: {'booster': 'gblinear', 'lambda': 0.5249055144877295, 'alpha': 0.0016882829515764979}. Best is trial 0 with value: 0.8542.[0m
[32m[I 2022-02-17 11:06:10,007][0m Trial 1 finished with value: 0.9122 and parameters: {'booster': 'gblinear', 'lambda': 3.521681514362404e-08, 'alpha': 0.00010842315928163968}. Best is trial 1 with value: 0.9122.[0m
[32m[I 2022-02-17 11:06:35,380][0m Trial 2 finished with value: 0.9139 and parameters: {'booster': 'gblinear', 'lambda': 8.773774032777868e-07, 'alpha': 0.0001450515050880122}. Best is trial 2 with value: 0.9139.[0m
[32m[I 2022-02-17 11:07:22,715][0m Trial 3 finished with value: 0.9034 and parameters: {'booster': 'gbtree', 'lambda': 0.02479229370671492, 'alpha': 0.7966635544456746, 'max_depth': 3, 'eta': 9.227946692432297e-06, 'gamma': 0.00014528182275889048, 'gr

In [15]:
# analyze result
trial = study.best_trial

print("Best trial:", trial)

Best trial: FrozenTrial(number=21, values=[0.9175], datetime_start=datetime.datetime(2022, 2, 17, 11, 14, 30, 405028), datetime_complete=datetime.datetime(2022, 2, 17, 11, 14, 55, 765333), params={'booster': 'gblinear', 'lambda': 0.00016788146789630906, 'alpha': 0.00034545144031176235}, distributions={'booster': CategoricalDistribution(choices=('gbtree', 'gblinear')), 'lambda': LogUniformDistribution(high=1.0, low=1e-08), 'alpha': LogUniformDistribution(high=1.0, low=1e-08)}, user_attrs={}, system_attrs={}, intermediate_values={0: 0.1880608, 1: 0.15107900000000002, 2: 0.1264246, 3: 0.1405374, 4: 0.12291579999999999, 5: 0.1193914, 6: 0.11763699999999999, 7: 0.1123584, 8: 0.11589819999999999, 9: 0.11588259999999999, 10: 0.1194224, 11: 0.11412819999999999, 12: 0.117668, 13: 0.11412819999999999, 14: 0.11415919999999999, 15: 0.1123894, 16: 0.1123894, 17: 0.10535619999999998, 18: 0.10711059999999999, 19: 0.10535619999999998, 20: 0.10535619999999998, 21: 0.10712619999999999, 22: 0.10712619999