### MLFlow and Optuna
In this notebook we show how to use **Optuna** for hyperparameter optimization and how to track the results of the experiments in **MLflow**.

* env used: generalml_p37_gpu_v1
* works also with CPU
* requires: **pip install mlflow**

In [1]:
import os

import pandas as pd
import numpy as np

import optuna
import mlflow

# credentials are kept in a separate myconfig.py file so that passwords never appear in the notebook
from myconfig import config

In [3]:
# used only as an example
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import cross_validate

In [4]:
# The example trains on the Wisconsin Breast Cancer dataset, which ships with sklearn
cancer = load_breast_cancer()

# Build a single DataFrame: feature columns first, the label last in a "target" column
column_names = np.concatenate([cancer["feature_names"], ["target"]])
feature_and_target = np.column_stack([cancer["data"], cancer["target"]])

df = pd.DataFrame(feature_and_target, columns=column_names)

In [5]:
# name of the label column
TARGET = "target"

# only a handful of the available columns are used as model inputs
FEATURES = [
    "mean radius",
    "mean concavity",
    "mean symmetry",
    "mean fractal dimension",
]

# plain NumPy arrays for sklearn
X = df[FEATURES].to_numpy()
y = df[TARGET].to_numpy()

In [6]:
#
# Here we load the configuration to connect to MLFlow Tracking Server
#

# the tracking server URI is read from myconfig.py, like the credentials
TRACK_SERVER_URI = config["TRACK_SERVER_URI"]

# Authentication against the tracking server relies on these two environment
# variables being set; see https://www.mlflow.org/docs/latest/tracking.html
os.environ.update(
    (credential_key, config[credential_key])
    for credential_key in ("MLFLOW_TRACKING_USERNAME", "MLFLOW_TRACKING_PASSWORD")
)

In [7]:
# name of the experiment under which all the runs of this notebook are grouped
EXP_NAME = "exp31"

# point the MLflow client at the tracking server before selecting the experiment
mlflow.set_tracking_uri(TRACK_SERVER_URI)

# make EXP_NAME the active experiment
# NOTE(review): despite its name, exp_id holds whatever set_experiment returns, which
# in recent MLflow releases appears to be an Experiment object rather than a bare ID
# -- confirm against the installed version before using it as an ID
exp_id = mlflow.set_experiment(EXP_NAME)

2022/02/16 07:37:24 INFO mlflow.tracking.fluent: Experiment with name 'exp31' does not exist. Creating a new experiment.


In [8]:
#
# Here we define what we do using Optuna
#
def objective(trial):
    """Optuna objective: cross-validate one RandomForest configuration and log it to MLflow.

    Each call opens its own MLflow run, logs the sampled hyper-parameters and the
    resulting accuracy, and returns that accuracy to Optuna.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``max_depth``, ``n_estimators`` and ``max_leaf_nodes``.

    Returns
    -------
    float
        Mean 5-fold cross-validated accuracy on the notebook-level ``X`` / ``y``,
        rounded to 3 decimals.
    """
    # The context manager closes the run when the block exits, so no explicit
    # mlflow.end_run() call is needed inside it.
    with mlflow.start_run():

        # tuning on max_depth, n_estimators and max_leaf_nodes for the example
        params = {
            "max_depth": trial.suggest_int("max_depth", 2, 10),
            "n_estimators": trial.suggest_int("n_estimators", 10, 100, step=10),
            "max_leaf_nodes": trial.suggest_int("max_leaf_nodes", 2, 10),
        }

        # logging to MLFlow
        mlflow.log_params(params)

        # as a test using RF classifier (fixed random_state keeps the model deterministic)
        clf = RandomForestClassifier(random_state=0, **params)

        # using sklearn K-fold CV, accuracy as score metric.
        # Train-set scores are not requested: only the test scores are used below,
        # and computing them would add scoring work to every fold of every trial.
        scores = cross_validate(clf, X, y, cv=5, scoring="accuracy")

        # we do an avg of the test set scores
        acc = round(np.mean(scores["test_score"]), 3)

        # logging to MLFlow
        mlflow.log_metric("acc", acc)

    return acc

In [9]:
#
# execute the study
#
# Seed the sampler so that a fresh "Restart & Run All" proposes the same sequence of
# hyper-parameters; TPE is the sampler Optuna uses by default for a single-objective
# study, so only the seed is made explicit here.
SAMPLER_SEED = 0

study = optuna.create_study(
    study_name="mlflow-optuna-1",
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=SAMPLER_SEED),
)

study.optimize(objective, n_trials=50)

[32m[I 2022-02-16 07:37:34,898][0m A new study created in memory with name: mlflow-optuna-1[0m
[32m[I 2022-02-16 07:37:35,973][0m Trial 0 finished with value: 0.907 and parameters: {'max_depth': 9, 'n_estimators': 40, 'max_leaf_nodes': 3}. Best is trial 0 with value: 0.907.[0m
[32m[I 2022-02-16 07:37:37,227][0m Trial 1 finished with value: 0.921 and parameters: {'max_depth': 9, 'n_estimators': 80, 'max_leaf_nodes': 9}. Best is trial 1 with value: 0.921.[0m
[32m[I 2022-02-16 07:37:37,696][0m Trial 2 finished with value: 0.877 and parameters: {'max_depth': 2, 'n_estimators': 20, 'max_leaf_nodes': 2}. Best is trial 1 with value: 0.921.[0m
[32m[I 2022-02-16 07:37:38,378][0m Trial 3 finished with value: 0.919 and parameters: {'max_depth': 7, 'n_estimators': 40, 'max_leaf_nodes': 8}. Best is trial 1 with value: 0.921.[0m
[32m[I 2022-02-16 07:37:38,621][0m Trial 4 finished with value: 0.877 and parameters: {'max_depth': 7, 'n_estimators': 10, 'max_leaf_nodes': 2}. Best is tri

In [10]:
# inspect the outcome: the trial that reached the best objective value
trial = study.best_trial

print(f"Best trial: {trial}")

Best trial: FrozenTrial(number=13, values=[0.926], datetime_start=datetime.datetime(2022, 2, 16, 7, 37, 46, 324492), datetime_complete=datetime.datetime(2022, 2, 16, 7, 37, 47, 800875), params={'max_depth': 5, 'n_estimators': 100, 'max_leaf_nodes': 10}, distributions={'max_depth': IntUniformDistribution(high=10, low=2, step=1), 'n_estimators': IntUniformDistribution(high=100, low=10, step=10), 'max_leaf_nodes': IntUniformDistribution(high=10, low=2, step=1)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=13, state=TrialState.COMPLETE, value=None)
