In [8]:
# Notebook-wide configuration. Everything a reader may want to tune lives here.
import os  # imported locally: this cell is executed before the imports cell

# Seed shared by the train/test split, the CV splitter and the estimator.
RANDOM_STATE = 123

MLFLOW_EXPERIMENT_NAME = 'sklearn-elasticnet-diabetes'

# Backend store for MLflow tracking. When no tracking URI is configured at all,
# MLflow falls back to the local directory ./mlruns; here a MySQL backend is used.
# The URI embeds database credentials, so it is read from the MLFLOW_TRACKING_URI
# environment variable when that is set. The literal below is only a fallback for
# the local development database and must not be reused for a shared deployment.
MLFLOW_TRACKING_URI = os.environ.get(
    'MLFLOW_TRACKING_URI',
    'mysql+pymysql://mlflowusr:mlflowusrpwd@localhost:3306/mlflow',
)

# Cross-validation / model-selection settings.
CV_N_FOLD = 5
SELECTION_METRIC = 'r2'   # scorer used to pick (and refit) the best candidate
TEST_SIZE = 0.3           # fraction of the records held out as the test set

In [6]:
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
import sklearn.datasets
import sklearn.metrics as metrics
import sklearn.model_selection 
from sklearn.linear_model import ElasticNet

In [16]:
# Train an ElasticNet regressor on the scikit-learn diabetes dataset, select its
# hyperparameters with a cross-validated grid search, and record parameters,
# metrics and the refit model in a single MLflow run.
dataset = sklearn.datasets.load_diabetes()
X = pd.DataFrame(data=dataset['data'], columns=dataset['feature_names'])
y = pd.Series(data=dataset['target'])

mlflow.set_tracking_uri(uri=MLFLOW_TRACKING_URI)
mlflow.set_experiment(experiment_name=MLFLOW_EXPERIMENT_NAME)
with mlflow.start_run():
    mlflow.log_metric('total_record_count', X.shape[0])

    # Hold out a test set that is never seen by the grid search.
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
        X, y,
        random_state=RANDOM_STATE,
        shuffle=True,
        test_size=TEST_SIZE)

    mlflow.log_metric('train_record_count', X_train.shape[0])
    mlflow.log_metric('test_record_count', X_test.shape[0])
    mlflow.log_param('test_size_ratio', TEST_SIZE)

    cv_splitter = sklearn.model_selection.KFold(n_splits=CV_N_FOLD,
                                                shuffle=True,
                                                random_state=RANDOM_STATE)

    mlflow.log_param('cv', cv_splitter.__class__.__name__)
    mlflow.log_param('cv_n_fold', CV_N_FOLD)

    # Two scorers are evaluated for every candidate; the one named by
    # SELECTION_METRIC decides which candidate is refit on the full training set.
    estimator = sklearn.model_selection.GridSearchCV(
        estimator=ElasticNet(random_state=RANDOM_STATE),
        param_grid={
            'alpha': np.logspace(-3, 3, 19, base=10),
            'l1_ratio': np.linspace(0.3, 0.7, 17)
        },
        scoring=['neg_mean_squared_error', SELECTION_METRIC],
        cv=cv_splitter,
        refit=SELECTION_METRIC,
        return_train_score=True)

    mlflow.log_param('hyperparameter_selection', estimator.__class__.__name__)
    mlflow.log_param('selection_metric', SELECTION_METRIC)

    estimator.fit(X_train, y_train)

    best_params = estimator.best_params_

    # Parameters and metrics are returned as numpy.float64 objects.
    # They are cast to the built-in float type because the pymysql driver does
    # not provide an encoder for numpy.float64.
    mlflow.log_param('alpha', float(best_params['alpha']))
    mlflow.log_param('l1_ratio', float(best_params['l1_ratio']))

    cv_results = estimator.cv_results_
    best_model_index = estimator.best_index_
    best_model = estimator.best_estimator_

    y_test_pred = best_model.predict(X_test)

    # The scorer reports *negated* MSE, so the mean is flipped back to a
    # positive MSE. The standard deviation is NOT negated: negating the scores
    # does not change their spread, and a negative std would be meaningless.
    mlflow.log_metric('mean_train_mse', float(-1.0*cv_results['mean_train_neg_mean_squared_error'][best_model_index]))
    mlflow.log_metric('std_train_mse', float(cv_results['std_train_neg_mean_squared_error'][best_model_index]))

    # In cv_results_ the "test" keys refer to the held-out CV folds, logged here
    # as validation ("val") metrics to keep them apart from the real test set.
    mlflow.log_metric('mean_val_mse', float(-1.0*cv_results['mean_test_neg_mean_squared_error'][best_model_index]))
    mlflow.log_metric('std_val_mse', float(cv_results['std_test_neg_mean_squared_error'][best_model_index]))

    mlflow.log_metric('test_mse', float(metrics.mean_squared_error(y_test, y_test_pred)))

    mlflow.log_metric('mean_train_r2', float(cv_results['mean_train_r2'][best_model_index]))
    mlflow.log_metric('std_train_r2', float(cv_results['std_train_r2'][best_model_index]))

    mlflow.log_metric('mean_val_r2', float(cv_results['mean_test_r2'][best_model_index]))
    mlflow.log_metric('std_val_r2', float(cv_results['std_test_r2'][best_model_index]))

    mlflow.log_metric('test_r2', float(metrics.r2_score(y_test, y_test_pred)))

    # NB: Models do not have to be registered in the MLflow Model Registry after each run.
    # We can choose later which of the models produced by many runs to include in the
    # Model Registry, so logging the model is kept separate from registering it.
    mlflow.sklearn.log_model(sk_model=best_model,
                             artifact_path='sklearn-model')

INFO  [alembic.runtime.migration] Context impl MySQLImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl MySQLImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl MySQLImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl MySQLImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl MySQLImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl MySQLImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl MySQLImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl MySQLImpl.
INFO  [alembic.runtime.migration] Will assume non-tra