## Experiment tracking using Neptune and MLflow

In [None]:
#import base libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from preprocessing_functions import log_transform

In [None]:
# importing model metrics
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [None]:
# import serializer
import joblib

In [None]:
# import neptune libraries and sklearn integration
import neptune.new as neptune
import neptune.new.integrations.sklearn as npt_skl

In [None]:
# import mlflow and libraries
import mlflow
import mlflow.sklearn

### Load models and variables

In [None]:
# load models
pipeline_lr = joblib.load("../models/lr_model.joblib")
pipeline_rf = joblib.load("../models/rf_model.joblib")

In [None]:
# load splitted data
df_train = pd.read_csv('../data/train.csv', index_col=0)
df_test = pd.read_csv('../data/test.csv', index_col=0)

In [None]:
#Split the data into X and y
X_train = df_train.drop(columns=['Radiation'])
y_train = df_train['Radiation']

X_test = df_test.drop(columns=['Radiation'])
y_test = df_test['Radiation']

In [None]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

### Experiment tracking with Neptune

In [None]:
run = neptune.init(
    project="max-lutz/MLOps-course",
    api_token="eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vYXBwLm5lcHR1bmUuYWkiLCJhcGlfdXJsIjoiaHR0cHM6Ly9hcHAubmVwdHVuZS5haSIsImFwaV9rZXkiOiI4MGE3NGExZS1kMWQ2LTQ5MTMtODBkMy05ZjI5MmUwZWY0MWEifQ==",
    name='solar-irradiation'
)  # your credentials

#### Track files, models and folders

In [None]:
# tracking files
#run["cleaned_datasets"].track_files("../data/cleaned/")
#run["raw_datasets"].track_files("../data/raw/")

# tracking folders
run["notebooks"].track_files("../notebooks/")

#### train and test models while tracking the experiment

In [None]:
run['lr_summary'] = npt_skl.create_regressor_summary(pipeline_lr, X_train, X_test, y_train, y_test);
run['rf_summary'] = npt_skl.create_regressor_summary(pipeline_rf, X_train, X_test, y_train, y_test);

#### stop experiment

In [None]:
run.stop()

### Experiment tracking with MLflow

#### Evaluation of Linear Regressor and Random Forest

In [None]:
# model metrics for linear regression
y_test_lr_pred = pipeline_lr.predict(X_test)
mse_test_lr = mean_squared_error(y_test, y_test_lr_pred, squared=True)
mae_test_lr = mean_absolute_error(y_test, y_test_lr_pred)
r2_test_lr = r2_score(y_test, y_test_lr_pred)

In [None]:
# model metrics for Random Forest Regressor
y_test_rf_pred = pipeline_rf.predict(X_test)
mse_test_rf = mean_squared_error(y_test, y_test_rf_pred, squared=True)
mae_test_rf = mean_absolute_error(y_test, y_test_rf_pred)
r2_test_rf = r2_score(y_test, y_test_rf_pred)

#### manually log experiment

<span style="color:red"> **Att:** before running commands below, run on the bash (in the same folder of this file , i.e. notebooks/)the following command:   
    $ mlflow server --backend-store-uri sqlite:///mlflow.db  --default-artifact-root ./artifacts --host 0.0.0.0
</span>

In [None]:
mlflow.set_tracking_uri("http://127.0.0.1:5000/")
experiment = mlflow.get_experiment('0')

In [None]:
print("Name of experiment: {}".format(experiment.name))
print("Location of Artifact: {}".format(experiment.artifact_location))
print("Life cycle phase: {}".format(experiment.lifecycle_stage))
print("Experiment_ID: {}".format(experiment.experiment_id))

In [None]:
# start experiment tracking
run = mlflow.start_run(run_name="solar_irradiation")
run_id = run.info.run_id

In [None]:
run_id

In [None]:
# log mlflow attributes for mlflow UI

# log metrics for linear regression
mlflow.log_metric('mse_lr', mse_test_lr)
mlflow.log_metric('mae_lr', mae_test_lr)
mlflow.log_metric('r2_lr', r2_test_lr)

# log metrics for Random Forest regressor
mlflow.log_metric('mse_rf', mse_test_rf)
mlflow.log_metric('mae_rf', mae_test_rf)
mlflow.log_metric('r2_rf', r2_test_rf)

In [None]:
mlflow.sklearn.get_default_conda_env()

In [None]:
type(pipeline_lr)

In [None]:
# log models experiments
path = 'default'
mlflow.sklearn.log_model(pipeline_lr, path, registered_model_name='linear-reg-model')
mlflow.sklearn.log_model(pipeline_rf, path, registered_model_name='random-forest-reg-model')


In [None]:
# log artifacts (output files)

# plots
mlflow.log_artifact("ml_flow_plots/lr_pred_and_fit.jpg", artifact_path="plots")
mlflow.log_artifact("ml_flow_plots/rf_pred_and_fit.jpg", artifact_path="plots")
mlflow.log_artifact("ml_flow_plots/lr_jointplot.jpg", artifact_path="plots")
mlflow.log_artifact("ml_flow_plots/rf_jointplot.jpg", artifact_path="plots")

In [None]:
# end experiment
mlflow.end_run()

#### Auto log experiment

In [None]:
# enable auto logging --> log_models=True means that "trained models are logged as MLflow model artifacts"
mlflow.sklearn.autolog(log_models=True)

In [None]:
with mlflow.start_run(experiment_id=experiment.experiment_id, run_name="auto_log_models") as run:
    pipeline_lr.fit(X_train, y_train)
    y_test_lr_autologPred = pipeline_lr.predict(X_test)
    pipeline_rf.fit(X_train, y_train)
    y_test_rf_autologPred = pipeline_rf.predict(X_test)

In [None]:
# end tracking
mlflow.end_run()