# Install Dependencies

In [None]:
!pip install mlflow --quiet

[K     |████████████████████████████████| 17.8 MB 5.3 MB/s 
[K     |████████████████████████████████| 62 kB 656 kB/s 
[K     |████████████████████████████████| 209 kB 53.8 MB/s 
[K     |████████████████████████████████| 79 kB 7.4 MB/s 
[K     |████████████████████████████████| 146 kB 40.3 MB/s 
[K     |████████████████████████████████| 596 kB 48.3 MB/s 
[K     |████████████████████████████████| 181 kB 46.5 MB/s 
[K     |████████████████████████████████| 54 kB 2.4 MB/s 
[K     |████████████████████████████████| 63 kB 1.6 MB/s 
[K     |████████████████████████████████| 78 kB 2.4 MB/s 
[?25h  Building wheel for databricks-cli (setup.py) ... [?25l[?25hdone


# Load Dependencies

In [None]:
import os
import warnings
import sys

import pandas as pd
import numpy as np

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor

In [None]:
import mlflow
import mlflow.sklearn

In [None]:
import logging
# Configure the root logger to emit records at WARNING level and above.
logging.basicConfig(level=logging.WARN)
# Notebook-level logger; prepare_data() uses it to report download failures.
logger = logging.getLogger(__name__)

# Utilities for Data and Metrics

In [None]:
def prepare_data():
    """Download the red wine-quality CSV and split it into train/test sets.

    Side effects:
        Installs a global "ignore" warnings filter and seeds NumPy's global
        random generator with 42.

    Returns:
        tuple: ``(train_x, test_x, train_y, test_y)``. The ``*_x`` frames hold
        every column except ``"quality"``; the ``*_y`` frames hold only the
        ``"quality"`` column. The split is 75% train / 25% test with
        ``random_state=42``.

    Raises:
        Exception: whatever ``pd.read_csv`` raised when the CSV could not be
        fetched or parsed. The error is logged first and then re-raised.
    """
    warnings.filterwarnings("ignore")
    np.random.seed(42)

    # Read the wine-quality csv file from the URL
    csv_url =\
        'http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
    try:
        data = pd.read_csv(csv_url, sep=';')
    except Exception as e:
        logger.exception(
            "Unable to download training & test CSV, check your internet connection. Error: %s", e)
        # Re-raise: without the frame there is nothing to split, and falling
        # through would only fail later on the unbound `data` variable,
        # hiding the real cause of the failure.
        raise

    # Split the data into training and test sets. (0.75, 0.25) split.
    train, test = train_test_split(data, test_size=0.25, random_state=42)

    # The predicted column is "quality"; everything else is a feature.
    train_x = train.drop(["quality"], axis=1)
    test_x = test.drop(["quality"], axis=1)
    # Double brackets keep the targets as single-column DataFrames.
    train_y = train[["quality"]]
    test_y = test[["quality"]]
    return train_x, test_x, train_y, test_y


def eval_metrics(actual, pred):
    """Score predictions against the true target values.

    Args:
        actual: ground-truth target values.
        pred: predicted target values.

    Returns:
        tuple: ``(rmse, mae, r2)`` - the square root of the mean squared
        error, the mean absolute error, and the R2 score, in that order.
    """
    mse = mean_squared_error(actual, pred)
    return np.sqrt(mse), mean_absolute_error(actual, pred), r2_score(actual, pred)

# Load Dataset

In [None]:
# Fetch the dataset once and unpack the four splits.
X_train, X_test, y_train, y_test = prepare_data()

# Bundle the splits under the keys that the training helpers look up.
data = dict(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

# Preview the first rows of the training features.
data['X_train'].head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
582,11.7,0.49,0.49,2.2,0.083,5.0,15.0,1.0,3.19,0.43,9.2
626,8.8,0.6,0.29,2.2,0.098,5.0,15.0,0.9988,3.36,0.49,9.1
1030,7.1,0.59,0.0,2.1,0.091,9.0,14.0,0.99488,3.42,0.55,11.5
620,8.3,0.54,0.24,3.4,0.076,16.0,112.0,0.9976,3.27,0.61,9.4
490,9.3,0.775,0.27,2.8,0.078,24.0,56.0,0.9984,3.31,0.67,10.6


In [None]:
# Preview the first rows of the training target (the "quality" column).
data['y_train'].head()

Unnamed: 0,quality
582,5
626,5
1030,7
620,5
490,6


# Utilities for Modeling and Tracking Experiments

In [None]:
def train_elasticnet(data, alpha=0.5, l1_ratio=0.5):
    """Fit an ElasticNet regressor and record the run in MLflow.

    Args:
        data: mapping with the keys ``'X_train'``, ``'y_train'``, ``'X_test'``
            and ``'y_test'``.
        alpha: passed to ``ElasticNet`` as ``alpha``.
        l1_ratio: passed to ``ElasticNet`` as ``l1_ratio``.

    The whole fit/evaluate/log sequence happens inside a single
    ``mlflow.start_run()`` context. Nothing is returned; the metrics are
    printed and logged together with the parameters and the fitted model.
    """
    with mlflow.start_run():
        # Fit the model on the training split (fixed random_state=42).
        model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)
        model.fit(data['X_train'], data['y_train'])

        # Score the held-out split.
        predictions = model.predict(data['X_test'])
        rmse, mae, r2 = eval_metrics(data['y_test'], predictions)

        # Echo the results in the cell output.
        print("Elasticnet model (alpha=%f, l1_ratio=%f):" % (alpha, l1_ratio))
        print("  RMSE: %s" % rmse)
        print("  MAE: %s" % mae)
        print("  R2: %s" % r2)

        # Record parameters first, then metrics, in a fixed order.
        run_params = (('Model', 'ElasticNet'), ("alpha", alpha), ("l1_ratio", l1_ratio))
        for key, value in run_params:
            mlflow.log_param(key, value)

        run_metrics = (("rmse", rmse), ("r2", r2), ("mae", mae))
        for key, value in run_metrics:
            mlflow.log_metric(key, value)

        # Store the fitted estimator under the "model" artifact path.
        mlflow.sklearn.log_model(model, "model")

In [None]:
def train_random_forest(data, n_trees=100, max_depth=None):
    """Fit a RandomForestRegressor and record the run in MLflow.

    Args:
        data: mapping with the keys ``'X_train'``, ``'y_train'``, ``'X_test'``
            and ``'y_test'``.
        n_trees: passed to ``RandomForestRegressor`` as ``n_estimators``.
        max_depth: passed to ``RandomForestRegressor`` as ``max_depth``.

    The whole fit/evaluate/log sequence happens inside a single
    ``mlflow.start_run()`` context. Nothing is returned; the metrics are
    printed and logged together with the parameters and the fitted model.
    """
    with mlflow.start_run():
        # Fit the forest on the training split (fixed random_state=42).
        forest = RandomForestRegressor(n_estimators=n_trees, max_depth=max_depth, random_state=42)
        forest.fit(data['X_train'], data['y_train'])

        # Score the held-out split.
        predictions = forest.predict(data['X_test'])
        rmse, mae, r2 = eval_metrics(data['y_test'], predictions)

        # Echo the results in the cell output.
        print("Random Forest model (n_estimators={}, max_depth={}):".format(n_trees, max_depth))
        print("  RMSE: %s" % rmse)
        print("  MAE: %s" % mae)
        print("  R2: %s" % r2)

        # Record parameters first, then metrics, in a fixed order.
        run_params = (('Model', 'Random Forest'), ("n_estimators", n_trees), ("max_depth", max_depth))
        for key, value in run_params:
            mlflow.log_param(key, value)

        run_metrics = (("rmse", rmse), ("r2", r2), ("mae", mae))
        for key, value in run_metrics:
            mlflow.log_metric(key, value)

        # Store the fitted estimator under the "model" artifact path.
        mlflow.sklearn.log_model(forest, "model")

## Experiments

In [None]:
# ElasticNet run with the function defaults (alpha=0.5, l1_ratio=0.5).
train_elasticnet(data)

Elasticnet model (alpha=0.500000, l1_ratio=0.500000):
  RMSE: 0.7436470916334205
  MAE: 0.6042761768399746
  R2: 0.10601910075094545


In [None]:
# ElasticNet run with alpha=0.2 and l1_ratio=0.2, logged as its own MLflow run.
train_elasticnet(data, alpha=0.2, l1_ratio=0.2)

Elasticnet model (alpha=0.200000, l1_ratio=0.200000):
  RMSE: 0.6926302996485334
  MAE: 0.5529215843503876
  R2: 0.22447206750598891


In [None]:
# ElasticNet run with alpha=0.1 and l1_ratio=0.1, logged as its own MLflow run.
train_elasticnet(data, alpha=0.1, l1_ratio=0.1)

Elasticnet model (alpha=0.100000, l1_ratio=0.100000):
  RMSE: 0.6720681915647143
  MAE: 0.5357524536395577
  R2: 0.2698347597060824


In [None]:
# ElasticNet run with alpha=0.7 and l1_ratio=0.3, logged as its own MLflow run.
train_elasticnet(data, alpha=0.7, l1_ratio=0.3)

Elasticnet model (alpha=0.700000, l1_ratio=0.300000):
  RMSE: 0.7400317080162558
  MAE: 0.5999795735284686
  R2: 0.11469049134515674


In [None]:
# Random forest run with the function defaults (n_trees=100, max_depth=None).
train_random_forest(data)

Random Forest model (n_estimators=100, max_depth=None):
  RMSE: 0.554140325188485
  MAE: 0.42284999999999995
  R2: 0.5035974741096237


In [None]:
# Random forest run with 500 trees and no max_depth set.
train_random_forest(data, n_trees=500, max_depth=None)

Random Forest model (n_estimators=500, max_depth=None):
  RMSE: 0.5572180004271219
  MAE: 0.4225
  R2: 0.4980681586259156


In [None]:
# Random forest run with 1000 trees and no max_depth set.
train_random_forest(data, n_trees=1000, max_depth=None)

Random Forest model (n_estimators=1000, max_depth=None):
  RMSE: 0.5590675227555255
  MAE: 0.42322000000000004
  R2: 0.49473059661530694


In [None]:
# Random forest run with 500 trees, each limited to max_depth=5.
train_random_forest(data, n_trees=500, max_depth=5)

Random Forest model (n_estimators=500, max_depth=5):
  RMSE: 0.6100863372072169
  MAE: 0.489118185020059
  R2: 0.3983040746096802


These runs create files in a folder named `mlruns`, which MLflow uses to populate its UI.

## MLflow UI

Run `mlflow ui` in a terminal
<br>and view it at http://localhost:5000 if you are running Jupyter locally.
<br>If you are running in Colab, the UI has to be exposed through an ngrok tunnel instead.

In [None]:
# Running Jupyter locally? Uncomment the next line to serve the MLflow UI on port 5000.
# !mlflow ui --port 5000

In [None]:
!pip install pyngrok --quiet

[?25l[K     |▍                               | 10 kB 17.4 MB/s eta 0:00:01[K     |▉                               | 20 kB 19.8 MB/s eta 0:00:01[K     |█▎                              | 30 kB 11.8 MB/s eta 0:00:01[K     |█▊                              | 40 kB 9.7 MB/s eta 0:00:01[K     |██▏                             | 51 kB 4.0 MB/s eta 0:00:01[K     |██▋                             | 61 kB 4.8 MB/s eta 0:00:01[K     |███                             | 71 kB 5.4 MB/s eta 0:00:01[K     |███▌                            | 81 kB 4.2 MB/s eta 0:00:01[K     |████                            | 92 kB 4.7 MB/s eta 0:00:01[K     |████▍                           | 102 kB 5.2 MB/s eta 0:00:01[K     |████▉                           | 112 kB 5.2 MB/s eta 0:00:01[K     |█████▎                          | 122 kB 5.2 MB/s eta 0:00:01[K     |█████▊                          | 133 kB 5.2 MB/s eta 0:00:01[K     |██████▏                         | 143 kB 5.2 MB/s eta 0:00:01[K  

In [None]:
from pyngrok import ngrok
from getpass import getpass

# Terminate any ngrok tunnels that are still open (e.g. from an earlier run of this cell)
ngrok.kill()

# Prompt for the ngrok authtoken (this cell always asks for one) and register it with pyngrok.
# getpass is used so the token is not hardcoded in the notebook.
# Get your authtoken from https://dashboard.ngrok.com/auth
NGROK_AUTH_TOKEN = getpass('Enter the ngrok authtoken: ')
ngrok.set_auth_token(NGROK_AUTH_TOKEN)

# Open an HTTPS tunnel to local port 5000 (http://localhost:5000), where the MLflow UI is served
ngrok_tunnel = ngrok.connect(addr="5000", proto="http", bind_tls=True)
# Show the public URL under which the tunnel exposes the UI
print("MLflow Tracking UI:", ngrok_tunnel.public_url)

Enter the ngrok authtoken: ··········
MLflow Tracking UI: https://6aca-34-73-28-52.ngrok.io


In [None]:
# Serve the MLflow UI on port 5000, the local port the ngrok tunnel above forwards to.
!mlflow ui --port 5000

[2022-06-09 11:13:10 +0000] [295] [INFO] Starting gunicorn 20.1.0
[2022-06-09 11:13:10 +0000] [295] [INFO] Listening at: http://127.0.0.1:5000 (295)
[2022-06-09 11:13:10 +0000] [295] [INFO] Using worker: sync
[2022-06-09 11:13:10 +0000] [298] [INFO] Booting worker with pid: 298
