<a href="https://colab.research.google.com/github/paudan/mlflow_workshop/blob/main/LC1_MLFlow_Tracking_Regression_SwissHousing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<center><a target="_blank" href="https://www.sds2022.ch/"><img src="https://drive.google.com/uc?id=1S7k7kTXs9qIylw3C7LA9rHkLycjlY8te" width="500" style="background:none; border:none; box-shadow:none;" /></a> </center>

<center><a target="_blank" href="http://www.sit.academy"><img src="https://drive.google.com/uc?id=1x9_jQgLhozCSWDSaOdVxKmxOEAe_OLgV" width="250" style="background:none; border:none; box-shadow:none;" /></a> </center>

_____

<center> <h1> Live Coding  </h1> </center>

<p style="margin-bottom:1cm;"></p>

_____

<center>SIT Academy, 2022</center>



# MLFlow Workshop - Sequence 1 - Tracking API

MLflow Tracking is an API and UI for logging:
- parameters, 
- code versions, 
- metrics, and 
- artifacts 

when running your machine learning code and for later visualizing the results. You can use MLflow Tracking in any environment (for example, a standalone script or a notebook) to log results to local files or to a server, then compare multiple runs. Teams can also use it to compare results from different users.


# The Use-Case: 

Predict the price of the property based on different features from the `swiss_housing_data.csv` such as: 

"type", "room_num", "floor", "area_m2", "floors_num", "year_built", "last_refurbishment", "city", "lat", "lon", "canton"

# Install Dependencies

In [1]:
!pip install mlflow --quiet

# Load Dependencies

In [2]:
import os
import warnings
import sys

import pandas as pd
import numpy as np

In [3]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [4]:
import mlflow
import mlflow.sklearn

In [5]:
import logging
logging.basicConfig(level=logging.WARN)
logger = logging.getLogger(__name__)

# Data mapping

In [6]:
!pip install pyyaml==5.4.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [7]:
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio

  defaults = yaml.load(f)


In [8]:
# The dataset is shared through a Google Drive "view" link; the file id is the
# second-to-last path segment and is re-used to build a direct-download URL.
orig_url = "https://drive.google.com/file/d/1eNTyJc4jXJMkLPXW0eY6LL7_P9YN1GWO/view"
file_id = orig_url.split('/')[-2]
data_path = f'https://drive.google.com/uc?export=download&id={file_id}'

swisshousing_data = pd.read_csv(data_path)
swisshousing_data.head()

Unnamed: 0.1,Unnamed: 0,price,type,room_num,floor,area_m2,floors_num,year_built,last_refurbishment,city,zip,lat,lon,canton
0,0,1235000,Apartment,2.5,GF,138.0,4.0,2015.0,2018.0,Aeugst am Albis,8914,47.2822,8.48965,Kanton Zürich
1,1,650000,Apartment,4.5,1,121.0,1.0,1987.0,2021.0,Agno,6982,46.0005,8.9028,Ticino
2,2,1062900,Apartment,2.5,1,63.0,1.0,,,Bissone,6816,45.951,8.9655,Ticino
3,3,1125500,Attic flat,2.5,3,63.0,1.0,,,Bissone,6816,45.951,8.9655,Ticino
4,4,2180700,Apartment,3.5,2,125.0,1.0,,,Bissone,6816,45.951,8.9655,Ticino


In [9]:
swisshousing_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2191 entries, 0 to 2190
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Unnamed: 0          2191 non-null   int64  
 1   price               2191 non-null   object 
 2   type                2191 non-null   object 
 3   room_num            2077 non-null   float64
 4   floor               2191 non-null   object 
 5   area_m2             2026 non-null   float64
 6   floors_num          2191 non-null   float64
 7   year_built          1657 non-null   float64
 8   last_refurbishment  1728 non-null   float64
 9   city                2191 non-null   object 
 10  zip                 2191 non-null   int64  
 11  lat                 2191 non-null   float64
 12  lon                 2191 non-null   float64
 13  canton              2191 non-null   object 
dtypes: float64(7), int64(2), object(5)
memory usage: 239.8+ KB


In [10]:
# Data cleaning
# The price column is read as text; remove the comma separators literally
# (regex=False) and convert the result to a numeric dtype.
swisshousing_data["price"] = pd.to_numeric(
    swisshousing_data["price"].str.replace(',', '', regex=False)
)
# Name the axis via `columns=`: passing it positionally (`drop(labels, 1)`) is
# deprecated and is rejected by pandas 2.x.
swisshousing_data = swisshousing_data.drop(columns=["Unnamed: 0", 'zip'])
# Keep only rows in which every remaining column is populated.
swisshousing_data = swisshousing_data.dropna()

  after removing the cwd from sys.path.


In [11]:
# Draw every listing on a map; marker colour and marker size both encode the price.
fig = px.scatter_mapbox(
    swisshousing_data,
    lat='lat',
    lon='lon',
    color='price',
    size='price',
    hover_data=['type', 'year_built', 'area_m2', 'canton'],
    zoom=7.5,
    height=700,
    labels={"price": "Housing Price"},
)
# Use the OpenStreetMap base layer and remove the figure margins in one layout update.
fig.update_layout(
    mapbox_style="open-street-map",
    margin={"r": 0, "t": 0, "l": 0, "b": 0},
)
fig.show()

# Utilities for Data and Metrics

In [12]:
def prepare_data():
    """Download the Swiss housing CSV, clean it and split it for training.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` produced by ``train_test_split``
        with ``test_size=0.2`` and ``random_state=42``. ``y_*`` hold the
        ``price`` column, ``X_*`` hold every other remaining column.

    Raises
    ------
    Exception
        Whatever ``pd.read_csv`` raised when the download/parsing failed. The
        error is logged first and then re-raised.
    """
    # NOTE: both calls below change process-wide state (all warnings are
    # silenced and NumPy's global RNG is seeded), not just this function.
    warnings.filterwarnings("ignore")
    np.random.seed(42)

    # Build a direct-download URL from the Google Drive "view" link
    # (file id = second-to-last path segment).
    orig_url = "https://drive.google.com/file/d/1eNTyJc4jXJMkLPXW0eY6LL7_P9YN1GWO/view"
    file_id = orig_url.split('/')[-2]
    data_path = 'https://drive.google.com/uc?export=download&id=' + file_id

    try:
        data = pd.read_csv(data_path)
    except Exception as e:
        logger.exception(
            "Unable to download training & test CSV, check your internet connection. Error: %s", e)
        # Without re-raising, execution would continue with `data` unbound and
        # fail on the next line with an unrelated, misleading error.
        raise

    # Prices are written with comma separators (e.g. "1,235,000"); strip the
    # commas literally and convert the column to numbers.
    data["price"] = pd.to_numeric(data["price"].str.replace(',', '', regex=False))
    # `columns=` instead of a positional axis: `drop(labels, 1)` is deprecated
    # and is rejected by pandas 2.x.
    data = data.drop(columns=["Unnamed: 0", 'zip'])
    data = data.dropna()

    y = data["price"]
    X = data.drop(columns="price")

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    return X_train, X_test, y_train, y_test


def eval_metrics(actual, pred):
    """Score predictions against the true values.

    Returns a ``(rmse, mae, r2)`` tuple, where ``rmse`` is the square root of
    the mean squared error.
    """
    mse = mean_squared_error(actual, pred)
    return np.sqrt(mse), mean_absolute_error(actual, pred), r2_score(actual, pred)

# Load Dataset

In [13]:
X_train, X_test, y_train, y_test = prepare_data()

# Bundle the four splits under string keys so the training helpers can take a
# single `data` argument.
data = dict(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

data['X_train'].head()

Unnamed: 0,type,room_num,floor,area_m2,floors_num,year_built,last_refurbishment,city,lat,lon,canton
276,Apartment,4.5,2,145.0,3.0,2020.0,2020.0,Mendrisio,45.8862,8.988967,Ticino
1171,Row house,4.5,4,140.0,4.0,1984.0,2017.0,Agno,46.0005,8.9028,Ticino
1894,Single house,7.5,GF,143.0,1.0,1971.0,1971.0,St-Maurice,46.1988,6.99565,Canton du Valais
117,Apartment,5.5,1,174.0,1.0,2014.0,2014.0,Cheseaux-sur-Lausanne,46.5822,6.5958,Canton de Vaud
2028,Villa,8.5,GF,400.0,3.0,1972.0,2005.0,Aigle,46.3147,6.9716,Canton de Vaud


In [14]:
data['y_train'].head()

276     1060000
1171     900000
1894     870000
117     1450000
2028    2150000
Name: price, dtype: int64

# Utilities for Modeling and Tracking Experiments

In [15]:
def train_random_forest(data, n_trees=10, max_depth=None):
    """Fit a random-forest price model and record the run with MLflow.

    Parameters
    ----------
    data : dict
        Must provide the keys 'X_train', 'X_test', 'y_train' and 'y_test'.
    n_trees : int, default 10
        Forwarded to ``RandomForestRegressor`` as ``n_estimators``.
    max_depth : int or None, default None
        Forwarded to ``RandomForestRegressor`` as ``max_depth``.

    The function returns nothing; it prints RMSE/MAE/R2 on the test split and
    logs the parameters, the metrics and the fitted pipeline to the MLflow run.
    """
    # Everything happens inside one MLflow run so all logging lands together.
    with mlflow.start_run():
        text_columns = ['type', 'floor', 'city', 'canton']
        number_columns = ['room_num', 'area_m2', 'floors_num', 'year_built',
                          'last_refurbishment', 'lat', 'lon']

        # Preprocessing: scale the numeric columns, one-hot encode the text ones.
        scale_numbers = Pipeline(steps=[("scaler", StandardScaler())])
        encode_text = Pipeline(steps=[("onehot", OneHotEncoder(handle_unknown="ignore"))])
        column_prep = ColumnTransformer(transformers=[
            ("num", scale_numbers, number_columns),
            ("cat", encode_text, text_columns),
        ])

        # Model: preprocessing followed by the forest, fitted on the training split.
        forest = RandomForestRegressor(n_estimators=n_trees, max_depth=max_depth, random_state=42)
        pipeline_rf = Pipeline([("col_transformer", column_prep), ("estimator", forest)])
        pipeline_rf.fit(data['X_train'], data['y_train'])

        # Score on the held-out split.
        test_predictions = pipeline_rf.predict(data['X_test'])
        rmse, mae, r2 = eval_metrics(data['y_test'], test_predictions)

        # Console summary.
        print("Random Forest model (n_estimators={}, max_depth={}):".format(n_trees, max_depth))
        for template, score in (("  RMSE: %s", rmse), ("  MAE: %s", mae), ("  R2: %s", r2)):
            print(template % score)

        # MLflow: parameters first, then metrics, then the fitted pipeline.
        run_params = (('Model', 'Random Forest'), ("n_estimators", n_trees), ("max_depth", max_depth))
        for key, value in run_params:
            mlflow.log_param(key, value)

        run_metrics = (("rmse", rmse), ("r2", r2), ("mae", mae))
        for key, value in run_metrics:
            mlflow.log_metric(key, value)

        mlflow.sklearn.log_model(pipeline_rf, "model")

## Experiments

In [16]:
train_random_forest(data)

Random Forest model (n_estimators=10, max_depth=None):
  RMSE: 1035489.2909161543
  MAE: 468376.5597176982
  R2: 0.686580903708025


In [17]:
train_random_forest(data, n_trees=500, max_depth=None)

Random Forest model (n_estimators=500, max_depth=None):
  RMSE: 955108.4001582841
  MAE: 425444.0136091205
  R2: 0.7333512452234665


In [18]:
train_random_forest(data, n_trees=1000, max_depth=None)

Random Forest model (n_estimators=1000, max_depth=None):
  RMSE: 954532.4531740493
  MAE: 425727.21288103
  R2: 0.7336727359396507


In [19]:
train_random_forest(data, n_trees=500, max_depth=5)

Random Forest model (n_estimators=500, max_depth=5):
  RMSE: 1045381.2252462613
  MAE: 551462.5711514321
  R2: 0.6805641738635864


Training these models creates files in a folder named `mlruns`, which MLflow uses for the UI.

## MLFLow UI

Run `mlflow ui` in a terminal
<br>and open http://localhost:5000 if you are running Jupyter locally.
<br>When running in Colab, the UI has to be exposed through an ngrok tunnel instead.

In [20]:
!pip install pyngrok --quiet

In [47]:
from pyngrok import ngrok
from getpass import getpass

# Terminate any ngrok process left over from a previous run of this cell,
# so re-running it does not pile up tunnels.
ngrok.kill()

# Register the ngrok authtoken. This cell always prompts for it; getpass keeps
# the token out of the notebook source and the saved output.
# Get your authtoken from https://dashboard.ngrok.com/auth
NGROK_AUTH_TOKEN = getpass('Enter the ngrok authtoken: ')
ngrok.set_auth_token(NGROK_AUTH_TOKEN)

# Open a tunnel to local port 5000 (the port `mlflow ui` is started on below)
# and print its public URL. bind_tls=True requests an HTTPS endpoint.
ngrok_tunnel = ngrok.connect(addr="5000", proto="http", bind_tls=True)
print("MLflow Tracking UI:", ngrok_tunnel.public_url)

Enter the ngrok authtoken: ··········
MLflow Tracking UI: https://1d1f-35-231-217-54.ngrok.io


In [None]:
!mlflow ui --port 5000

[2022-06-22 09:10:12 +0000] [2101] [INFO] Starting gunicorn 20.1.0
[2022-06-22 09:10:12 +0000] [2101] [INFO] Listening at: http://127.0.0.1:5000 (2101)
[2022-06-22 09:10:12 +0000] [2101] [INFO] Using worker: sync
[2022-06-22 09:10:12 +0000] [2104] [INFO] Booting worker with pid: 2104


# Assignments

1. Add `ElasticNet` model tracking, compare the results with `RandomForest`. 
2. Change or add parameters such as:  
 `depth` of the tree in RandomForest or   
 `l1_ratio` and `alpha` in ElasticNet.
3. Add more runs.
4. Check in the MLFlow UI whether the metrics are affected.
5. Convert your own ML model code from work into MLFlow-compatible code and run it using the MLFlow API to track your experiment.
6. Explore MLFlow [GitHub examples](https://github.com/amesar/mlflow-examples).     

In [23]:
train_random_forest(data, n_trees=500, max_depth=10)

Random Forest model (n_estimators=500, max_depth=10):
  RMSE: 975367.7872181158
  MAE: 479478.1629153519
  R2: 0.7219191724272176


In [24]:
train_random_forest(data, n_trees=500, max_depth=50)

Random Forest model (n_estimators=500, max_depth=50):
  RMSE: 954982.84016705
  MAE: 425432.0617223789
  R2: 0.7334213487103876


In [29]:
def train_elastic_net(data, l1_ratio=0.5, alpha=1.0):
    """Fit an ElasticNet price model and record the run with MLflow.

    Parameters
    ----------
    data : dict
        Must provide the keys 'X_train', 'X_test', 'y_train' and 'y_test'.
    l1_ratio : float, default 0.5
        Forwarded to ``ElasticNet`` as ``l1_ratio``.
    alpha : float, default 1.0
        Forwarded to ``ElasticNet`` as ``alpha``.

    The function returns nothing; it prints RMSE/MAE/R2 on the test split and
    logs the parameters, the metrics and the fitted pipeline to the MLflow run.
    """
    # A single MLflow run wraps preprocessing, fitting, scoring and logging.
    with mlflow.start_run():
        label_columns = ['type', 'floor', 'city', 'canton']
        measure_columns = ['room_num', 'area_m2', 'floors_num', 'year_built',
                           'last_refurbishment', 'lat', 'lon']

        # Numeric columns are standardised, label columns are one-hot encoded.
        numeric_steps = [("scaler", StandardScaler())]
        label_steps = [("onehot", OneHotEncoder(handle_unknown="ignore"))]
        features = ColumnTransformer(
            transformers=[
                ("num", Pipeline(steps=numeric_steps), measure_columns),
                ("cat", Pipeline(steps=label_steps), label_columns),
            ]
        )

        # Fit the elastic net on top of the preprocessing step.
        regressor = ElasticNet(l1_ratio=l1_ratio, alpha=alpha, random_state=42)
        pipeline_net = Pipeline([("col_transformer", features), ("estimator", regressor)])
        pipeline_net.fit(data['X_train'], data['y_train'])

        # Evaluate on the held-out split.
        rmse, mae, r2 = eval_metrics(data['y_test'], pipeline_net.predict(data['X_test']))

        # Console summary.
        print("Elastic Net model (l1_ratio={}, alpha={}):".format(l1_ratio, alpha))
        print("  RMSE: %s" % rmse)
        print("  MAE: %s" % mae)
        print("  R2: %s" % r2)

        # MLflow: parameters, then metrics, then the fitted pipeline.
        for name, value in [('Model', 'Elastic Net'), ("l1_ratio", l1_ratio), ("alpha", alpha)]:
            mlflow.log_param(name, value)
        for name, value in [("rmse", rmse), ("r2", r2), ("mae", mae)]:
            mlflow.log_metric(name, value)

        mlflow.sklearn.log_model(pipeline_net, "model")

In [30]:
train_elastic_net(data)

Elastic Net model (l1_ratio=0.5, alpha=1.0):
  RMSE: 1333727.4471199298
  MAE: 704423.9706508438
  R2: 0.4800418283354526


In [45]:
# Grid: l1_ratio in {0.5, ..., 0.9} x alpha in {0.0, ..., 0.8}.
# np.arange with a fractional step accumulates floating-point error (it yielded
# alpha=0.6000000000000001), so the grids are built with np.linspace and each
# value is rounded to one decimal before it is passed on and logged.
# NOTE(review): alpha=0.0 switches the penalty off entirely; confirm it is
# meant to be part of the sweep.
for l1_ratio in np.linspace(0.5, 0.9, 5):
    for alpha in np.linspace(0.0, 0.8, 5):
        train_elastic_net(
            data,
            l1_ratio=round(float(l1_ratio), 1),
            alpha=round(float(alpha), 1),
        )

Elastic Net model (l1_ratio=0.5, alpha=0.0):
  RMSE: 935003.6997920441
  MAE: 526648.4694232417
  R2: 0.7444588238919735
Elastic Net model (l1_ratio=0.5, alpha=0.2):
  RMSE: 1184313.9608613164
  MAE: 608519.8997720616
  R2: 0.590015037204457
Elastic Net model (l1_ratio=0.5, alpha=0.4):
  RMSE: 1240226.7994041385
  MAE: 642655.3471791445
  R2: 0.5503894910620786
Elastic Net model (l1_ratio=0.5, alpha=0.6000000000000001):
  RMSE: 1278787.5543462688
  MAE: 668568.5234436077
  R2: 0.5219965487116818
Elastic Net model (l1_ratio=0.5, alpha=0.8):
  RMSE: 1308810.2248557364
  MAE: 688183.8890245127
  R2: 0.4992884736938662
Elastic Net model (l1_ratio=0.6, alpha=0.0):
  RMSE: 935003.6997920441
  MAE: 526648.4694232417
  R2: 0.7444588238919735
Elastic Net model (l1_ratio=0.6, alpha=0.2):
  RMSE: 1168977.4701228535
  MAE: 600233.7992676425
  R2: 0.6005646359280227
Elastic Net model (l1_ratio=0.6, alpha=0.4):
  RMSE: 1220844.1271544234
  MAE: 629378.7203080109
  R2: 0.5643329978569547
Elastic Net 

In [46]:
# Sweep l1_ratio from 0.5 up to and including 1.0 at a fixed alpha.
# np.arange(0.5, 1.05, 0.1) overshoots the stop value to reach the endpoint and
# still yields 0.9999999999999999 instead of 1.0; np.linspace includes the
# endpoint, and rounding guarantees the intended values are run and logged.
for l1_ratio in np.linspace(0.5, 1.0, 6):
    train_elastic_net(data, l1_ratio=round(float(l1_ratio), 1), alpha=1.0)

Elastic Net model (l1_ratio=0.5, alpha=1.0):
  RMSE: 1333727.4471199298
  MAE: 704423.9706508438
  R2: 0.4800418283354526
Elastic Net model (l1_ratio=0.6, alpha=1.0):
  RMSE: 1308810.5420637203
  MAE: 688184.1842573648
  R2: 0.4992882309853207
Elastic Net model (l1_ratio=0.7, alpha=1.0):
  RMSE: 1278788.3193851165
  MAE: 668569.216844611
  R2: 0.5219959767772464
Elastic Net model (l1_ratio=0.7999999999999999, alpha=1.0):
  RMSE: 1240228.4073029882
  MAE: 642656.7619506696
  R2: 0.5503883252612924
Elastic Net model (l1_ratio=0.8999999999999999, alpha=1.0):
  RMSE: 1184317.4452993446
  MAE: 608522.5177832306
  R2: 0.5900126247203623
Elastic Net model (l1_ratio=0.9999999999999999, alpha=1.0):
  RMSE: 912475.452430428
  MAE: 500309.7008441778
  R2: 0.7566246381876989
