# Install Dependencies

In [None]:
!pip install mlflow --quiet

[K     |████████████████████████████████| 17.8 MB 531 kB/s 
[K     |████████████████████████████████| 79 kB 3.7 MB/s 
[K     |████████████████████████████████| 146 kB 24.9 MB/s 
[K     |████████████████████████████████| 62 kB 705 kB/s 
[K     |████████████████████████████████| 181 kB 41.9 MB/s 
[K     |████████████████████████████████| 596 kB 39.0 MB/s 
[K     |████████████████████████████████| 209 kB 22.5 MB/s 
[K     |████████████████████████████████| 54 kB 1.0 MB/s 
[K     |████████████████████████████████| 63 kB 959 kB/s 
[K     |████████████████████████████████| 78 kB 2.4 MB/s 
[?25h  Building wheel for databricks-cli (setup.py) ... [?25l[?25hdone


# Load Dependencies

In [None]:
import os
import warnings
import sys

import pandas as pd
import numpy as np

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [None]:
import mlflow
import mlflow.sklearn

In [None]:
import logging
# Configure root logging at WARNING level and create this notebook's logger.
logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)

# Utilities for Data and Metrics

In [None]:
def prepare_data():
    """Download the home-price CSV, clean it, and split it into train/test sets.

    The `price` column holds numbers written with comma separators, so the
    commas are stripped before the column is converted to a numeric dtype.
    The `Unnamed: 0` and `zip` columns are dropped, and so is every row that
    still contains a missing value.

    Side effects: silences all Python warnings for the session and seeds
    NumPy's global random generator with 42.

    Returns:
        tuple: `(X_train, X_test, y_train, y_test)` from an 80/20
        `train_test_split` with `random_state=42`. The `y_*` parts hold the
        `price` column and the `X_*` parts hold the remaining columns.

    Raises:
        Exception: whatever `pd.read_csv` raised when the download or parse
            failed. The error is logged first and then re-raised.
    """
    # Google Drive file id = 1eNTyJc4jXJMkLPXW0eY6LL7_P9YN1GWO
    warnings.filterwarnings("ignore")
    np.random.seed(42)

    # Turn the Drive "view" link into a direct-download link for read_csv.
    orig_url = "https://drive.google.com/file/d/1eNTyJc4jXJMkLPXW0eY6LL7_P9YN1GWO/view"
    file_id = orig_url.split('/')[-2]
    data_path = 'https://drive.google.com/uc?export=download&id=' + file_id

    try:
        homes = pd.read_csv(data_path)
    except Exception as e:
        logger.exception(
            "Unable to download training & test CSV, check your internet connection. Error: %s", e)
        # Without a frame there is nothing to clean; re-raise so the caller
        # sees the real failure instead of a later UnboundLocalError.
        raise

    # Prices are written like "1,235,000": strip the commas, then convert.
    homes["price"] = pd.to_numeric(homes["price"].str.replace(',', '', regex=False))

    # `columns=` replaces the positional axis argument, which newer pandas rejects.
    homes = homes.drop(columns=["Unnamed: 0", 'zip']).dropna()

    y = homes["price"]
    X = homes.drop(columns="price")

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    return X_train, X_test, y_train, y_test


def eval_metrics(actual, pred):
    """Score predictions against the true values.

    Args:
        actual: true target values.
        pred: predicted target values.

    Returns:
        tuple: `(rmse, mae, r2)`, where rmse is the square root of
        `mean_squared_error`.
    """
    squared_error = mean_squared_error(actual, pred)
    absolute_error = mean_absolute_error(actual, pred)
    determination = r2_score(actual, pred)
    return np.sqrt(squared_error), absolute_error, determination

# Load Dataset

In [None]:
# Fetch and split the data once, then bundle the four splits into the dict
# that the training helpers index by name.
X_train, X_test, y_train, y_test = prepare_data()

data = dict(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

data['X_train'].head()

Unnamed: 0,type,room_num,floor,area_m2,floors_num,year_built,last_refurbishment,city,lat,lon,canton
276,Apartment,4.5,2,145.0,3.0,2020.0,2020.0,Mendrisio,45.8862,8.988967,Ticino
1171,Row house,4.5,4,140.0,4.0,1984.0,2017.0,Agno,46.0005,8.9028,Ticino
1894,Single house,7.5,GF,143.0,1.0,1971.0,1971.0,St-Maurice,46.1988,6.99565,Canton du Valais
117,Apartment,5.5,1,174.0,1.0,2014.0,2014.0,Cheseaux-sur-Lausanne,46.5822,6.5958,Canton de Vaud
2028,Villa,8.5,GF,400.0,3.0,1972.0,2005.0,Aigle,46.3147,6.9716,Canton de Vaud


In [None]:
data['y_train'].head()

276     1060000
1171     900000
1894     870000
117     1450000
2028    2150000
Name: price, dtype: int64

# Utilities for Modeling and Tracking Experiments

In [None]:
def train_elasticnet(data, alpha=0.5, l1_ratio=0.5):
    """Fit an ElasticNet pipeline, score it on the test split, and log the run to MLflow.

    Args:
        data: mapping with 'X_train', 'X_test', 'y_train' and 'y_test' entries.
        alpha: forwarded to `ElasticNet(alpha=...)`.
        l1_ratio: forwarded to `ElasticNet(l1_ratio=...)`.

    Prints RMSE, MAE and R2 for the test split. Inside a single MLflow run it
    logs the model name and both hyper-parameters, the three metrics, and the
    fitted pipeline under the artifact path "model". Returns None.
    """
    category_columns = ['type', 'floor', 'city', 'canton']
    numeric_columns = ['room_num', 'area_m2', 'floors_num', 'year_built', 'last_refurbishment', 'lat', 'lon']

    # Everything below runs inside one tracked MLflow run.
    with mlflow.start_run():

        # Scale the numeric columns; one-hot encode the categorical ones,
        # ignoring categories that were not seen during fit.
        scale_numeric = Pipeline(steps=[("scaler", StandardScaler())])
        encode_categories = Pipeline(steps=[("onehot", OneHotEncoder(handle_unknown="ignore"))])
        column_steps = [
            ("num", scale_numeric, numeric_columns),
            ("cat", encode_categories, category_columns),
        ]
        preprocessor = ColumnTransformer(transformers=column_steps)

        # Preprocessing and estimator are fitted together as one pipeline.
        estimator = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)
        model = Pipeline([
            ("col_transformer", preprocessor),
            ("estimator", estimator),
        ])
        model.fit(data['X_train'], data['y_train'])

        # Score on the held-out split.
        test_predictions = model.predict(data['X_test'])
        rmse, mae, r2 = eval_metrics(data['y_test'], test_predictions)

        print("Elasticnet model (alpha=%f, l1_ratio=%f):" % (alpha, l1_ratio))
        print("  RMSE: %s" % rmse)
        print("  MAE: %s" % mae)
        print("  R2: %s" % r2)

        # Record parameters, metrics, and the fitted pipeline on the run.
        for param_name, param_value in (('Model', 'ElasticNet'),
                                        ("alpha", alpha),
                                        ("l1_ratio", l1_ratio)):
            mlflow.log_param(param_name, param_value)

        for metric_name, metric_value in (("rmse", rmse), ("r2", r2), ("mae", mae)):
            mlflow.log_metric(metric_name, metric_value)

        mlflow.sklearn.log_model(model, "model")

In [None]:
def train_random_forest(data, n_trees=100, max_depth=None):
    """Fit a random-forest pipeline, score it on the test split, and log the run to MLflow.

    Args:
        data: mapping with 'X_train', 'X_test', 'y_train' and 'y_test' entries.
        n_trees: forwarded to `RandomForestRegressor(n_estimators=...)`.
        max_depth: forwarded to `RandomForestRegressor(max_depth=...)`.

    Prints RMSE, MAE and R2 for the test split. Inside a single MLflow run it
    logs the model name and both hyper-parameters, the three metrics, and the
    fitted pipeline under the artifact path "model". Returns None.
    """
    category_columns = ['type', 'floor', 'city', 'canton']
    numeric_columns = ['room_num', 'area_m2', 'floors_num', 'year_built', 'last_refurbishment', 'lat', 'lon']

    # Everything below runs inside one tracked MLflow run.
    with mlflow.start_run():

        # Scale the numeric columns; one-hot encode the categorical ones,
        # ignoring categories that were not seen during fit.
        scale_numeric = Pipeline(steps=[("scaler", StandardScaler())])
        encode_categories = Pipeline(steps=[("onehot", OneHotEncoder(handle_unknown="ignore"))])
        column_steps = [
            ("num", scale_numeric, numeric_columns),
            ("cat", encode_categories, category_columns),
        ]
        preprocessor = ColumnTransformer(transformers=column_steps)

        # Preprocessing and estimator are fitted together as one pipeline.
        estimator = RandomForestRegressor(n_estimators=n_trees, max_depth=max_depth, random_state=42)
        model = Pipeline([
            ("col_transformer", preprocessor),
            ("estimator", estimator),
        ])
        model.fit(data['X_train'], data['y_train'])

        # Score on the held-out split.
        test_predictions = model.predict(data['X_test'])
        rmse, mae, r2 = eval_metrics(data['y_test'], test_predictions)

        print("Random Forest model (n_estimators={}, max_depth={}):".format(n_trees, max_depth))
        print("  RMSE: %s" % rmse)
        print("  MAE: %s" % mae)
        print("  R2: %s" % r2)

        # Record parameters, metrics, and the fitted pipeline on the run.
        for param_name, param_value in (('Model', 'Random Forest'),
                                        ("n_estimators", n_trees),
                                        ("max_depth", max_depth)):
            mlflow.log_param(param_name, param_value)

        for metric_name, metric_value in (("rmse", rmse), ("r2", r2), ("mae", mae)):
            mlflow.log_metric(metric_name, metric_value)

        mlflow.sklearn.log_model(model, "model")

## Experiments

In [None]:
# Baseline ElasticNet run using the function defaults (alpha=0.5, l1_ratio=0.5).
train_elasticnet(data)

Elasticnet model (alpha=0.500000, l1_ratio=0.500000):
  RMSE: 1260914.8144902154
  MAE: 656693.7806125223
  R2: 0.5352646319878015


In [None]:
# ElasticNet run with alpha and l1_ratio both lowered to 0.2.
train_elasticnet(data, alpha=0.2, l1_ratio=0.2)

Elasticnet model (alpha=0.200000, l1_ratio=0.200000):
  RMSE: 1220843.5004088164
  MAE: 629378.2520055452
  R2: 0.5643334451741895


In [None]:
# ElasticNet run with alpha and l1_ratio both lowered to 0.1.
train_elasticnet(data, alpha=0.1, l1_ratio=0.1)

Elasticnet model (alpha=0.100000, l1_ratio=0.100000):
  RMSE: 1176915.5183705238
  MAE: 604485.1153140196
  R2: 0.5951214124898465


In [None]:
# ElasticNet run with alpha raised to 0.7 and l1_ratio set to 0.3.
train_elasticnet(data, alpha=0.7, l1_ratio=0.3)

Elasticnet model (alpha=0.700000, l1_ratio=0.300000):
  RMSE: 1331409.2097781317
  MAE: 702940.4117721308
  R2: 0.4818478028207511


In [None]:
# Baseline random forest run using the function defaults (n_trees=100, max_depth=None).
train_random_forest(data)

Random Forest model (n_estimators=100, max_depth=None):
  RMSE: 946901.6327767096
  MAE: 427173.6942019544
  R2: 0.7379139162588411


In [None]:
# Random forest run with 500 trees; max_depth stays None.
train_random_forest(data, n_trees=500, max_depth=None)

Random Forest model (n_estimators=500, max_depth=None):
  RMSE: 955108.4001582841
  MAE: 425444.0136091205
  R2: 0.7333512452234665


In [None]:
# Random forest run with 1000 trees; max_depth stays None.
train_random_forest(data, n_trees=1000, max_depth=None)

Random Forest model (n_estimators=1000, max_depth=None):
  RMSE: 954532.4531740493
  MAE: 425727.21288103
  R2: 0.7336727359396507


In [None]:
# Random forest run with 500 trees and max_depth set to 5.
train_random_forest(data, n_trees=500, max_depth=5)

Random Forest model (n_estimators=500, max_depth=5):
  RMSE: 1045381.2252462613
  MAE: 551462.5711514321
  R2: 0.6805641738635864


These runs create files in a folder named `mlruns`, which MLflow uses for the UI.

## MLflow UI

Run `mlflow ui` in a terminal
<br>and open http://localhost:5000 if you are running Jupyter locally.
<br>When running in Colab, the UI has to be exposed through an ngrok tunnel instead.

In [None]:
# Local Jupyter alternative: uncomment to serve the MLflow UI on port 5000.
# !mlflow ui --port 5000

In [None]:
!pip install pyngrok --quiet

[?25l[K     |▍                               | 10 kB 19.5 MB/s eta 0:00:01[K     |▉                               | 20 kB 16.2 MB/s eta 0:00:01[K     |█▎                              | 30 kB 6.9 MB/s eta 0:00:01[K     |█▊                              | 40 kB 6.2 MB/s eta 0:00:01[K     |██▏                             | 51 kB 4.4 MB/s eta 0:00:01[K     |██▋                             | 61 kB 5.1 MB/s eta 0:00:01[K     |███                             | 71 kB 5.4 MB/s eta 0:00:01[K     |███▌                            | 81 kB 4.8 MB/s eta 0:00:01[K     |████                            | 92 kB 5.3 MB/s eta 0:00:01[K     |████▍                           | 102 kB 5.3 MB/s eta 0:00:01[K     |████▉                           | 112 kB 5.3 MB/s eta 0:00:01[K     |█████▎                          | 122 kB 5.3 MB/s eta 0:00:01[K     |█████▊                          | 133 kB 5.3 MB/s eta 0:00:01[K     |██████▏                         | 143 kB 5.3 MB/s eta 0:00:01[K   

In [None]:
from pyngrok import ngrok
from getpass import getpass

# Terminate open tunnels if exist
ngrok.kill()

# Setting the authtoken (optional)
# Get your authtoken from https://dashboard.ngrok.com/auth
# getpass keeps the token out of the notebook source and the cell output.
NGROK_AUTH_TOKEN = getpass('Enter the ngrok authtoken: ').strip()
if NGROK_AUTH_TOKEN:
    # Only register a token when one was actually entered; a blank answer
    # leaves the existing ngrok configuration untouched.
    ngrok.set_auth_token(NGROK_AUTH_TOKEN)

# Open an HTTPs tunnel on port 5000 for http://localhost:5000
ngrok_tunnel = ngrok.connect(addr="5000", proto="http", bind_tls=True)
print("MLflow Tracking UI:", ngrok_tunnel.public_url)

Enter the ngrok authtoken: ··········
MLflow Tracking UI: https://fb22-34-73-141-57.ngrok.io


In [28]:
# Serves the MLflow UI on port 5000; the cell keeps running until it is interrupted.
!mlflow ui --port 5000

[2022-06-09 13:37:26 +0000] [379] [INFO] Starting gunicorn 20.1.0
[2022-06-09 13:37:26 +0000] [379] [INFO] Listening at: http://127.0.0.1:5000 (379)
[2022-06-09 13:37:26 +0000] [379] [INFO] Using worker: sync
[2022-06-09 13:37:26 +0000] [382] [INFO] Booting worker with pid: 382
[2022-06-09 14:38:36 +0000] [379] [INFO] Handling signal: int

Aborted!
[2022-06-09 14:38:37 +0000] [382] [INFO] Worker exiting (pid: 382)
[2022-06-09 14:38:37 +0000] [379] [INFO] Shutting down: Master
