In [2]:
import pandas as pd

# Location of the processed dataset, relative to this notebook.
DATA_PATH = '../data/processed/data.csv'

# Load the dataset and preview the first rows as the cell output.
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,size,year,garage,price
0,159.0,2003,2,208500
1,117.0,1976,2,181500
2,166.0,2001,2,223500
3,160.0,1915,3,140000
4,204.0,2000,3,250000


In [2]:
# Separate the features from the target column.
# 'Unnamed: 0' holds 0, 1, 2, ... in the preview above, i.e. it looks like a
# row index that was saved into the CSV. It says nothing about the house, so
# it must not be used as a model feature. errors='ignore' keeps this cell
# working if that column is not present in the frame.
X = df.drop(columns=['price', 'Unnamed: 0'], errors='ignore')
y = df['price'].copy()

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# identical between executions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

## Set MLflow Experiment

In [21]:
import mlflow

# Experiment under which the runs started in this notebook are recorded.
EXPERIMENT_NAME = 'house-prices-eda'
mlflow.set_experiment(EXPERIMENT_NAME)

INFO: 'house-prices-eda' does not exist. Creating a new experiment


# Linear Regression

In [22]:
# Open an MLflow run by hand. The run stays active across the following
# cells until mlflow.end_run() is called.
mlflow.start_run()

<ActiveRun: >

In [23]:
from sklearn.linear_model import LinearRegression

# fit() returns the estimator itself, so construction and training are chained.
linearRegressionModel = LinearRegression().fit(X_train, y_train)

# Echo the fitted estimator as the cell output.
linearRegressionModel

LinearRegression()

In [29]:
# Store the fitted linear model as an artifact of the current MLflow run.
mlflow.sklearn.log_model(
    sk_model=linearRegressionModel,
    artifact_path='LinearRegressionModel',
)

In [30]:
# Predict prices for the held-out test features. Despite its name, this
# variable holds the model's predicted target values, not feature data.
X_test_prediction = linearRegressionModel.predict(X_test)

In [31]:
import math
from sklearn.metrics import mean_squared_error, r2_score

# Score the linear model's predictions against the held-out targets.
mse = mean_squared_error(y_test, X_test_prediction)
rmse = math.sqrt(mse)  # square root of the MSE
r2 = r2_score(y_test, X_test_prediction)

# Print each metric on its own line as "<name> <value>".
for metric_name, metric_value in (('mse', mse), ('rmse', rmse), ('r2', r2)):
    print(metric_name, metric_value)

mse 2078666917.9289908
rmse 45592.39978251848
r2 0.7021153642898048


In [32]:
# Attach the evaluation metrics to the current MLflow run, one call per metric.
for metric_name, metric_value in (('mse', mse), ('rmse', rmse), ('r2', r2)):
    mlflow.log_metric(metric_name, metric_value)

In [34]:
# Close the run that was opened manually with mlflow.start_run(). When the run
# is opened with `with mlflow.start_run():`, this happens automatically on exit.
mlflow.end_run()

In [39]:
from xgboost import XGBRFRegressor, XGBRegressor

# Train and evaluate an XGBoost random-forest regressor inside its own MLflow
# run; the context manager ends the run when the block exits.
with mlflow.start_run():
    xgb = XGBRFRegressor(random_state=42)
    xgb.fit(X_train, y_train)

    # Store the fitted model as an artifact of this run.
    mlflow.xgboost.log_model(xgb, 'xgboost rf')

    # Score the model on the held-out split.
    xgb_predicted = xgb.predict(X_test)
    mse = mean_squared_error(y_test, xgb_predicted)
    rmse = math.sqrt(mse)
    r2 = r2_score(y_test, xgb_predicted)

    # Print every metric first, then log every metric to the run.
    xgb_metrics = (('mse', mse), ('rmse', rmse), ('r2', r2))
    for metric_name, metric_value in xgb_metrics:
        print(metric_name, metric_value)
    for metric_name, metric_value in xgb_metrics:
        mlflow.log_metric(metric_name, metric_value)

mse 1318418524.4896371
rmse 36310.03338596148
r2 0.81106322590997


In [40]:
# Hyperparameters for the gradient-boosted model; the same dict is passed to
# the estimator and recorded on the MLflow run.
xgb_params = {
    'learning_rate': 0.2,
    'n_estimators': 50,
    'random_state': 42
}

# Train and evaluate an XGBRegressor inside its own MLflow run; the context
# manager ends the run when the block exits.
with mlflow.start_run():
    # Record the hyperparameters so this run can be told apart from, and
    # compared with, other runs in the tracking UI/API.
    mlflow.log_params(xgb_params)

    xgb = XGBRegressor(**xgb_params)
    xgb.fit(X_train, y_train)

    # Store the fitted model as an artifact of this run.
    mlflow.xgboost.log_model(xgb, 'xgboost')

    # Score the model on the held-out split.
    xgb_predicted = xgb.predict(X_test)

    mse = mean_squared_error(y_test, xgb_predicted)
    rmse = math.sqrt(mse)
    r2 = r2_score(y_test, xgb_predicted)

    print('mse', mse)
    print('rmse', rmse)
    print('r2', r2)

    # Record the evaluation metrics on this run.
    mlflow.log_metric('mse', mse)
    mlflow.log_metric('rmse', rmse)
    mlflow.log_metric('r2', r2)

mse 1386727460.1346002
rmse 37238.789724353286
r2 0.8012741720529797


# Consuming existing experiments with the MLflow Tracking API

In [41]:
# Fetch the experiment's metadata (ID, artifact location, lifecycle stage) by name.
mlflow.get_experiment_by_name('house-prices-eda')

<Experiment: artifact_location='file:///C:/Users/PedroFerreira/PycharmProjects/data-science-studies/Machine%20Learning/MLOps/mlflow/mlflow/notebooks/mlruns/1', experiment_id='1', lifecycle_stage='active', name='house-prices-eda', tags={}>

In [42]:
# Resolve the experiment ID by name instead of hard-coding '1': IDs are
# assigned by the tracking store, so the literal only matches this particular
# mlruns directory.
experiment = mlflow.get_experiment_by_name('house-prices-eda')

# List the run metadata (IDs, status, timestamps, artifact URIs) of that experiment.
mlflow.list_run_infos(experiment_id=experiment.experiment_id)

[<RunInfo: artifact_uri='file:///C:/Users/PedroFerreira/PycharmProjects/data-science-studies/Machine%20Learning/MLOps/mlflow/mlflow/notebooks/mlruns/1/7bc6caff5624419ba52c3328a177f451/artifacts', end_time=1623248845736, experiment_id='1', lifecycle_stage='active', run_id='7bc6caff5624419ba52c3328a177f451', run_uuid='7bc6caff5624419ba52c3328a177f451', start_time=1623248845518, status='FINISHED', user_id='PedroFerreira'>,
 <RunInfo: artifact_uri='file:///C:/Users/PedroFerreira/PycharmProjects/data-science-studies/Machine%20Learning/MLOps/mlflow/mlflow/notebooks/mlruns/1/fe7a5c3d8cf7441e9fbeac94050bd2c5/artifacts', end_time=1623248843732, experiment_id='1', lifecycle_stage='active', run_id='fe7a5c3d8cf7441e9fbeac94050bd2c5', run_uuid='fe7a5c3d8cf7441e9fbeac94050bd2c5', start_time=1623248843461, status='FINISHED', user_id='PedroFerreira'>,
 <RunInfo: artifact_uri='file:///C:/Users/PedroFerreira/PycharmProjects/data-science-studies/Machine%20Learning/MLOps/mlflow/mlflow/notebooks/mlruns/1/a

In [43]:
# Hard-coded ID of the run that logged the linear regression model.
linear_regression_run_id = 'ab9c4201473c4e32b5f4fc3b113163dd'

# Fetch that run's recorded metrics, params and tags.
mlflow.get_run(linear_regression_run_id)

<Run: data=<RunData: metrics={'mse': 2078666917.9289908, 'r2': 0.7021153642898048, 'rmse': 45592.39978251848}, params={}, tags={'mlflow.log-model.history': '[{"run_id": "ab9c4201473c4e32b5f4fc3b113163dd", '
                             '"artifact_path": "LinearRegressionModel", '
                             '"utc_time_created": "2021-06-09 '
                             '14:10:43.584596", "flavors": {"python_function": '
                             '{"model_path": "model.pkl", "loader_module": '
                             '"mlflow.sklearn", "python_version": "3.8.5", '
                             '"env": "conda.yaml"}, "sklearn": '
                             '{"pickled_model": "model.pkl", '
                             '"sklearn_version": "0.23.2", '
                             '"serialization_format": "cloudpickle"}}}, '
                             '{"run_id": "ab9c4201473c4e32b5f4fc3b113163dd", '
                             '"artifact_path": "LinearRegressionModel", '
    