## Data
- We will be using NYC taxi data
- These have recently been changed to **parquet** files
- We will be using Pandas to read this data in via the `pd.read_parquet()` function
    - This requires installing PyArrow via `pip install pyarrow` on the VM
    - You may also have to `pip install seaborn` and `pip install scikit-learn`
- In the `week1/` directory, run `mkdir data`, then `cd` into it 
- Download the Green January and February 2021 parquet data file via `wget https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-01.parquet` and `wget https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-02.parquet`

In [18]:
import os
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

import mlflow
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials  # metrics
from hyperopt.pyll import scope

from config import mlflow_model_uri

In [2]:
# Point MLflow at our tracking backend: the SQLite database file `mlflow.db`
mlflow.set_tracking_uri('sqlite:///mlflow.db')

# Make 'nyc_taxi_experiment_1' the active experiment so that subsequent runs are
# assigned/appended to it (the experiment is created if it doesn't exist yet).
# As the last expression of the cell, the returned Experiment object is displayed.
mlflow.set_experiment('nyc_taxi_experiment_1')

<Experiment: artifact_location='file:///C:/Users/nimz/Documents/mlops_zoomcamp/week2_experiment_tracking/mlruns/1', creation_time=1684358646144, experiment_id='1', last_update_time=1684358646144, lifecycle_stage='active', name='nyc_taxi_experiment_1', tags={}>

## 1. Load, inspect, and clean data

In [3]:
# create helper function to read and clean data
def read_dataframe(filename):
    """
    Read a file of green-taxi trip records and prepare it for modelling.

    Parameters
    ----------
    filename : str
        Path to a `.csv` or `.parquet` file. The data must contain the columns
        `lpep_pickup_datetime`, `lpep_dropoff_datetime`, `PULocationID` and
        `DOLocationID`.

    Returns
    -------
    pandas.DataFrame
        Only the trips lasting between 1 and 60 minutes (inclusive), with an
        added `duration` column (trip length in minutes) and with
        `PULocationID` / `DOLocationID` converted to strings.

    Raises
    ------
    ValueError
        If `filename` ends with neither `.csv` nor `.parquet`.
    """
    if filename.endswith('.csv'):
        df = pd.read_csv(filename)

        # CSV columns are read as text, so parse the timestamps explicitly
        df['lpep_dropoff_datetime'] = pd.to_datetime(df['lpep_dropoff_datetime'])
        df['lpep_pickup_datetime'] = pd.to_datetime(df['lpep_pickup_datetime'])

    elif filename.endswith('.parquet'):
        # no conversion is applied here: the parquet columns are assumed to
        # already be datetimes
        df = pd.read_parquet(filename)

    else:
        # fail loudly instead of hitting an UnboundLocalError on `df` below
        raise ValueError(
            f"Unsupported file type for {filename!r}: expected a '.csv' or '.parquet' file"
        )

    # create duration in minutes column (vectorised timedelta -> minutes)
    trip_length = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
    df['duration'] = trip_length.dt.total_seconds() / 60

    # filter to only trips between 1 minute and 1 hour;
    # `.copy()` gives us an independent frame so the column assignment below
    # does not write into a slice of the unfiltered data (SettingWithCopyWarning)
    df = df[(df['duration'] >= 1) & (df['duration'] <= 60)].copy()

    # specify categorical input features and convert to String
    # for one-hot encoding via Dictionary Vectorizer
    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [4]:
# Read and clean both monthly files with the same helper:
# January 2021 is the training set, February 2021 the validation set
df_train, df_val = map(
    read_dataframe,
    (
        './data/green_tripdata_2021-01.parquet',
        './data/green_tripdata_2021-02.parquet',
    ),
)

# number of trips left in each split after cleaning
(len(df_train), len(df_val))

(73908, 61921)

In [5]:
# Feature engineering: add a 'PU_DO' column to both splits that joins the
# pickup and drop-off location IDs as "<PULocationID>_<DOLocationID>"
for trips in (df_train, df_val):
    trips['PU_DO'] = trips['PULocationID'] + '_' + trips['DOLocationID']

In [6]:
# specify our input features
# (the combined 'PU_DO' column is used as the categorical feature in place of
#  the separate 'PULocationID' and 'DOLocationID' columns)
categorical = ['PU_DO']
numerical = ['trip_distance']

## 2. Create training and validation sets

In [7]:
# Build the model inputs from the selected feature columns
feature_columns = categorical + numerical

# DictVectorizer consumes one dictionary per row
train_dicts = df_train[feature_columns].to_dict(orient='records')
val_dicts = df_val[feature_columns].to_dict(orient='records')

# fit the vectorizer on the training rows only, then apply the same fitted
# mapping to the validation rows
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [8]:
# Extract the label column (trip duration) from each split as an array
target = 'duration'
y_train, y_val = (frame[target].values for frame in (df_train, df_val))

## 3. MLflow Model Management

In [9]:
# create the datasets for XGBoost: wrap the vectorized features together with
# their labels, once for training and once for validation
# DMatrix = internal XGBoost data structure optimized for both memory efficiency and training speed
# https://xgboost.readthedocs.io/en/stable/python/python_api.html#xgboost.DMatrix
train = xgb.DMatrix(X_train, label=y_train)
valid = xgb.DMatrix(X_val, label=y_val)

In [10]:
# def objective(params):
#     '''
#     Defines an objective function that will be minimized given the provided hyperparameters
#     '''
#     with mlflow.start_run():
#         # set our tags and log our parameters (search space)
#         mlflow.set_tag("model", "xgboost")
#         mlflow.log_params(params)
        
#         # train and generate our model
#         booster = xgb.train(
#             params=params,
#             dtrain=train,
#             num_boost_round=1000,  # iterations of the booster
#             evals=[(valid, 'validation')],  # validation set evaluated each round (used for early stopping)
#             early_stopping_rounds=50,   # stop if 50+ iterations go by with no improvement
#             verbose_eval=False  # suppress output
#         )
        
#         # get predictions
#         y_pred = booster.predict(valid)
        
#         # get error and log it
#         rmse = mean_squared_error(y_val, y_pred, squared=False)
#         # print(f'Validation RMSE: {rmse}')
#         mlflow.log_metric("rmse", rmse)
    
#     # return the error to `hyperopt` and send an alert/signal that
#     #   the objective function has successfully run
#     return {'loss': rmse, 'status': STATUS_OK}

# # define the search space (our hyperparameters and various values) using `hp` ranges
# # https://hyperopt.github.io/hyperopt/getting-started/search_spaces/
# search_space = {
#     'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),  # define an integer range with `scope`
#     'learning_rate': hp.loguniform('learning_rate', -3, 0),
#     'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
#     'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
#     'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
#     'objective': 'reg:linear',
#     'seed': 42
# }

# # minimize the defined objective function over the defined search space
# # https://hyperopt.github.io/hyperopt/getting-started/minimizing_functions/
# # i.e., find the best hyperparameters
# # i.e., explore a given function over a hyperparameter space according to a given algorithm, 
# #   allowing up to a certain number of function evaluations
# best_result = fmin(
#     fn=objective,
#     space=search_space,
#     algo=tpe.suggest,  # Given previous trials + the domain, suggest best expected hp point according to TPE-EI algorithm
#     max_evals=50,
#     trials=Trials()  # keep track of information from each run
# )

In [15]:
# Hyperparameter values used for the final training run
# (presumably copied from an earlier hyperopt search - TODO confirm).
# `max_depth` is stored as a float here and cast to int where it is consumed.
best_result = dict(
    learning_rate=0.13642747972651512,
    max_depth=21.0,
    min_child_weight=1.5655550191042376,
    reg_alpha=0.009403472263570046,
    reg_lambda=0.005322134643445022,
)

In [13]:
# Turn off XGBoost autologging, so that only what is logged explicitly
# (parameters, metric, artifacts, model) ends up in the run
# https://mlflow.org/docs/latest/tracking.html#automatic-logging
# https://mlflow.org/docs/latest/python_api/mlflow.xgboost.html#mlflow.xgboost.autolog
mlflow.xgboost.autolog(disable=True)

In [17]:
# Train one XGBoost model with the chosen hyperparameters inside a dedicated
# MLflow run, and log its parameters, validation RMSE, the fitted preprocessor
# and the model itself
with mlflow.start_run():

    # create dataset for XGBoost
    train = xgb.DMatrix(X_train, label=y_train)
    valid = xgb.DMatrix(X_val, label=y_val)

    # set our parameters and log them
    best_params = {
        'learning_rate': best_result['learning_rate'],
        'max_depth': int(best_result['max_depth']),  # stored as float, XGBoost needs an int
        'min_child_weight': best_result['min_child_weight'],
        'reg_alpha': best_result['reg_alpha'],
        'reg_lambda': best_result['reg_lambda'],
        # manually set; 'reg:squarederror' is the current name of the
        # deprecated 'reg:linear' objective (same squared-error regression)
        'objective': 'reg:squarederror',
        'seed': 42
    }

    mlflow.log_params(best_params)

    # train and create an XGBoost model with these parameters
    booster = xgb.train(
        params=best_params,
        dtrain=train,
        num_boost_round=1000,  # upper bound on boosting iterations
        evals=[(valid, 'validation')],  # set monitored for early stopping
        early_stopping_rounds=50,  # stop after 50 rounds without improvement
        verbose_eval=False  # suppress output
    )

    # make predictions, then calculate and log the error
    y_pred = booster.predict(valid)
    # RMSE as the square root of the MSE: the `squared=False` argument is
    # deprecated/removed in newer scikit-learn releases, this form works on all
    rmse = mean_squared_error(y_val, y_pred) ** 0.5
    mlflow.log_metric('rmse', rmse)

    # create model directory if it doesn't already exist
    # (dirname of './models/' is './models')
    os.makedirs(os.path.dirname('./models/'), mode=0o755, exist_ok=True)

    # save the preprocessor (fitted DictVectorizer) for our best model locally
    with open('./models/preprocessor.b', 'wb') as f_out:
        pickle.dump(dv, f_out)

    # log this preprocessor as an experiment artifact
    mlflow.log_artifact('models/preprocessor.b', artifact_path='preprocessor')

    # log the actual model itself
    mlflow.xgboost.log_model(booster, artifact_path='models_mlflow')





## 4. MLflow Load Models

In [19]:
# URI of the model to load; `mlflow_model_uri` is imported from the local
# `config` module at the top of the notebook
logged_model = mlflow_model_uri

# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model)

 - mlflow (current: 2.3.2, required: mlflow==2.3)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.




In [22]:
# # check model
# loaded_model

mlflow.pyfunc.loaded_model:
  artifact_path: models_mlflow
  flavor: mlflow.xgboost
  run_id: d4535c4d635f4305940cfbd71b0eb91a

In [23]:
# the same logged model can be loaded in 2 different ways, giving different
# objects: as a PyFuncModel via `mlflow.pyfunc.load_model` (done earlier in
# this notebook), or as a native XGBoost object via `mlflow.xgboost.load_model`

# load model as xgboost object
xgboost_model = mlflow.xgboost.load_model(logged_model)
# display the loaded object
xgboost_model



<xgboost.core.Booster at 0x1eaee78ab50>

In [26]:
# # we have access to attributes
# dir(xgboost_model)

In [27]:
# make predictions with this loaded model
# (`valid` is the validation DMatrix built earlier in the notebook)
y_pred = xgboost_model.predict(valid)

# check 1st 10 predictions
y_pred[:10]

array([15.17295 ,  7.193073, 13.691938, 24.373695,  9.104835, 17.14612 ,
       11.797985,  9.013244,  8.948696, 19.844584], dtype=float32)

In [28]:
# validation RMSE of the loaded model, computed as the square root of the MSE
# (the `squared=False` argument is deprecated/removed in newer scikit-learn)
mean_squared_error(y_val, y_pred) ** 0.5

6.296879486713688