# Prerequisites

In [1]:
%pip install mlrun shapely

Note: you may need to restart the kernel to use updated packages.


# Create an MLRun project and configuration

In [2]:
from os import path, getenv
from mlrun import new_project

# Project name: the fixed 'nyc-taxi' prefix plus the V3IO user name when that
# environment variable is set (missing or empty values are skipped).
name_parts = ['nyc-taxi', getenv('V3IO_USERNAME', None)]
project_name = '-'.join(part for part in name_parts if part)

# Project files live under ./conf, resolved to an absolute path.
project_path = path.abspath('conf')
project = new_project(project_name, project_path, init_git=True)

print(f'Project path: {project_path}\nProject name: {project_name}')

Project path: /User/taxi/conf
Project name: nyc-taxi-edmond


In [3]:
from mlrun import run_local, NewTask, mlconf, import_function, mount_v3io

# Target location for storing pipeline artifacts: ./jobs, resolved to an
# absolute path so it does not depend on a later change of working directory.
artifact_path = path.abspath('jobs')
# MLRun DB path or API service URL: keep whatever is already configured and
# fall back to the in-cluster API address only when nothing is set.
mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'

print(f'Artifacts path: {artifact_path}\nMLRun DB path: {mlconf.dbpath}')

Artifacts path: /User/taxi/jobs
MLRun DB path: http://mlrun-api:8080


# Define Nuclio and MLRun functions

In [4]:
import nuclio

In [5]:
# nuclio: start-code

In [6]:
%nuclio cmd -c pip install lightgbm shapely

In [7]:
%nuclio config spec.build.baseImage = "mlrun/mlrun"
%nuclio config spec.image = "mlrun/ml-models"
%nuclio config kind = "job"

%nuclio: setting spec.build.baseImage to 'mlrun/mlrun'
%nuclio: setting spec.image to 'mlrun/ml-models'
%nuclio: setting kind to 'job'


In [8]:
import numpy as np 
import pandas as pd
import scipy as scipy
import datetime as dt
from sklearn.model_selection import train_test_split
import lightgbm as lgbm
import os
import gc
from os import path, getenv
from mlrun.run import get_dataitem
from mlrun.execution import MLClientCtx
from mlrun.datastore import DataItem
from mlrun.artifacts import get_model, update_model
from pickle import dumps
import shapely.wkt

In [9]:
def get_zones_dict(zones_url):
    """
    Read the zones CSV at ``zones_url`` and return a dict keyed by
    ``OBJECTID``. Each value is a dict of the zone's remaining columns
    (including the ``the_geom`` WKT string) plus ``long`` and ``lat``, the
    x and y of the geometry's centroid.

    NOTE: a later cell in this notebook defines ``get_zones_dict`` again with
    a DataFrame argument; that later definition replaces this one.
    """
    # Columns that are not needed for the lookup
    unused_columns = ['Shape_Leng', 'Shape_Area', 'zone', 'LocationID', 'borough']
    zones_df = pd.read_csv(zones_url).drop(columns=unused_columns)

    # One entry per zone, keyed by OBJECTID
    zones_dict = zones_df.set_index('OBJECTID').to_dict('index')

    # Attach the centroid coordinates to every zone entry
    for zone_info in zones_dict.values():
        geometry = shapely.wkt.loads(zone_info['the_geom'])
        zone_info['long'] = geometry.centroid.x
        zone_info['lat'] = geometry.centroid.y

    return zones_dict

In [10]:
def get_zone_lat(zones_dict, zone_id):
    """
    Return the 'lat' value stored for ``zone_id`` in ``zones_dict``.
    A missing zone (or missing 'lat' entry) raises KeyError.
    """
    zone_entry = zones_dict[zone_id]
    return zone_entry['lat']

In [11]:
def get_zone_long(zones_dict, zone_id):
    """
    Return the 'long' value stored for ``zone_id`` in ``zones_dict``.
    A missing zone (or missing 'long' entry) raises KeyError.
    """
    zone_entry = zones_dict[zone_id]
    return zone_entry['long']

In [12]:
def clean_df(df, max_fare=500, max_zone_id=263):
    """
    Keep only trips with a plausible fare and valid pickup/dropoff zone IDs.

    A row is kept when ``0 < fare_amount <= max_fare`` and both
    ``PULocationID`` and ``DOLocationID`` are in ``1..max_zone_id``. Rows
    holding NaN in any of these columns fail the comparisons and are dropped.

    :param df:          DataFrame with ``fare_amount``, ``PULocationID`` and
                        ``DOLocationID`` columns
    :param max_fare:    largest fare kept (inclusive)
    :param max_zone_id: largest zone ID kept (inclusive)
    :returns: a new, independent DataFrame; ``df`` itself is not modified
    """
    valid_fare = (df.fare_amount > 0) & (df.fare_amount <= max_fare)
    valid_pickup = (df.PULocationID > 0) & (df.PULocationID <= max_zone_id)
    valid_dropoff = (df.DOLocationID > 0) & (df.DOLocationID <= max_zone_id)

    # Return an explicit copy: callers add columns to the result, and writing
    # into a filtered selection of another frame triggers pandas'
    # SettingWithCopyWarning.
    return df[valid_fare & valid_pickup & valid_dropoff].copy()

In [13]:
# To Compute Haversine distance
def sphere_dist(pickup_lat, pickup_lon, dropoff_lat, dropoff_lon):
    """
    Return the great-circle (haversine) distance, in km, between pickup and
    dropoff coordinates.

    All four arguments are in degrees. Each may be a scalar or an array-like
    such as a pandas Series; mixed scalar/array arguments are broadcast.
    """
    #Define earth radius (km)
    R_earth = 6371
    #Convert degrees to radians
    pickup_lat, pickup_lon, dropoff_lat, dropoff_lon = map(np.radians,
                                                             [pickup_lat, pickup_lon,
                                                              dropoff_lat, dropoff_lon])
    #Compute distances along lat, lon dimensions
    dlat = dropoff_lat - pickup_lat
    dlon = dropoff_lon - pickup_lon

    #Compute haversine distance
    a = np.sin(dlat/2.0)**2 + np.cos(pickup_lat) * np.cos(dropoff_lat) * np.sin(dlon/2.0)**2
    # Mathematically 0 <= a <= 1, but floating-point rounding can push it
    # marginally outside that range (e.g. near-antipodal points), which would
    # make sqrt/arcsin return NaN. Clamp before taking the inverse sine.
    a = np.clip(a, 0.0, 1.0)
    return 2 * R_earth * np.arcsin(np.sqrt(a))

In [14]:
def radian_conv(degree):
    """
    Convert ``degree`` (scalar or array-like, in degrees) to radians.
    """
    in_radians = np.radians(degree)
    return in_radians

In [15]:
def add_airport_dist(dataset):
    """
    Add one distance column per reference landmark.

    For each landmark the stored value is the distance from the pickup point
    to the landmark PLUS the distance from the landmark to the dropoff point
    (a sum, not a minimum), computed with ``sphere_dist`` (km).

    Landmarks and resulting columns:
    JFK: John F. Kennedy International Airport  -> jfk_dist
    EWR: Newark Liberty International Airport   -> ewr_dist
    LGA: LaGuardia Airport                      -> lga_dist
    SOL: Statue of Liberty                      -> sol_dist
    NYC: New York (central reference point)     -> nyc_dist

    ``dataset`` must provide pickup_latitude, pickup_longitude,
    dropoff_latitude and dropoff_longitude in degrees. The columns are added
    to ``dataset`` in place and the same object is returned.
    """
    # (latitude, longitude) in degrees; insertion order fixes column order.
    landmarks = {
        'jfk': (40.639722, -73.778889),
        'ewr': (40.6925, -74.168611),
        'lga': (40.77725, -73.872611),
        'sol': (40.6892, -74.0445),
        'nyc': (40.7141667, -74.0063889),
    }

    pickup_lat = dataset['pickup_latitude']
    dropoff_lat = dataset['dropoff_latitude']
    pickup_lon = dataset['pickup_longitude']
    dropoff_lon = dataset['dropoff_longitude']

    # Compute every distance before touching the frame, so the new columns
    # are only written once all of them are available.
    combined = {}
    for name, (landmark_lat, landmark_lon) in landmarks.items():
        pickup_leg = sphere_dist(pickup_lat, pickup_lon, landmark_lat, landmark_lon)
        dropoff_leg = sphere_dist(landmark_lat, landmark_lon, dropoff_lat, dropoff_lon)
        combined[name + '_dist'] = pickup_leg + dropoff_leg

    for column, values in combined.items():
        dataset[column] = values

    return dataset

In [16]:
def add_datetime_info(dataset):
    """
    Parse ``tpep_pickup_datetime`` ("%Y-%m-%d %H:%M:%S") into a new
    ``pickup_datetime`` column and derive ``hour``, ``day``, ``month``,
    ``weekday`` and ``year`` columns from it.

    The columns are added to ``dataset`` in place; the same object is returned.
    """
    parsed = pd.to_datetime(dataset['tpep_pickup_datetime'], format="%Y-%m-%d %H:%M:%S")
    dataset['pickup_datetime'] = parsed

    # Each derived column is the matching attribute of the .dt accessor.
    for component in ('hour', 'day', 'month', 'weekday', 'year'):
        dataset[component] = getattr(dataset['pickup_datetime'].dt, component)

    return dataset

In [17]:
def fetch_data(context : MLClientCtx, taxi_records_csv_path, zones_csv_path):
    """
    Load the taxi-records and zones inputs as DataFrames and log both as CSV
    dataset artifacts under ``<context.artifact_path>/data``.

    :param context:               MLRun execution context (logger and artifact logging)
    :param taxi_records_csv_path: input object exposing ``as_df()`` for the taxi records
    :param zones_csv_path:        input object exposing ``as_df()`` for the zones
    """
    context.logger.info('Reading taxi records data from {}'.format(taxi_records_csv_path))
    taxi_frame = taxi_records_csv_path.as_df()

    context.logger.info('Reading zones data from {}'.format(zones_csv_path))
    zones_frame = zones_csv_path.as_df()

    data_dir = path.join(context.artifact_path, 'data')
    context.logger.info('Saving datasets to {} ...'.format(data_dir))

    # Store both data sets in the artifacts database, taxi records first
    artifacts = (('nyc-taxi-dataset', taxi_frame),
                 ('zones-dataset', zones_frame))
    for artifact_key, frame in artifacts:
        context.log_dataset(artifact_key, df=frame, format='csv',
                            index=False, artifact_path=data_dir)

In [18]:
def get_zones_dict(zones_df):
    """
    Build a zone lookup keyed by ``OBJECTID`` from the zones DataFrame.

    Each value is a dict of the zone's remaining columns (including the
    ``the_geom`` WKT string) plus ``long`` and ``lat``, the x and y of the
    geometry's centroid.

    :param zones_df: zones DataFrame; it is NOT modified by this call
    :returns: ``{OBJECTID: {..., 'long': x, 'lat': y}}``
    """
    # Remove unnecessary fields on a new frame instead of in place, so the
    # caller's DataFrame keeps all of its columns.
    trimmed_df = zones_df.drop(['Shape_Leng', 'Shape_Area', 'zone', 'LocationID', 'borough'], axis=1)

    # Convert DF to dictionary
    zones_dict = trimmed_df.set_index('OBJECTID').to_dict('index')

    # Add lat/long to each zone
    for zone in zones_dict:
        shape = shapely.wkt.loads(zones_dict[zone]['the_geom'])
        zones_dict[zone]['long'] = shape.centroid.x
        zones_dict[zone]['lat'] = shape.centroid.y

    return zones_dict

In [19]:
def get_zone_lat(zones_dict, zone_id):
    """
    Look up ``zone_id`` in ``zones_dict`` and return its 'lat' value.
    Raises KeyError when the zone or its 'lat' entry is absent.

    NOTE: identical to the ``get_zone_lat`` defined in an earlier cell.
    """
    zone_info = zones_dict[zone_id]
    latitude = zone_info['lat']
    return latitude

In [20]:
def get_zone_long(zones_dict, zone_id):
    """
    Look up ``zone_id`` in ``zones_dict`` and return its 'long' value.
    Raises KeyError when the zone or its 'long' entry is absent.

    NOTE: identical to the ``get_zone_long`` defined in an earlier cell.
    """
    zone_info = zones_dict[zone_id]
    longitude = zone_info['long']
    return longitude

In [21]:
def transform_dataset(context : MLClientCtx, taxi_records_csv_path: str, zones_csv_path: str):
    """
    Clean and enrich the taxi records, then log the result as the
    'nyc-taxi-dataset-transformed' CSV dataset artifact.

    Steps: read both inputs through ``get_dataitem``, filter rows with
    ``clean_df``, map pickup/dropoff zone IDs to centroid coordinates, add
    date/time parts and landmark distances, convert the coordinates to
    radians, and drop the columns that are not used for training.

    :param context:               MLRun execution context
    :param taxi_records_csv_path: location of the taxi records dataset
    :param zones_csv_path:        location of the zones dataset
    """
    context.logger.info('Begin datasets transform')

    context.logger.info('zones_csv_path: ' + str(zones_csv_path))

    zones_df = get_dataitem(zones_csv_path).as_df()

    # Get zones dictionary
    zones_dict = get_zones_dict(zones_df)

    train_df = get_dataitem(taxi_records_csv_path).as_df()

    # Clean DF
    train_df = clean_df(train_df)

    # Enrich DF: translate zone IDs to coordinates. Series.map walks only the
    # zone-ID column; DataFrame.apply(axis=1) would build a full row object
    # per record just to read one field. The lookups still go through the
    # helper functions, so an unknown zone ID raises KeyError as before.
    pickup_zone = train_df['PULocationID']
    dropoff_zone = train_df['DOLocationID']
    train_df['pickup_latitude'] = pickup_zone.map(lambda zone_id: get_zone_lat(zones_dict, zone_id))
    train_df['pickup_longitude'] = pickup_zone.map(lambda zone_id: get_zone_long(zones_dict, zone_id))
    train_df['dropoff_latitude'] = dropoff_zone.map(lambda zone_id: get_zone_lat(zones_dict, zone_id))
    train_df['dropoff_longitude'] = dropoff_zone.map(lambda zone_id: get_zone_long(zones_dict, zone_id))

    train_df = add_datetime_info(train_df)
    # Landmark distances need the coordinates in degrees, so they are
    # computed before the radian conversion below.
    train_df = add_airport_dist(train_df)

    train_df['pickup_latitude'] = radian_conv(train_df['pickup_latitude'])
    train_df['pickup_longitude'] = radian_conv(train_df['pickup_longitude'])
    train_df['dropoff_latitude'] = radian_conv(train_df['dropoff_latitude'])
    train_df['dropoff_longitude'] = radian_conv(train_df['dropoff_longitude'])

    # errors='ignore' lets the drop succeed when some of these columns are absent
    train_df.drop(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime', 'congestion_surcharge', 'improvement_surcharge', 'pickup_datetime',
                  'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'total_amount', 'RatecodeID', 'store_and_fwd_flag',
                  'PULocationID', 'DOLocationID', 'payment_type'],
                  axis=1, inplace=True, errors='ignore')

    # Save dataset to artifact
    target_path = path.join(context.artifact_path, 'data')
    context.log_dataset('nyc-taxi-dataset-transformed', df=train_df, artifact_path=target_path, format='csv')

    context.logger.info('End dataset transform')

In [22]:
# LightGBM training parameters; train_model reads this module-level dict.
# Key order is kept as originally written.
# NOTE(review): `subsample` (0.8) and `bagging_fraction` (1) look like aliases
# of the same LightGBM setting - confirm which one takes effect.
# NOTE(review): `num_rounds` is set here while train_model also passes
# num_boost_round=10000 to lgbm.train - confirm which value LightGBM uses.
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'nthread': 4,
    'num_leaves': 31,
    'learning_rate': 0.05,
    'max_depth': -1,
    'subsample': 0.8,
    'bagging_fraction': 1,
    'max_bin': 5000,
    'bagging_freq': 20,
    'colsample_bytree': 0.6,
    'metric': 'rmse',
    'min_split_gain': 0.5,
    'min_child_weight': 1,
    'min_child_samples': 10,
    'scale_pos_weight': 1,
    'zero_as_missing': True,
    'seed': 0,
    'num_rounds': 50000,
}

In [23]:
def train_model(context: MLClientCtx, input_ds: str, model_path: str):
    """
    Train a LightGBM model that predicts ``fare_amount`` and log the pickled
    booster as the 'FareModel' model artifact.

    :param context:    MLRun execution context
    :param input_ds:   location of the transformed dataset, read via ``get_dataitem``
    :param model_path: not used by this function
    """
    context.logger.info('Begin training')
    context.logger.info('LGBM version is ' + str(lgbm.__version__))

    dataset = get_dataitem(input_ds).as_df()

    # Separate the target from the features
    target = dataset['fare_amount']
    features = dataset.drop(columns=['fare_amount'])
    # Discard the first remaining column by position.
    # NOTE(review): presumably the index column written with the CSV by the
    # transform step - confirm before changing how that dataset is saved.
    features = features.drop(features.columns[[0]], axis=1)

    # Hold out 10% of the rows for validation / early stopping
    x_fit, x_valid, y_fit, y_valid = train_test_split(features, target, random_state=123, test_size=0.10)

    categorical_columns = ('year', 'month', 'day', 'weekday')
    fit_set = lgbm.Dataset(x_fit, y_fit, silent=False, categorical_feature=list(categorical_columns))
    valid_set = lgbm.Dataset(x_valid, y_valid, silent=False, categorical_feature=list(categorical_columns))

    # `params` is the module-level parameter dict defined before this function
    model = lgbm.train(params,
                       train_set=fit_set,
                       num_boost_round=10000,
                       early_stopping_rounds=500,
                       verbose_eval=500,
                       valid_sets=valid_set)

    context.log_model('FareModel',
                      body=dumps(model),
                      artifact_path=context.artifact_subpath("models"),
                      model_file="FareModel.pkl")

    context.logger.info('End training')

In [24]:
# nuclio: end-code

## Run fetch_data locally

In [25]:
from mlrun import run_local, NewTask, mlconf, import_function, mount_v3io

# Source CSVs handed to the handler as run inputs
local_fetch_inputs = {
    'taxi_records_csv_path': 'https://s3.wasabisys.com/iguazio/data/Taxi/yellow_tripdata_2019-01_subset.csv',
    'zones_csv_path': 'https://s3.wasabisys.com/iguazio/data/Taxi/taxi_zones.csv',
}

# Run the fetch_data handler locally and keep the run object for its outputs
fetch_data_run = run_local(name='fetch_data',
                           handler=fetch_data,
                           inputs=local_fetch_inputs,
                           project=project_name,
                           artifact_path=artifact_path)

> 2020-11-15 10:28:31,544 [info] starting run fetch_data uid=38051af0319e46ebbe90ecd84a44cc64  -> http://mlrun-api:8080
> 2020-11-15 10:28:31,622 [info] Reading taxi records data from https://s3.wasabisys.com/iguazio/data/Taxi/yellow_tripdata_2019-01_subset.csv
> 2020-11-15 10:28:48,188 [info] Reading zones data from https://s3.wasabisys.com/iguazio/data/Taxi/taxi_zones.csv
> 2020-11-15 10:28:50,165 [info] Saving datasets to /User/taxi/jobs/data ...


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
nyc-taxi-edmond,...4a44cc64,0,Nov 15 10:28:31,completed,fetch_data,v3io_user=edmondkind=handlerowner=edmondhost=jupyter-edmond-5b458fc996-n55jn,taxi_records_csv_pathzones_csv_path,,,nyc-taxi-datasetzones-dataset


to track results use .show() or .logs() or in CLI: 
!mlrun get run 38051af0319e46ebbe90ecd84a44cc64 --project nyc-taxi-edmond , !mlrun logs 38051af0319e46ebbe90ecd84a44cc64 --project nyc-taxi-edmond
> 2020-11-15 10:29:06,478 [info] run executed, status=completed


In [26]:
fetch_data_run.outputs

{'nyc-taxi-dataset': 'store://nyc-taxi-edmond/fetch_data_nyc-taxi-dataset#38051af0319e46ebbe90ecd84a44cc64',
 'zones-dataset': 'store://nyc-taxi-edmond/fetch_data_zones-dataset#38051af0319e46ebbe90ecd84a44cc64'}

## Run fetch_data on cluster

#### Prepare cluster function

Create an MLRun function and build a custom image for it that includes `shapely`.

In [27]:
from mlrun import code_to_function

# Convert this notebook's code into an MLRun function named 'cluster-function'.
# NOTE(review): presumably the code between the "# nuclio: start-code" and
# "# nuclio: end-code" markers is what gets packaged - confirm against the
# MLRun code_to_function documentation.
gen_func = code_to_function(name='cluster-function')
# Register the function on the project, then fetch it back by name and apply
# the v3io mount to it.
project.set_function(gen_func)
cluster_func = project.func('cluster-function').apply(mount_v3io())
# Build/deploy the function; the captured output below shows the image build.
cluster_func.deploy()

> 2020-11-15 10:29:14,906 [info] starting remote build, image: .mlrun/func-nyc-taxi-edmond-cluster-function-latest
[36mINFO[0m[0000] Resolved base name mlrun/mlrun:0.5.3-rc1 to mlrun/mlrun:0.5.3-rc1 
[36mINFO[0m[0000] Resolved base name mlrun/mlrun:0.5.3-rc1 to mlrun/mlrun:0.5.3-rc1 
[36mINFO[0m[0000] Retrieving image manifest mlrun/mlrun:0.5.3-rc1 
[36mINFO[0m[0002] Retrieving image manifest mlrun/mlrun:0.5.3-rc1 
[36mINFO[0m[0004] Built cross stage deps: map[]                
[36mINFO[0m[0004] Retrieving image manifest mlrun/mlrun:0.5.3-rc1 
[36mINFO[0m[0005] Retrieving image manifest mlrun/mlrun:0.5.3-rc1 
[36mINFO[0m[0006] Unpacking rootfs as cmd RUN pip install lightgbm shapely requires it. 
[36mINFO[0m[0053] Taking snapshot of full filesystem...        
[36mINFO[0m[0053] Resolving paths                              
[36mINFO[0m[0061] RUN pip install lightgbm shapely             
[36mINFO[0m[0061] cmd: /bin/sh                                 
[36mINFO[0m[

True

In [28]:
# Source CSVs handed to the handler as run inputs
cluster_fetch_inputs = {
    'taxi_records_csv_path': 'https://s3.wasabisys.com/iguazio/data/Taxi/yellow_tripdata_2019-01_subset.csv',
    'zones_csv_path': 'https://s3.wasabisys.com/iguazio/data/Taxi/taxi_zones.csv',
}

# Run the fetch_data handler through the cluster function; this rebinds
# fetch_data_run, so later cells use the outputs of this run.
fetch_data_run = cluster_func.run(name='fetch_data',
                                  handler='fetch_data',
                                  inputs=cluster_fetch_inputs,
                                  artifact_path=artifact_path)

> 2020-11-15 10:30:48,915 [info] starting run fetch_data uid=4cd59c4be88646918c87f7bb7e698ca5  -> http://mlrun-api:8080
> 2020-11-15 10:30:49,044 [info] Job is running in the background, pod: fetch-data-ng65j
> 2020-11-15 10:30:55,208 [info] Reading taxi records data from https://s3.wasabisys.com/iguazio/data/Taxi/yellow_tripdata_2019-01_subset.csv
> 2020-11-15 10:31:03,450 [info] Reading zones data from https://s3.wasabisys.com/iguazio/data/Taxi/taxi_zones.csv
> 2020-11-15 10:31:05,404 [info] Saving datasets to /User/taxi/jobs/data ...
> 2020-11-15 10:31:21,177 [info] run executed, status=completed
final state: completed


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
nyc-taxi-edmond,...7e698ca5,0,Nov 15 10:30:55,completed,fetch_data,v3io_user=edmondkind=jobowner=edmondhost=fetch-data-ng65j,taxi_records_csv_pathzones_csv_path,,,nyc-taxi-datasetzones-dataset


to track results use .show() or .logs() or in CLI: 
!mlrun get run 4cd59c4be88646918c87f7bb7e698ca5 --project nyc-taxi-edmond , !mlrun logs 4cd59c4be88646918c87f7bb7e698ca5 --project nyc-taxi-edmond
> 2020-11-15 10:31:28,461 [info] run executed, status=completed


In [29]:
fetch_data_run.outputs

{'nyc-taxi-dataset': 'store://nyc-taxi-edmond/fetch_data_nyc-taxi-dataset#4cd59c4be88646918c87f7bb7e698ca5',
 'zones-dataset': 'store://nyc-taxi-edmond/fetch_data_zones-dataset#4cd59c4be88646918c87f7bb7e698ca5'}

## Run transform_dataset

In [30]:
# Feed the two datasets logged by the fetch_data run into the transform step
transform_inputs = {
    'taxi_records_csv_path': fetch_data_run.outputs['nyc-taxi-dataset'],
    'zones_csv_path': fetch_data_run.outputs['zones-dataset'],
}

transform_dataset_run = cluster_func.run(name='transform_dataset',
                                         handler='transform_dataset',
                                         inputs=transform_inputs,
                                         artifact_path=artifact_path)

> 2020-11-15 10:31:31,653 [info] starting run transform_dataset uid=172cc1a5e8aa424f88868dd20e02d15e  -> http://mlrun-api:8080
> 2020-11-15 10:31:31,793 [info] Job is running in the background, pod: transform-dataset-7pdxr
> 2020-11-15 10:31:37,701 [info] Begin datasets transform
> 2020-11-15 10:31:37,701 [info] zones_csv_path: /User/taxi/jobs/data/zones-dataset.csv
> 2020-11-15 10:32:57,381 [info] End dataset transform
> 2020-11-15 10:32:57,465 [info] run executed, status=completed
final state: completed


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
nyc-taxi-edmond,...0e02d15e,0,Nov 15 10:31:37,completed,transform_dataset,v3io_user=edmondkind=jobowner=edmondhost=transform-dataset-7pdxr,taxi_records_csv_pathzones_csv_path,,,nyc-taxi-dataset-transformed


to track results use .show() or .logs() or in CLI: 
!mlrun get run 172cc1a5e8aa424f88868dd20e02d15e --project nyc-taxi-edmond , !mlrun logs 172cc1a5e8aa424f88868dd20e02d15e --project nyc-taxi-edmond
> 2020-11-15 10:33:01,425 [info] run executed, status=completed


In [31]:
transform_dataset_run.outputs

{'nyc-taxi-dataset-transformed': 'store://nyc-taxi-edmond/transform_dataset_nyc-taxi-dataset-transformed#172cc1a5e8aa424f88868dd20e02d15e'}

## Train model

In [32]:
# Train on the dataset logged by the transform step. 'model_path' is passed
# as well, although train_model does not read it.
train_inputs = {
    'input_ds': transform_dataset_run.outputs['nyc-taxi-dataset-transformed'],
    'model_path': path.abspath(""),
}

train_model_run = cluster_func.run(name='train_model',
                                   handler='train_model',
                                   inputs=train_inputs,
                                   artifact_path=artifact_path)

> 2020-11-15 10:33:05,629 [info] starting run train_model uid=29bd9c8600ac44848aa5f9ad5872adc8  -> http://mlrun-api:8080
> 2020-11-15 10:33:05,775 [info] Job is running in the background, pod: train-model-n4qls
> 2020-11-15 10:33:11,627 [info] Begin training
> 2020-11-15 10:33:11,627 [info] LGBM version is 3.0.0
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 23961
[LightGBM] [Info] Number of data points in the train set: 879294, number of used features: 16
[LightGBM] [Info] Start training from score 12.418691
Training until validation scores don't improve for 500 rounds
[500]	valid_0's rmse: 3.01293
[1000]	valid_0's rmse: 2.98584
[1500]	valid_0's rmse: 2.98288
[2000]	valid_0's rmse: 2.9866
Early stopping, best iteration is:
[1551]	valid_0's rmse: 2.98222
> 2020-11-15 10:34:07,706 [info] End training
> 2020-11-15 10:34:07,783 [info] run executed, status=completed
final state: complete

project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
nyc-taxi-edmond,...5872adc8,0,Nov 15 10:33:11,completed,train_model,v3io_user=edmondkind=jobowner=edmondhost=train-model-n4qls,input_dsmodel_path,,,FareModel


to track results use .show() or .logs() or in CLI: 
!mlrun get run 29bd9c8600ac44848aa5f9ad5872adc8 --project nyc-taxi-edmond , !mlrun logs 29bd9c8600ac44848aa5f9ad5872adc8 --project nyc-taxi-edmond
> 2020-11-15 10:34:14,433 [info] run executed, status=completed


In [33]:
train_model_run.outputs

{'FareModel': 'store://nyc-taxi-edmond/train_model_FareModel#29bd9c8600ac44848aa5f9ad5872adc8'}

## Serving

The model serving class is in model_serving_lightgbm.ipynb.

In [34]:
# Register the serving notebook (expected next to this notebook's working
# directory) as the project's 'taxi-serving' function and apply the v3io mount.
serving = project.set_function(path.abspath('.') + '/model_serving_lightgbm.ipynb', name='taxi-serving').apply(mount_v3io())
# NOTE(review): 'LGBMModel' is presumably the serving class defined in
# model_serving_lightgbm.ipynb - confirm the name matches that notebook.
serving.spec.default_class = 'LGBMModel'
# Attach the model artifact logged by the training run under the model name
# 'taxi-serving' (the name used in the invoke URL in the next cell).
serving.add_model('taxi-serving', train_model_run.outputs['FareModel'])
# NOTE(review): the name get_data_run does not describe a deploy result -
# consider renaming once it is confirmed that no other cell depends on it.
get_data_run = serving.deploy()

> 2020-11-15 10:34:42,418 [info] deploy started
[nuclio] 2020-11-15 10:34:44,653 (info) Build complete
[nuclio] 2020-11-15 10:34:59,919 done updating nyc-taxi-edmond-taxi-serving, function address: 192.168.224.70:30319


In [35]:
# Smoke-test request: a single input row of 16 numeric values (the training
# log above reports 16 used features).
# NOTE(review): the values look like arbitrary placeholders rather than real
# trip features - replace with a representative row before judging predictions.
my_data = '''{"inputs":[[5.1, 3.5, 1.4, 3, 5.1, 3.5, 1.4, 0.2, 5.1, 3.5, 1.4, 0.2, 5.1, 3.5, 1.4, 0.2]]}'''
# Call the predict endpoint of the 'taxi-serving' model on the deployed function
serving.invoke('/v2/models/taxi-serving/predict', my_data)

{'id': '39556d32-36d3-4919-a1cb-e9a6990ff8a7',
 'model_name': 'taxi-serving',
 'outputs': [25.374309065093435]}