# Prerequisites

In [1]:
# Install the notebook-side dependencies into the kernel's environment.
# NOTE(review): no versions are pinned, so a later re-run may pull different releases — consider pinning.
%pip install mlrun shapely



# Create an MLRun project and configuration

In [1]:
from os import path, getenv
from mlrun import new_project

# Project name is 'nyc-taxi', suffixed with '-<V3IO_USERNAME>' when that environment
# variable is set; filter(None, ...) drops a missing/empty username so no dangling '-' is left.
project_name = '-'.join(filter(None, ['nyc-taxi', getenv('V3IO_USERNAME', None)]))
# Project directory: ./conf resolved against the current working directory.
project_path = path.abspath('conf')
# NOTE(review): init_git=True presumably initializes a git repository in project_path — confirm against the MLRun docs.
project = new_project(project_name, project_path, init_git=True)

print(f'Project path: {project_path}\nProject name: {project_name}')

Project path: /User/taxi/conf
Project name: nyc-taxi-edmond


In [2]:
from mlrun import run_local, NewTask, mlconf, import_function, mount_v3io

# Target location for storing pipeline artifacts (./jobs resolved against the current working directory).
# `path` comes from the `from os import path` in the previous cell.
artifact_path = path.abspath('jobs')
# MLRun DB path or API service URL: keep an already-configured value and only
# fall back to 'http://mlrun-api:8080' when none is set.
mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'

print(f'Artifacts path: {artifact_path}\nMLRun DB path: {mlconf.dbpath}')

Artifacts path: /User/taxi/jobs
MLRun DB path: http://mlrun-api:8080


# Define Nuclio and MLRun functions

In [3]:
import nuclio

In [4]:
# nuclio: start-code

In [5]:
# Build-time dependencies of the function image: LightGBM and Shapely.
# NOTE(review): '-c' presumably records the command in the function config without running it in this kernel — confirm against the nuclio-jupyter docs.
%nuclio cmd -c pip install lightgbm shapely

In [6]:
# Function spec settings (each one is echoed back in the cell output):
# the base image for the build, the function image, and the runtime kind ("job").
%nuclio config spec.build.baseImage = "mlrun/mlrun"
%nuclio config spec.image = "mlrun/ml-models"
%nuclio config kind = "job"

%nuclio: setting spec.build.baseImage to 'mlrun/mlrun'
%nuclio: setting spec.image to 'mlrun/ml-models'
%nuclio: setting kind to 'job'


In [7]:
import numpy as np 
import pandas as pd
import scipy as scipy
import datetime as dt
from sklearn.model_selection import train_test_split
import lightgbm as lgbm
import os
import gc
from os import path, getenv
from mlrun.run import get_dataitem
from mlrun.execution import MLClientCtx
from mlrun.datastore import DataItem
from mlrun.artifacts import get_model, update_model
from pickle import dumps
import shapely.wkt

In [8]:
def get_zones_dict(zones_url):
    """Read the taxi-zones CSV and return a per-zone lookup with centroid coordinates.

    Note: a later cell of this notebook defines another ``get_zones_dict`` that takes an
    already-loaded DataFrame; once that cell has run, the name refers to that version.

    :param zones_url: location of the zones CSV, passed straight to ``pd.read_csv``.
    :returns: dict keyed by ``OBJECTID``. Each value is a dict with the row's remaining
        columns (``the_geom`` among them) plus ``long`` and ``lat``, the x and y of the
        centroid of the WKT geometry stored in ``the_geom``.
    """
    # Columns that are not needed in the lookup
    unused_columns = ['Shape_Leng', 'Shape_Area', 'zone', 'LocationID', 'borough']
    zones_table = pd.read_csv(zones_url).drop(unused_columns, axis=1)

    # One entry per zone, keyed by OBJECTID
    lookup = zones_table.set_index('OBJECTID').to_dict('index')

    # Attach the centroid of each zone's geometry
    for zone_info in lookup.values():
        geometry = shapely.wkt.loads(zone_info['the_geom'])
        zone_info['long'] = geometry.centroid.x
        zone_info['lat'] = geometry.centroid.y

    return lookup

In [9]:
def get_zone_lat(zones_dict, zone_id):
    """Return the ``'lat'`` value stored for ``zone_id`` in ``zones_dict``.

    Raises ``KeyError`` when the zone, or its ``'lat'`` entry, is missing.
    """
    zone_info = zones_dict[zone_id]
    return zone_info['lat']

In [10]:
def get_zone_long(zones_dict, zone_id):
    """Return the ``'long'`` value stored for ``zone_id`` in ``zones_dict``.

    Raises ``KeyError`` when the zone, or its ``'long'`` entry, is missing.
    """
    zone_info = zones_dict[zone_id]
    return zone_info['long']

In [11]:
def clean_df(df):
    """Keep only the trips with a plausible fare and in-range pickup/drop-off zone IDs.

    A row is kept when ``0 < fare_amount <= 500`` and both ``PULocationID`` and
    ``DOLocationID`` satisfy ``0 < id <= 263``. Rows with a missing value in any of
    these columns fail the comparisons and are dropped as well.
    (NOTE(review): 263 is presumably the highest zone ID in the zones table — confirm.)

    :param df: DataFrame with ``fare_amount``, ``PULocationID`` and ``DOLocationID`` columns.
    :returns: a new, independent DataFrame; ``df`` itself is not modified.
    """
    fare_ok = (df.fare_amount > 0) & (df.fare_amount <= 500)
    pickup_ok = (df.PULocationID > 0) & (df.PULocationID <= 263)
    dropoff_ok = (df.DOLocationID > 0) & (df.DOLocationID <= 263)

    # Return an explicit copy: callers add columns to the result, and doing that on a
    # bare boolean-mask selection makes pandas emit SettingWithCopyWarning.
    return df[fare_ok & pickup_ok & dropoff_ok].copy()

In [12]:
# To Compute Haversine distance
def sphere_dist(pickup_lat, pickup_lon, dropoff_lat, dropoff_lon):
    """
    Return the great-circle (haversine) distance, in kilometres, between the
    pickup and dropoff coordinates.

    All four arguments are given in degrees. Each may be a scalar or an
    array-like accepted by ``np.radians``.
    """
    # Earth radius (km)
    earth_radius_km = 6371

    # Work in radians from here on
    lat_from = np.radians(pickup_lat)
    lon_from = np.radians(pickup_lon)
    lat_to = np.radians(dropoff_lat)
    lon_to = np.radians(dropoff_lon)

    # Differences along the lat and lon dimensions
    lat_delta = lat_to - lat_from
    lon_delta = lon_to - lon_from

    # Haversine term, then the central angle scaled by the radius
    haversine = np.sin(lat_delta/2.0)**2 + np.cos(lat_from) * np.cos(lat_to) * np.sin(lon_delta/2.0)**2
    return 2 * earth_radius_km * np.arcsin(np.sqrt(haversine))

In [13]:
def radian_conv(degree):
    """
    Convert an angle, or an array-like of angles, from degrees to radians.
    """
    in_radians = np.deg2rad(degree)
    return in_radians

In [14]:
def add_airport_dist(dataset):
    """
    Add one ``<landmark>_dist`` column per landmark below. Each column holds the sum of
    the pickup-to-landmark and landmark-to-dropoff distances computed by ``sphere_dist``.

    JFK: John F. Kennedy International Airport
    EWR: Newark Liberty International Airport
    LGA: LaGuardia Airport
    SOL: Statue of Liberty
    NYC: New York central reference point

    Reads the ``pickup_latitude``, ``dropoff_latitude``, ``pickup_longitude`` and
    ``dropoff_longitude`` columns (degrees, as ``sphere_dist`` expects). ``dataset`` is
    modified in place and also returned.
    """
    # (latitude, longitude) of every landmark, in output-column order
    landmarks = {
        'jfk': (40.639722, -73.778889),
        'ewr': (40.6925, -74.168611),
        'lga': (40.77725, -73.872611),
        'sol': (40.6892, -74.0445),
        'nyc': (40.7141667, -74.0063889),
    }

    pickup_lat = dataset['pickup_latitude']
    dropoff_lat = dataset['dropoff_latitude']
    pickup_lon = dataset['pickup_longitude']
    dropoff_lon = dataset['dropoff_longitude']

    # Compute every distance first, then attach the new columns
    totals = {}
    for tag, (landmark_lat, landmark_lon) in landmarks.items():
        pickup_leg = sphere_dist(pickup_lat, pickup_lon, landmark_lat, landmark_lon)
        dropoff_leg = sphere_dist(landmark_lat, landmark_lon, dropoff_lat, dropoff_lon)
        totals[tag + '_dist'] = pickup_leg + dropoff_leg

    for column, values in totals.items():
        dataset[column] = values

    return dataset

In [15]:
def add_datetime_info(dataset):
    """Parse the pickup timestamp and split it into calendar feature columns.

    Reads ``tpep_pickup_datetime`` (text in ``YYYY-MM-DD HH:MM:SS`` form) and adds
    ``pickup_datetime`` plus ``hour``, ``day``, ``month``, ``weekday`` and ``year``.
    ``dataset`` is modified in place and also returned.
    """
    # Convert to datetime format
    dataset['pickup_datetime'] = pd.to_datetime(dataset['tpep_pickup_datetime'], format="%Y-%m-%d %H:%M:%S")

    # One new column per calendar component of the parsed timestamp
    pickup_parts = dataset['pickup_datetime'].dt
    for part in ('hour', 'day', 'month', 'weekday', 'year'):
        dataset[part] = getattr(pickup_parts, part)

    return dataset

In [16]:
def fetch_data(context : MLClientCtx, taxi_records_csv_path, zones_csv_path):
    """Load the taxi-records and zones inputs and log both as CSV dataset artifacts.

    :param context: MLRun execution context (provides the logger, ``artifact_path``
        and ``log_dataset``).
    :param taxi_records_csv_path: despite the name, an object exposing ``as_df()``
        that yields the taxi records.
    :param zones_csv_path: despite the name, an object exposing ``as_df()`` that
        yields the zones table.

    The artifacts are logged as ``nyc-taxi-dataset`` and ``zones-dataset`` under
    ``<context.artifact_path>/data``, without the DataFrame index.
    """
    log = context.logger.info
    datasets = {}

    log('Reading taxi records data from {}'.format(taxi_records_csv_path))
    datasets['nyc-taxi-dataset'] = taxi_records_csv_path.as_df()

    log('Reading zones data from {}'.format(zones_csv_path))
    datasets['zones-dataset'] = zones_csv_path.as_df()

    target_path = path.join(context.artifact_path, 'data')
    log('Saving datasets to {} ...'.format(target_path))

    # Store the data sets in your artifacts database
    for artifact_key, frame in datasets.items():
        context.log_dataset(artifact_key, df=frame, format='csv',
                            index=False, artifact_path=target_path)

In [17]:
def get_zones_dict(zones_df):
    """Build a per-zone lookup, keyed by ``OBJECTID``, with centroid coordinates.

    :param zones_df: zones table containing ``OBJECTID``, ``the_geom`` (WKT geometry) and
        the ``Shape_Leng``, ``Shape_Area``, ``zone``, ``LocationID`` and ``borough`` columns.
        The frame is left untouched, so the function can be called again on the same
        DataFrame (for example when the cell is re-run).
    :returns: dict mapping ``OBJECTID`` to a dict with the row's remaining columns
        (``the_geom`` among them) plus ``long`` and ``lat``, the x and y of the
        geometry's centroid.
    :raises KeyError: if one of the columns listed above is missing.
    """
    # Remove unnecessary fields. drop() without inplace returns a new frame, so the
    # caller's DataFrame keeps all of its columns.
    trimmed_df = zones_df.drop(columns=['Shape_Leng', 'Shape_Area', 'zone', 'LocationID', 'borough'])

    # Convert DF to dictionary
    zones_dict = trimmed_df.set_index('OBJECTID').to_dict('index')

    # Add lat/long to each zone
    for zone_info in zones_dict.values():
        # Compute the centroid once per zone rather than once per coordinate
        centroid = shapely.wkt.loads(zone_info['the_geom']).centroid
        zone_info['long'] = centroid.x
        zone_info['lat'] = centroid.y

    return zones_dict

In [18]:
def get_zone_lat(zones_dict, zone_id):
    """Return the ``'lat'`` value stored for ``zone_id`` in ``zones_dict``.

    Raises ``KeyError`` when the zone, or its ``'lat'`` entry, is missing.
    """
    zone_info = zones_dict[zone_id]
    return zone_info['lat']

In [19]:
def get_zone_long(zones_dict, zone_id):
    """Return the ``'long'`` value stored for ``zone_id`` in ``zones_dict``.

    Raises ``KeyError`` when the zone, or its ``'long'`` entry, is missing.
    """
    zone_info = zones_dict[zone_id]
    return zone_info['long']

In [20]:
def transform_dataset(context : MLClientCtx, taxi_records_csv_path: str, zones_csv_path: str):
    """Clean the taxi records, enrich them with location and time features, and log the result.

    Steps, in order: load both inputs through ``get_dataitem(...).as_df()``, filter the
    records with ``clean_df``, add pickup/drop-off coordinates looked up by zone ID, add
    the date-time columns (``add_datetime_info``) and the landmark distances
    (``add_airport_dist``), convert the four coordinate columns to radians, drop the
    columns listed near the end of the body, and log the frame as the
    ``nyc-taxi-dataset-transformed`` CSV artifact under ``<context.artifact_path>/data``.

    :param context: MLRun execution context (logger, ``artifact_path``, ``log_dataset``).
    :param taxi_records_csv_path: location of the taxi records, resolved with ``get_dataitem``.
    :param zones_csv_path: location of the zones table, resolved with ``get_dataitem``.
    :raises KeyError: if a pickup or drop-off zone ID has no entry in the zones lookup.
    """
    context.logger.info('Begin datasets transform')

    context.logger.info('zones_csv_path: ' + str(zones_csv_path))

    zones_df = get_dataitem(zones_csv_path).as_df()

    # Get zones dictionary
    zones_dict = get_zones_dict(zones_df)

    train_df = get_dataitem(taxi_records_csv_path).as_df()

    # Clean DF
    train_df = clean_df(train_df)

    # Enrich DF: look the coordinates up from the location-ID column alone.
    # Series.map hands one scalar per record to the helper, whereas
    # DataFrame.apply(axis=1) builds a full row object for every record. The lookup
    # result and the KeyError on an unknown zone are unchanged, and an empty frame
    # simply yields empty columns.
    train_df['pickup_latitude'] = train_df['PULocationID'].map(lambda zone_id: get_zone_lat(zones_dict, zone_id))
    train_df['pickup_longitude'] = train_df['PULocationID'].map(lambda zone_id: get_zone_long(zones_dict, zone_id))
    train_df['dropoff_latitude'] = train_df['DOLocationID'].map(lambda zone_id: get_zone_lat(zones_dict, zone_id))
    train_df['dropoff_longitude'] = train_df['DOLocationID'].map(lambda zone_id: get_zone_long(zones_dict, zone_id))

    train_df = add_datetime_info(train_df)
    # Must run before the radian conversion below: sphere_dist expects degrees
    train_df = add_airport_dist(train_df)

    for coordinate in ('pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude'):
        train_df[coordinate] = radian_conv(train_df[coordinate])

    # Drop raw columns that are not used as features. errors='ignore' tolerates
    # inputs that lack some of them.
    train_df = train_df.drop(columns=['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime', 'congestion_surcharge', 'improvement_surcharge', 'pickup_datetime',
                                      'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'total_amount', 'RatecodeID', 'store_and_fwd_flag',
                                      'PULocationID', 'DOLocationID', 'payment_type'],
                             errors='ignore')

    # Save dataset to artifact.
    # NOTE(review): index=False is not passed here, and train_model drops the first
    # column of the dataset it reads back, presumably the index written by this call.
    # Keep the two in sync.
    target_path = path.join(context.artifact_path, 'data')
    context.log_dataset('nyc-taxi-dataset-transformed', df=train_df, artifact_path=target_path, format='csv')

    context.logger.info('End dataset transform')

In [21]:
# LightGBM training parameters; train_model passes this dict unchanged to lgbm.train().
# NOTE(review): several keys look like overlapping LightGBM aliases — confirm which value
# actually takes effect before tuning:
#   * 'subsample' (0.8) vs 'bagging_fraction' (1) appear to name the same setting.
#   * 'num_rounds' (50000) appears to alias the number of boosting rounds, while
#     train_model also passes num_boost_round=10000 to lgbm.train().
#   * 'scale_pos_weight' is presumably only meaningful for classification objectives;
#     the objective here is 'regression'.
params = {
        'boosting_type':'gbdt',
        'objective': 'regression',
        'nthread': 4,
        'num_leaves': 31,
        'learning_rate': 0.05,
        # -1: presumably "no depth limit" — confirm
        'max_depth': -1,
        'subsample': 0.8,
        'bagging_fraction' : 1,
        'max_bin' : 5000 ,
        'bagging_freq': 20,
        'colsample_bytree': 0.6,
        # Evaluation metric; the training log reports "valid_0's rmse"
        'metric': 'rmse',
        'min_split_gain': 0.5,
        'min_child_weight': 1,
        'min_child_samples': 10,
        'scale_pos_weight':1,
        'zero_as_missing': True,
        # Fixed seed for repeatable training runs
        'seed':0,
        'num_rounds':50000
    }

In [22]:
def train_model(context: MLClientCtx, input_ds: str):
    """Train a LightGBM model on the transformed dataset and log it as ``FareModel``.

    The dataset is loaded through ``get_dataitem(input_ds).as_df()``. ``fare_amount`` is
    the target; the first of the remaining columns is discarded by position and the rest
    are the features. 10% of the rows are held out (``random_state=123``) as the
    validation set used for early stopping. Training uses the module-level ``params``.
    The pickled model is logged under the ``models`` artifact sub-path as ``FareModel.pkl``.

    :param context: MLRun execution context (logger, ``artifact_subpath``, ``log_model``).
    :param input_ds: location of the transformed dataset, resolved with ``get_dataitem``.
    """
    log = context.logger.info
    log('Begin training')
    log('LGBM version is ' + str(lgbm.__version__))

    full_df = get_dataitem(input_ds).as_df()

    target = full_df['fare_amount']
    features = full_df.drop(columns=['fare_amount'])
    # Discard the first remaining column by position
    features = features.drop(features.columns[[0]], axis=1)

    x_train, x_test, y_train, y_test = train_test_split(features, target, random_state=123, test_size=0.10)

    def as_lgbm_dataset(x, y):
        # Same construction for the training and validation sets
        return lgbm.Dataset(x, y, silent=False, categorical_feature=['year','month','day','weekday'])

    train_set = as_lgbm_dataset(x_train, y_train)
    valid_set = as_lgbm_dataset(x_test, y_test)
    model = lgbm.train(params, train_set=train_set, num_boost_round=10000,
                       early_stopping_rounds=500, verbose_eval=500, valid_sets=valid_set)

    context.log_model('FareModel',
                      body=dumps(model),
                      artifact_path=context.artifact_subpath("models"),
                      model_file="FareModel.pkl")

    log('End training')

In [23]:
# nuclio: end-code

## Run fetch_data locally

In [24]:
# Input locations used by the runs below: a subset of the 2019-01 yellow-taxi trip
# records and the taxi-zones table (per the file names).
taxi_records_csv_path = 'https://s3.wasabisys.com/iguazio/data/Taxi/yellow_tripdata_2019-01_subset.csv'
zones_csv_path = 'https://s3.wasabisys.com/iguazio/data/Taxi/taxi_zones.csv'

In [25]:
# These names were already imported in an earlier cell; the import is repeated here.
from mlrun import run_local, NewTask, mlconf, import_function, mount_v3io

# Run the fetch_data handler through run_local. The two URLs are passed as `inputs`,
# and fetch_data calls .as_df() on each of them.
fetch_data_run = run_local(name='fetch_data',
                         handler=fetch_data,
                         inputs={'taxi_records_csv_path': taxi_records_csv_path,
                                 'zones_csv_path': zones_csv_path},
                         project=project_name, artifact_path=artifact_path)

> 2020-12-06 13:55:46,132 [info] starting run fetch_data uid=72142da525594806a079da34281fbaf5  -> http://mlrun-api:8080
> 2020-12-06 13:55:46,216 [info] Reading taxi records data from https://s3.wasabisys.com/iguazio/data/Taxi/yellow_tripdata_2019-01_subset.csv
> 2020-12-06 13:55:54,040 [info] Reading zones data from https://s3.wasabisys.com/iguazio/data/Taxi/taxi_zones.csv
> 2020-12-06 13:55:55,964 [info] Saving datasets to /User/taxi/jobs/data ...


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
nyc-taxi-edmond,...281fbaf5,0,Dec 06 13:55:46,completed,fetch_data,v3io_user=edmondkind=handlerowner=edmondhost=jupyter-edmond-5b458fc996-fp8p9,taxi_records_csv_pathzones_csv_path,,,nyc-taxi-datasetzones-dataset


to track results use .show() or .logs() or in CLI: 
!mlrun get run 72142da525594806a079da34281fbaf5 --project nyc-taxi-edmond , !mlrun logs 72142da525594806a079da34281fbaf5 --project nyc-taxi-edmond
> 2020-12-06 13:56:11,304 [info] run executed, status=completed


In [26]:
# Show the artifacts logged by the local run (artifact name -> store:// URI)
fetch_data_run.outputs

{'nyc-taxi-dataset': 'store://nyc-taxi-edmond/fetch_data_nyc-taxi-dataset#72142da525594806a079da34281fbaf5',
 'zones-dataset': 'store://nyc-taxi-edmond/fetch_data_zones-dataset#72142da525594806a079da34281fbaf5'}

## Run fetch_data on cluster

### Prepare the cluster function

Convert the notebook code into an MLRun function and build a custom image for it (the image needs the extra packages installed above, such as shapely).

In [None]:
from mlrun import code_to_function

# Create an MLRun function named 'cluster-function'.
# NOTE(review): with no filename given, code_to_function presumably takes the code between
# the "# nuclio: start-code" / "# nuclio: end-code" markers of this notebook — confirm.
gen_func = code_to_function(name='cluster-function')
# Register the function on the project
project.set_function(gen_func)
# Fetch it back from the project and apply the v3io mount
cluster_func = project.func('cluster-function').apply(mount_v3io())
# Deploy the function; the runs below execute their handlers on it
cluster_func.deploy()

In [28]:
# Same fetch_data step as the local run, now executed through cluster_func. The handler
# is given by name. This rebinds fetch_data_run, so the following cells use the
# cluster run's outputs.
fetch_data_run = cluster_func.run(name='fetch_data',
                                 handler='fetch_data',
                                 inputs={'taxi_records_csv_path': taxi_records_csv_path,
                                         'zones_csv_path': zones_csv_path},
                                 artifact_path=artifact_path)

> 2020-12-06 13:58:05,184 [info] starting run fetch_data uid=084149a790b44bfb887cb0b30bfadcc7  -> http://mlrun-api:8080
> 2020-12-06 13:58:05,325 [info] Job is running in the background, pod: fetch-data-xpdpg
> 2020-12-06 13:58:11,076 [info] Reading taxi records data from https://s3.wasabisys.com/iguazio/data/Taxi/yellow_tripdata_2019-01_subset.csv
> 2020-12-06 13:58:19,243 [info] Reading zones data from https://s3.wasabisys.com/iguazio/data/Taxi/taxi_zones.csv
> 2020-12-06 13:58:21,089 [info] Saving datasets to /User/taxi/jobs/data ...
> 2020-12-06 13:58:36,866 [info] run executed, status=completed
final state: completed


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
nyc-taxi-edmond,...0bfadcc7,0,Dec 06 13:58:11,completed,fetch_data,v3io_user=edmondkind=jobowner=edmondhost=fetch-data-xpdpg,taxi_records_csv_pathzones_csv_path,,,nyc-taxi-datasetzones-dataset


to track results use .show() or .logs() or in CLI: 
!mlrun get run 084149a790b44bfb887cb0b30bfadcc7 --project nyc-taxi-edmond , !mlrun logs 084149a790b44bfb887cb0b30bfadcc7 --project nyc-taxi-edmond
> 2020-12-06 13:58:44,762 [info] run executed, status=completed


In [29]:
# Show the artifacts logged by the cluster run (artifact name -> store:// URI)
fetch_data_run.outputs

{'nyc-taxi-dataset': 'store://nyc-taxi-edmond/fetch_data_nyc-taxi-dataset#084149a790b44bfb887cb0b30bfadcc7',
 'zones-dataset': 'store://nyc-taxi-edmond/fetch_data_zones-dataset#084149a790b44bfb887cb0b30bfadcc7'}

## Run transform_dataset

In [30]:
# Run transform_dataset on the two datasets logged by fetch_data, referenced by the
# store:// URIs in fetch_data_run.outputs.
transform_dataset_run = cluster_func.run(name='transform_dataset',
                                 handler='transform_dataset',
                                 inputs={'taxi_records_csv_path': fetch_data_run.outputs['nyc-taxi-dataset'],
                                        'zones_csv_path': fetch_data_run.outputs['zones-dataset']},
                                 artifact_path=artifact_path)

> 2020-12-06 13:58:58,685 [info] starting run transform_dataset uid=ec01ada072f14186acfa94f6a81ff27d  -> http://mlrun-api:8080
> 2020-12-06 13:58:58,824 [info] Job is running in the background, pod: transform-dataset-x8ps9
> 2020-12-06 13:59:04,391 [info] Begin datasets transform
> 2020-12-06 13:59:04,391 [info] zones_csv_path: /User/taxi/jobs/data/zones-dataset.csv
> 2020-12-06 14:00:27,852 [info] End dataset transform
> 2020-12-06 14:00:27,915 [info] run executed, status=completed
final state: completed


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
nyc-taxi-edmond,...a81ff27d,0,Dec 06 13:59:04,completed,transform_dataset,v3io_user=edmondkind=jobowner=edmondhost=transform-dataset-x8ps9,taxi_records_csv_pathzones_csv_path,,,nyc-taxi-dataset-transformed


to track results use .show() or .logs() or in CLI: 
!mlrun get run ec01ada072f14186acfa94f6a81ff27d --project nyc-taxi-edmond , !mlrun logs ec01ada072f14186acfa94f6a81ff27d --project nyc-taxi-edmond
> 2020-12-06 14:00:28,578 [info] run executed, status=completed


In [31]:
# Show the artifact logged by the transform run (artifact name -> store:// URI)
transform_dataset_run.outputs

{'nyc-taxi-dataset-transformed': 'store://nyc-taxi-edmond/transform_dataset_nyc-taxi-dataset-transformed#ec01ada072f14186acfa94f6a81ff27d'}

## Train model

In [32]:
# Train on the transformed dataset produced by the previous run
train_model_run = cluster_func.run(name='train_model',
                                    handler='train_model',
                                    inputs={'input_ds': transform_dataset_run.outputs['nyc-taxi-dataset-transformed']},
                                    artifact_path=artifact_path)

> 2020-12-06 14:00:37,466 [info] starting run train_model uid=7259a62a662d4fdb920a733903f7e55e  -> http://mlrun-api:8080
> 2020-12-06 14:00:37,685 [info] Job is running in the background, pod: train-model-krg9t
> 2020-12-06 14:00:43,209 [info] Begin training
> 2020-12-06 14:00:43,209 [info] LGBM version is 3.1.0
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 23961
[LightGBM] [Info] Number of data points in the train set: 879294, number of used features: 16
[LightGBM] [Info] Start training from score 12.418691
Training until validation scores don't improve for 500 rounds
[500]	valid_0's rmse: 3.01302
[1000]	valid_0's rmse: 2.98594
[1500]	valid_0's rmse: 2.98293
[2000]	valid_0's rmse: 2.98665
Early stopping, best iteration is:
[1551]	valid_0's rmse: 2.98227
> 2020-12-06 14:01:41,884 [info] End training
> 2020-12-06 14:01:41,958 [info] run executed, status=completed
final state: completed

project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
nyc-taxi-edmond,...03f7e55e,0,Dec 06 14:00:43,completed,train_model,v3io_user=edmondkind=jobowner=edmondhost=train-model-krg9t,input_ds,,,FareModel


to track results use .show() or .logs() or in CLI: 
!mlrun get run 7259a62a662d4fdb920a733903f7e55e --project nyc-taxi-edmond , !mlrun logs 7259a62a662d4fdb920a733903f7e55e --project nyc-taxi-edmond
> 2020-12-06 14:01:46,370 [info] run executed, status=completed


In [33]:
# Show the model artifact logged by the training run (artifact name -> store:// URI)
train_model_run.outputs

{'FareModel': 'store://nyc-taxi-edmond/train_model_FareModel#7259a62a662d4fdb920a733903f7e55e'}

## Serving

The model-serving class (`LGBMModel`) is defined in the companion notebook `model-serving.ipynb`.

In [34]:
# Register the serving notebook (expected next to this one, in the current working
# directory) as the project function 'taxi-serving' and apply the v3io mount.
serving = project.set_function(path.abspath('.') + '/model-serving.ipynb', name='taxi-serving').apply(mount_v3io())
# Class used to serve models that do not name their own class
serving.spec.default_class = 'LGBMModel'
# Serve the model logged by the training run under the name 'taxi-serving'
serving.add_model('taxi-serving', train_model_run.outputs['FareModel'])
# NOTE(review): the variable name is misleading — it holds whatever serving.deploy() returns, not a "get data" run.
get_data_run = serving.deploy()

> 2020-12-06 14:02:02,235 [info] deploy started
[nuclio] 2020-12-06 14:02:04,369 (info) Build complete
[nuclio] 2020-12-06 14:02:15,550 done updating nyc-taxi-edmond-taxi-serving, function address: 192.168.224.70:30319


In [35]:
# Smoke-test request: one row of 16 numeric values (the training log reports 16 used
# features). The numbers are placeholders, not real trip features.
my_data = '''{"inputs":[[5.1, 3.5, 1.4, 3, 5.1, 3.5, 1.4, 0.2, 5.1, 3.5, 1.4, 0.2, 5.1, 3.5, 1.4, 0.2]]}'''
# Call the predict endpoint of the model registered as 'taxi-serving'
serving.invoke('/v2/models/taxi-serving/predict', my_data)

{'id': '9eb008ac-5ff2-460d-b6c0-3bcdc53a10ec',
 'model_name': 'taxi-serving',
 'outputs': [25.146049914725666]}

## Kubeflow Pipeline

In [84]:
%%writefile {path.join(project_path, 'workflow.py')}
from kfp import dsl
from mlrun import mount_v3io

funcs = {}
taxi_records_csv_path = 'https://s3.wasabisys.com/iguazio/data/Taxi/yellow_tripdata_2019-01_subset.csv'
zones_csv_path = 'https://s3.wasabisys.com/iguazio/data/Taxi/taxi_zones.csv'

# init functions is used to configure function resources and local settings
def init_functions(functions: dict, project=None, secrets=None):
    """Apply the v3io mount to every function in ``functions``.

    :param functions: mapping of function name to function object; each object's
        ``apply`` method is called with a fresh ``mount_v3io()`` modifier.
    :param project: accepted but not used here.
    :param secrets: accepted but not used here.
    """
    for function_name in functions:
        functions[function_name].apply(mount_v3io())

# Pipeline definition: build image -> fetch_data -> transform_dataset -> train -> deploy serving.
# `funcs` is the module-level dict declared above. Nothing in this file fills it, so it
# is presumably populated with the project's functions before the pipeline is compiled — confirm.
@dsl.pipeline(
    name="NYC Taxi Demo",
    description="Convert ML script to MLRun"
)

def kfpipeline():
    
    # build our ingestion function (container image); skip_deployed=True is passed so an
    # already-deployed function is presumably not rebuilt — confirm
    builder = funcs['cluster-function'].deploy_step(skip_deployed=True)
    
    # run the ingestion function with the new image and params;
    # `outputs` names the artifacts that later steps reference
    ingest = funcs['cluster-function'].as_step(
        name="fetch_data",
        handler='fetch_data',
        image=builder.outputs['image'],
        inputs={'taxi_records_csv_path': taxi_records_csv_path,
                'zones_csv_path': zones_csv_path},
        outputs=['nyc-taxi-dataset', 'zones-dataset'])

    # Clean and enrich the taxi records using the zones data set
    # (consumes both outputs of the fetch_data step)
    transform = funcs["cluster-function"].as_step(
        name="transform_dataset",
        handler='transform_dataset',
        inputs={"taxi_records_csv_path": ingest.outputs['nyc-taxi-dataset'],
                "zones_csv_path" : ingest.outputs['zones-dataset']},
        outputs=['nyc-taxi-dataset-transformed'])

    # Train the model on the transformed data set
    train = funcs["cluster-function"].as_step(
        name="train",
        handler="train_model",
        inputs={"input_ds" : transform.outputs['nyc-taxi-dataset-transformed']},
        outputs=['FareModel'])
    
    # Deploy the serving function with the trained model registered as 'taxi-serving_v1'
    deploy = funcs["taxi-serving"].deploy_step(models={"taxi-serving_v1": train.outputs['FareModel']}, tag='v2')

Overwriting /User/taxi/conf/workflow.py


In [85]:
# Register the workflow file written above under the name 'main'.
# NOTE(review): embed=True presumably stores the workflow code inside the project spec — confirm.
project.set_workflow('main', 'workflow.py', embed=True)

In [86]:
# Persist the project definition (functions and workflow registered above)
project.save()

In [87]:
# NOTE: this rebinds the notebook-level `artifact_path`. Earlier cells re-run after this
# one will write their artifacts under ./pipe/... instead of ./jobs.
# '{{workflow.uid}}' is presumably a placeholder substituted per pipeline run — confirm.
artifact_path = path.abspath('./pipe/{{workflow.uid}}')
# Run the 'main' workflow registered on the project.
# NOTE(review): watch=True presumably blocks until the pipeline finishes (the log below
# shows "waiting for pipeline run completion"), and dirty=True presumably allows running
# with uncommitted git changes — confirm both.
run_id = project.run(
    'main',
    arguments={}, 
    artifact_path=artifact_path, 
    dirty=True, watch=True)

> 2020-12-06 13:46:27,751 [info] Pipeline run id=c1a0aebb-0862-4d93-9335-1f8bb181f292, check UI or DB for progress
> 2020-12-06 13:46:27,752 [info] waiting for pipeline run completion


uid,start,state,name,results,artifacts
...09b7cd24,Dec 06 13:49:14,completed,train,,FareModel
...69164f18,Dec 06 13:47:38,completed,transform_dataset,,nyc-taxi-dataset-transformed
...ab4a943b,Dec 06 13:46:42,completed,fetch_data,,nyc-taxi-datasetzones-dataset
