# Refactored As Operational Pipeline (with MLRun)

In [None]:
# Install prerequisites
%pip install mlrun lightgbm shapely

## Create an MLRun project and configuration

In [None]:
from os import path
import mlrun

# Base project name; later cells reuse it when creating the project object.
project_name_base = 'nyc-taxi'

# Configure the MLRun environment for this notebook. The call returns the
# resolved project name and the default artifact path.
# NOTE(review): user_project=True presumably appends the current user name to
# the base name (the recorded runs show "nyc-taxi-iguazio") - confirm.
project_name, artifact_path = mlrun.set_environment(
    project=project_name_base,
    user_project=True,
)

# Show the resolved values so the reader knows where runs and artifacts go.
print(f'Project name: {project_name}', f'Artifact path: {artifact_path}', sep='\n')

## Define Nuclio and MLRun Functions

In [31]:
# nuclio: start-code

In [32]:
from os import path
import numpy as np 
import pandas as pd
import datetime as dt
from sklearn.model_selection import train_test_split
import lightgbm as lgbm
from mlrun.execution import MLClientCtx
from mlrun.datastore import DataItem
from pickle import dumps
import shapely.wkt

In [33]:
def get_zones_dict(zones_url):
    """
    Load the taxi-zones CSV and return a per-zone dictionary.

    NOTE(review): a second `get_zones_dict`, taking a DataFrame instead of a
    URL, is defined further down in this notebook and replaces this one once
    that cell has run.

    :param zones_url: path/URL/buffer accepted by ``pandas.read_csv``
    :return: dict keyed by ``OBJECTID``; each value holds the remaining CSV
             fields plus 'long' and 'lat' taken from the centroid of the
             zone's WKT geometry ('the_geom')
    """
    # Fields that are not needed downstream
    unused_columns = ['Shape_Leng', 'Shape_Area', 'zone', 'LocationID', 'borough']
    zones_table = pd.read_csv(zones_url).drop(unused_columns, axis=1)

    # One dictionary entry per zone, keyed by OBJECTID
    zones_by_id = zones_table.set_index('OBJECTID').to_dict('index')

    # Attach the centroid coordinates of each zone's geometry
    for attributes in zones_by_id.values():
        centroid = shapely.wkt.loads(attributes['the_geom']).centroid
        attributes['long'] = centroid.x
        attributes['lat'] = centroid.y

    return zones_by_id

In [34]:
def get_zone_lat(zones_dict, zone_id):
    """Return the value stored under 'lat' for ``zone_id`` (KeyError if the zone is unknown)."""
    zone_record = zones_dict[zone_id]
    return zone_record['lat']

In [35]:
def get_zone_long(zones_dict, zone_id):
    """Return the value stored under 'long' for ``zone_id`` (KeyError if the zone is unknown)."""
    zone_record = zones_dict[zone_id]
    return zone_record['long']

In [36]:
def clean_df(df, max_fare=500, max_zone_id=263):
    """
    Return the rows of ``df`` that have a plausible fare and valid zone IDs.

    A row is kept when ``0 < fare_amount <= max_fare`` and both
    ``PULocationID`` and ``DOLocationID`` lie in ``1..max_zone_id``.

    :param df:          DataFrame with 'fare_amount', 'PULocationID' and
                        'DOLocationID' columns
    :param max_fare:    largest fare that is kept (inclusive)
    :param max_zone_id: largest zone ID that is kept (inclusive)
    :return: a new, independent DataFrame (original index labels preserved);
             ``df`` itself is never modified
    """
    valid_fare = (df.fare_amount > 0) & (df.fare_amount <= max_fare)
    valid_pickup = (df.PULocationID > 0) & (df.PULocationID <= max_zone_id)
    valid_dropoff = (df.DOLocationID > 0) & (df.DOLocationID <= max_zone_id)

    # Explicit copy: callers add columns to the result, and assigning into a
    # filtered selection triggers pandas' SettingWithCopyWarning.
    return df[valid_fare & valid_pickup & valid_dropoff].copy()

In [37]:
# To Compute Haversine distance
def sphere_dist(pickup_lat, pickup_lon, dropoff_lat, dropoff_lon):
    """
    Return the haversine (great-circle) distance, in km, between the pickup
    and dropoff coordinates. Inputs are in degrees and may be scalars or
    array-likes accepted by numpy.
    """
    # Earth radius (km)
    earth_radius_km = 6371

    # Degrees -> radians
    lat_start = np.radians(pickup_lat)
    lon_start = np.radians(pickup_lon)
    lat_end = np.radians(dropoff_lat)
    lon_end = np.radians(dropoff_lon)

    # Half of the angular differences along each dimension
    half_dlat = (lat_end - lat_start) / 2.0
    half_dlon = (lon_end - lon_start) / 2.0

    # Haversine formula
    lat_term = np.sin(half_dlat) ** 2
    lon_term = np.cos(lat_start) * np.cos(lat_end) * np.sin(half_dlon) ** 2
    haversine = lat_term + lon_term

    return 2 * earth_radius_km * np.arcsin(np.sqrt(haversine))

In [38]:
def radian_conv(degree):
    """
    Convert an angle (scalar or array-like) from degrees to radians.
    """
    in_radians = np.radians(degree)
    return in_radians

In [39]:
def add_airport_dist(dataset):
    """
    Add one '<code>_dist' column per reference landmark and return `dataset`.

    For every landmark the column holds the distance pickup -> landmark plus
    the distance landmark -> dropoff, as computed by `sphere_dist`.

    jfk: John F. Kennedy International Airport
    ewr: Newark Liberty International Airport
    lga: LaGuardia Airport
    sol: Statue of Liberty
    nyc: New York City centre

    The input frame is modified in place and must provide the
    pickup/dropoff latitude and longitude columns.
    """
    # (latitude, longitude) of each landmark; the order defines the order in
    # which the new columns are appended
    landmarks = {
        'jfk': (40.639722, -73.778889),
        'ewr': (40.6925, -74.168611),
        'lga': (40.77725, -73.872611),
        'sol': (40.6892, -74.0445),
        'nyc': (40.7141667, -74.0063889),
    }

    start_lat, start_lon = dataset['pickup_latitude'], dataset['pickup_longitude']
    end_lat, end_lon = dataset['dropoff_latitude'], dataset['dropoff_longitude']

    # Compute every distance before touching the frame
    totals = {}
    for code, (landmark_lat, landmark_lon) in landmarks.items():
        to_landmark = sphere_dist(start_lat, start_lon, landmark_lat, landmark_lon)
        from_landmark = sphere_dist(landmark_lat, landmark_lon, end_lat, end_lon)
        totals[f'{code}_dist'] = to_landmark + from_landmark

    for column, values in totals.items():
        dataset[column] = values

    return dataset

In [40]:
def add_datetime_info(dataset):
    """
    Parse 'tpep_pickup_datetime' and add calendar features to `dataset`.

    Adds 'pickup_datetime' (parsed timestamps) followed by 'hour', 'day',
    'month', 'weekday' and 'year'. The frame is modified in place and returned.
    """
    # Parse the text timestamps with the fixed "YYYY-MM-DD HH:MM:SS" layout
    timestamps = pd.to_datetime(dataset['tpep_pickup_datetime'], format="%Y-%m-%d %H:%M:%S")
    dataset['pickup_datetime'] = timestamps

    # One integer column per calendar component, in this order
    for part in ('hour', 'day', 'month', 'weekday', 'year'):
        dataset[part] = getattr(dataset['pickup_datetime'].dt, part)

    return dataset

In [41]:
def fetch_data(context : MLClientCtx, taxi_records_csv_path: DataItem, zones_csv_path: DataItem):
    """
    Read the taxi-records and zones inputs and log both as CSV dataset artifacts.

    :param context:               MLRun execution context (logger, artifact path, artifact logging)
    :param taxi_records_csv_path: data item holding the taxi trip records
    :param zones_csv_path:        data item holding the taxi zones table

    Logs the artifacts 'nyc-taxi-dataset' and 'zones-dataset' under
    ``<context.artifact_path>/data``; nothing is returned.
    """
    context.logger.info(f'Reading taxi records data from {taxi_records_csv_path}')
    trips_frame = taxi_records_csv_path.as_df()

    context.logger.info(f'Reading zones data from {zones_csv_path}')
    zones_frame = zones_csv_path.as_df()

    target_path = path.join(context.artifact_path, 'data')
    context.logger.info(f'Saving datasets to {target_path} ...')

    # Store both data sets in the artifacts database, without the index column
    artifacts = (('nyc-taxi-dataset', trips_frame), ('zones-dataset', zones_frame))
    for key, frame in artifacts:
        context.log_dataset(key, df=frame, format='csv',
                            index=False, artifact_path=target_path)

In [42]:
def get_zones_dict(zones_df):
    """
    Build a per-zone dictionary from the taxi-zones DataFrame.

    :param zones_df: DataFrame with the columns 'OBJECTID', 'the_geom' (WKT
                     geometry), 'Shape_Leng', 'Shape_Area', 'zone',
                     'LocationID' and 'borough'
    :return: dict keyed by ``OBJECTID``; each value holds the remaining
             fields plus 'long' and 'lat' taken from the centroid of the
             zone's geometry
    :raises KeyError: if one of the expected columns is missing

    ``zones_df`` is left untouched, so the function can safely be called
    more than once with the same frame.
    """
    # Fields that are not needed downstream. `drop` without `inplace` returns
    # a new frame, so the caller's DataFrame is not modified.
    unused_columns = ['Shape_Leng', 'Shape_Area', 'zone', 'LocationID', 'borough']
    zones_table = zones_df.drop(columns=unused_columns)

    # Convert DF to dictionary, one entry per zone
    zones_dict = zones_table.set_index('OBJECTID').to_dict('index')

    # Add lat/long (geometry centroid) to each zone
    for zone_record in zones_dict.values():
        centroid = shapely.wkt.loads(zone_record['the_geom']).centroid
        zone_record['long'] = centroid.x
        zone_record['lat'] = centroid.y

    return zones_dict

In [43]:
def get_zone_lat(zones_dict, zone_id):
    """Look up ``zone_id`` in ``zones_dict`` and return its 'lat' entry."""
    # NOTE(review): same definition as the earlier `get_zone_lat` cell in this notebook.
    zone_entry = zones_dict[zone_id]
    return zone_entry['lat']

In [44]:
def get_zone_long(zones_dict, zone_id):
    """Look up ``zone_id`` in ``zones_dict`` and return its 'long' entry."""
    # NOTE(review): same definition as the earlier `get_zone_long` cell in this notebook.
    zone_entry = zones_dict[zone_id]
    return zone_entry['long']

In [45]:
def transform_dataset(context : MLClientCtx, taxi_records_csv_path: DataItem, zones_csv_path: DataItem):
    """
    Clean and enrich the taxi records, then log the result as a CSV dataset.

    Steps: build the zones dictionary, filter the records with `clean_df`,
    attach pickup/dropoff coordinates from the zone IDs, add date/time and
    landmark-distance features, convert the coordinate columns to radians and
    drop the columns that are not used for training.

    :param context:               MLRun execution context
    :param taxi_records_csv_path: data item holding the taxi trip records
    :param zones_csv_path:        data item holding the taxi zones table

    Logs the artifact 'nyc-taxi-dataset-transformed' under
    ``<context.artifact_path>/data``; nothing is returned.
    """
    context.logger.info('Begin datasets transform')

    context.logger.info('zones_csv_path: ' + str(zones_csv_path))

    zones_df = zones_csv_path.as_df()

    # Get zones dictionary
    zones_dict = get_zones_dict(zones_df)

    # Clean DF. Work on an explicit copy so that the column assignments below
    # write to an independent frame rather than to a filtered selection
    # (which triggers pandas' SettingWithCopyWarning).
    train_df = clean_df(taxi_records_csv_path.as_df()).copy()

    # Enrich DF. Map over the ID columns element by element instead of
    # `DataFrame.apply(axis=1)`, which builds a Series object for every row
    # four times over. Unknown zone IDs still raise KeyError via the helpers.
    pickup_ids = train_df['PULocationID']
    dropoff_ids = train_df['DOLocationID']
    train_df['pickup_latitude'] = pickup_ids.map(lambda zone_id: get_zone_lat(zones_dict, zone_id))
    train_df['pickup_longitude'] = pickup_ids.map(lambda zone_id: get_zone_long(zones_dict, zone_id))
    train_df['dropoff_latitude'] = dropoff_ids.map(lambda zone_id: get_zone_lat(zones_dict, zone_id))
    train_df['dropoff_longitude'] = dropoff_ids.map(lambda zone_id: get_zone_long(zones_dict, zone_id))

    train_df = add_datetime_info(train_df)
    # The distance features are computed while the coordinates are still in degrees
    train_df = add_airport_dist(train_df)

    # The coordinate columns themselves are stored in radians
    for column in ('pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude'):
        train_df[column] = radian_conv(train_df[column])

    # Columns that are not used as features; missing ones are ignored
    unused_columns = ['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime', 'congestion_surcharge',
                      'improvement_surcharge', 'pickup_datetime', 'extra', 'mta_tax', 'tip_amount',
                      'tolls_amount', 'total_amount', 'RatecodeID', 'store_and_fwd_flag',
                      'PULocationID', 'DOLocationID', 'payment_type']
    train_df = train_df.drop(columns=unused_columns, errors='ignore')

    # Save dataset to artifact
    # NOTE(review): no `index=False` is passed here, and `train_model` drops the
    # first remaining column positionally - presumably that is the written
    # index column; keep the two in sync.
    target_path = path.join(context.artifact_path, 'data')
    context.log_dataset('nyc-taxi-dataset-transformed', df=train_df, artifact_path=target_path, format='csv')

    context.logger.info('End dataset transform')

In [46]:
# LightGBM training parameters passed to `lgbm.train` in `train_model`.
# NOTE(review): `num_rounds` is set here while `train_model` also passes
# `num_boost_round=10000`; the recorded training log contains a
# "Found `num_rounds` in ..." message, so LightGBM noticed the overlap -
# confirm which value is intended.
# NOTE(review): the LightGBM parameter docs list `subsample` as an alias of
# `bagging_fraction`; both are set here with different values - confirm.
params = dict(
    # Booster type and learning task
    boosting_type='gbdt',
    objective='regression',
    nthread=4,
    # Tree shape and learning rate
    num_leaves=31,
    learning_rate=0.05,
    max_depth=-1,
    # Row / feature sampling and binning
    subsample=0.8,
    bagging_fraction=1,
    max_bin=5000,
    bagging_freq=20,
    colsample_bytree=0.6,
    # Evaluation metric and split constraints
    metric='rmse',
    min_split_gain=0.5,
    min_child_weight=1,
    min_child_samples=10,
    scale_pos_weight=1,
    zero_as_missing=True,
    # Reproducibility and number of boosting rounds
    seed=0,
    num_rounds=50000,
)

In [47]:
def train_model(context: MLClientCtx, input_ds: DataItem):
    """
    Train a LightGBM model that predicts 'fare_amount' and log it as 'FareModel'.

    The input is split 90/10 into training and validation data; training
    stops early when the validation metric has not improved for 500 rounds.
    The trained booster is pickled and logged as 'FareModel.pkl' under the
    "models" artifact sub-path.

    :param context:  MLRun execution context
    :param input_ds: data item holding the transformed dataset (must contain
                     'fare_amount' and the 'year', 'month', 'day' and
                     'weekday' columns)
    """
    context.logger.info('Begin training')
    context.logger.info('LGBM version is ' + str(lgbm.__version__))

    train_df = input_ds.as_df()

    y = train_df['fare_amount']

    train_df = train_df.drop(columns=['fare_amount'])
    # NOTE(review): positional drop of the first remaining column - presumably
    # the index column written by `transform_dataset`; confirm before changing
    # how that dataset is logged.
    train_df = train_df.drop(train_df.columns[[0]], axis=1)
    x_train, x_test, y_train, y_test = train_test_split(train_df, y, random_state=123, test_size=0.10)

    # `silent=False` was the default and the argument no longer exists in
    # LightGBM 4.x, so it is simply not passed.
    categorical = ['year', 'month', 'day', 'weekday']
    train_set = lgbm.Dataset(x_train, y_train, categorical_feature=categorical)
    valid_set = lgbm.Dataset(x_test, y_test, categorical_feature=categorical)

    # NOTE(review): `params` also contains `num_rounds`; the recorded training
    # log shows LightGBM noticing it next to `num_boost_round` - confirm which
    # value is intended.
    if hasattr(lgbm, 'log_evaluation'):
        # Newer LightGBM: early stopping and periodic logging are callbacks
        # (the `early_stopping_rounds` / `verbose_eval` arguments were removed in 4.x).
        model = lgbm.train(params, train_set=train_set, num_boost_round=10000,
                           valid_sets=[valid_set],
                           callbacks=[lgbm.early_stopping(stopping_rounds=500),
                                      lgbm.log_evaluation(period=500)])
    else:
        # Older LightGBM without `log_evaluation`: keep the original call.
        model = lgbm.train(params, train_set=train_set, num_boost_round=10000,
                           early_stopping_rounds=500, verbose_eval=500,
                           valid_sets=[valid_set])

    context.log_model('FareModel',
                      body=dumps(model),
                      artifact_path=context.artifact_subpath("models"),
                      model_file="FareModel.pkl")

    context.logger.info('End training')

In [48]:
# nuclio: end-code

## Set Input Paths

In [49]:
import os

If you would like to change the location of the source data, set the `SAMPLE_DATA_SOURCE_URL_PREFIX` environment variable.

For example, set it to `/v3io/projects/demos-data/iguazio/`

In [51]:
# Source-data URL prefix: taken from SAMPLE_DATA_SOURCE_URL_PREFIX when that
# environment variable is set, otherwise the public bucket is used.
url_prefix = os.getenv('SAMPLE_DATA_SOURCE_URL_PREFIX', 'https://s3.wasabisys.com/iguazio/')

In [53]:
# Input file locations. Any trailing "/" on the prefix is stripped first so
# that exactly one separator joins the prefix and the relative path.
taxi_records_csv_path = url_prefix.rstrip('/') + '/data/Taxi/yellow_tripdata_2019-01_subset.csv'
zones_csv_path = url_prefix.rstrip('/') + '/data/Taxi/taxi_zones.csv'

## Convert Code to a Function

In [54]:
# Turn this notebook's code into an MLRun function object named 'taxi' of
# kind 'job', based on the 'mlrun/mlrun' image with 'lightgbm' and 'shapely'
# as additional requirements.
# NOTE(review): presumably only the cells between the `# nuclio: start-code`
# and `# nuclio: end-code` markers are packaged - confirm against the MLRun docs.
taxi_func = mlrun.code_to_function(name='taxi',
                                   kind='job',
                                   image='mlrun/mlrun',
                                   requirements=['lightgbm', 'shapely'])

## Run `fetch_data` Locally

We can test our code locally by calling the function with the `local` parameter set to `True`.

In [55]:
# Run the `fetch_data` handler in the local environment (local=True) with the
# two source CSV locations as inputs. The returned run object exposes the
# logged artifacts through `.outputs`.
fetch_data_run = taxi_func.run(handler='fetch_data',
                               inputs={'taxi_records_csv_path': taxi_records_csv_path,
                                       'zones_csv_path': zones_csv_path},
                               local=True)

> 2021-05-27 11:09:54,522 [info] starting run taxi-fetch_data uid=671de4a7ab274a56ba8343d791039d98 DB=http://mlrun-api:8080
> 2021-05-27 11:09:54,712 [info] Reading taxi records data from /v3io/bigdata/demos-data/iguazio/data/Taxi/yellow_tripdata_2019-01_subset.csv
> 2021-05-27 11:09:56,673 [info] Reading zones data from /v3io/bigdata/demos-data/iguazio/data/Taxi/taxi_zones.csv
> 2021-05-27 11:09:56,721 [info] Saving datasets to v3io:///projects/nyc-taxi-iguazio/artifacts/data ...


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
nyc-taxi-iguazio,...91039d98,0,May 27 11:09:54,completed,taxi-fetch_data,v3io_user=iguaziokind=owner=iguaziohost=jupyter-iguazio-b6785fb-w4gcp,taxi_records_csv_pathzones_csv_path,,,nyc-taxi-datasetzones-dataset


to track results use .show() or .logs() or in CLI: 
!mlrun get run 671de4a7ab274a56ba8343d791039d98 --project nyc-taxi-iguazio , !mlrun logs 671de4a7ab274a56ba8343d791039d98 --project nyc-taxi-iguazio
> 2021-05-27 11:10:04,495 [info] run executed, status=completed


In [56]:
fetch_data_run.outputs

{'nyc-taxi-dataset': 'store://artifacts/nyc-taxi-iguazio/taxi-fetch_data_nyc-taxi-dataset:671de4a7ab274a56ba8343d791039d98',
 'zones-dataset': 'store://artifacts/nyc-taxi-iguazio/taxi-fetch_data_zones-dataset:671de4a7ab274a56ba8343d791039d98'}

## Run on the Cluster

### Prepare Cluster Function

Prepare the MLRun function for the cluster and build a custom image for it (one that has the `lightgbm` and `shapely` requirements installed).

In [57]:
# `auto_mount` is also used by the serving cell later in this notebook.
from mlrun.platforms import auto_mount
# Apply the auto-mount modifier to the function before building it.
# NOTE(review): presumably this attaches the platform's default storage mount - confirm.
taxi_func.apply(auto_mount())
# Build the function image remotely; the recorded build log shows
# `pip install lightgbm shapely` running on top of the base image.
taxi_func.deploy()

> 2021-05-27 11:10:10,328 [info] starting remote build, image: .mlrun/func-nyc-taxi-iguazio-taxi:latest
E0527 11:10:32.931190       1 aws_credentials.go:77] while getting AWS credentials NoCredentialProviders: no valid providers in chain. Deprecated.
	For verbose messaging see aws.Config.CredentialsChainVerboseErrors
[36mINFO[0m[0020] Retrieving image manifest mlrun/mlrun:0.6.3  
[36mINFO[0m[0020] Retrieving image manifest mlrun/mlrun:0.6.3  
[36mINFO[0m[0021] Built cross stage deps: map[]                
[36mINFO[0m[0021] Retrieving image manifest mlrun/mlrun:0.6.3  
[36mINFO[0m[0021] Retrieving image manifest mlrun/mlrun:0.6.3  
[36mINFO[0m[0021] Executing 0 build triggers                   
[36mINFO[0m[0021] Unpacking rootfs as cmd RUN python -m pip install lightgbm shapely requires it. 
[36mINFO[0m[0036] RUN python -m pip install lightgbm shapely   
[36mINFO[0m[0036] Taking snapshot of full filesystem...        
[36mINFO[0m[0048] cmd: /bin/sh                    

True

In [58]:
# Run `fetch_data` again, this time without local=True; the recorded log shows
# the job running in a pod on the cluster. This rebinds `fetch_data_run`, so
# the following cells use the outputs of the cluster run.
fetch_data_run = taxi_func.run(name='fetch_data',
                               handler='fetch_data',
                               inputs={'taxi_records_csv_path': taxi_records_csv_path,
                                       'zones_csv_path': zones_csv_path})

> 2021-05-27 11:11:10,744 [info] starting run fetch_data uid=0048ad3cd28240d09d717368f81bf914 DB=http://mlrun-api:8080
> 2021-05-27 11:11:10,926 [info] Job is running in the background, pod: fetch-data-f9brw
> 2021-05-27 11:11:16,790 [info] Reading taxi records data from /v3io/bigdata/demos-data/iguazio/data/Taxi/yellow_tripdata_2019-01_subset.csv
> 2021-05-27 11:11:18,901 [info] Reading zones data from /v3io/bigdata/demos-data/iguazio/data/Taxi/taxi_zones.csv
> 2021-05-27 11:11:18,959 [info] Saving datasets to v3io:///projects/nyc-taxi-iguazio/artifacts/data ...
> 2021-05-27 11:11:26,658 [info] run executed, status=completed
final state: completed


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
nyc-taxi-iguazio,...f81bf914,0,May 27 11:11:16,completed,fetch_data,v3io_user=iguaziokind=jobowner=iguaziohost=fetch-data-f9brw,taxi_records_csv_pathzones_csv_path,,,nyc-taxi-datasetzones-dataset


to track results use .show() or .logs() or in CLI: 
!mlrun get run 0048ad3cd28240d09d717368f81bf914 --project nyc-taxi-iguazio , !mlrun logs 0048ad3cd28240d09d717368f81bf914 --project nyc-taxi-iguazio
> 2021-05-27 11:11:29,203 [info] run executed, status=completed


In [59]:
fetch_data_run.outputs

{'nyc-taxi-dataset': 'store://artifacts/nyc-taxi-iguazio/fetch_data_nyc-taxi-dataset:0048ad3cd28240d09d717368f81bf914',
 'zones-dataset': 'store://artifacts/nyc-taxi-iguazio/fetch_data_zones-dataset:0048ad3cd28240d09d717368f81bf914'}

## Transform the Dataset

In [60]:
# Run the `transform_dataset` handler, feeding it the two dataset artifacts
# logged by the `fetch_data` run (referenced through their store:// URIs).
transform_dataset_run = taxi_func.run(name='transform_dataset',
                                      handler='transform_dataset',
                                      inputs={'taxi_records_csv_path': fetch_data_run.outputs['nyc-taxi-dataset'],
                                              'zones_csv_path': fetch_data_run.outputs['zones-dataset']})

> 2021-05-27 11:11:30,984 [info] starting run transform_dataset uid=8fef1948f92946ccbf6b2b2ce4779edc DB=http://mlrun-api:8080
> 2021-05-27 11:11:31,179 [info] Job is running in the background, pod: transform-dataset-pkh2d
> 2021-05-27 11:11:36,587 [info] Begin datasets transform
> 2021-05-27 11:11:36,587 [info] zones_csv_path: v3io:///projects/nyc-taxi-iguazio/artifacts/data/zones-dataset.csv
> 2021-05-27 11:12:29,102 [info] End dataset transform
> 2021-05-27 11:12:29,141 [info] run executed, status=completed
final state: completed


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
nyc-taxi-iguazio,...e4779edc,0,May 27 11:11:36,completed,transform_dataset,v3io_user=iguaziokind=jobowner=iguaziohost=transform-dataset-pkh2d,taxi_records_csv_pathzones_csv_path,,,nyc-taxi-dataset-transformed


to track results use .show() or .logs() or in CLI: 
!mlrun get run 8fef1948f92946ccbf6b2b2ce4779edc --project nyc-taxi-iguazio , !mlrun logs 8fef1948f92946ccbf6b2b2ce4779edc --project nyc-taxi-iguazio
> 2021-05-27 11:12:36,695 [info] run executed, status=completed


In [61]:
transform_dataset_run.outputs

{'nyc-taxi-dataset-transformed': 'store://artifacts/nyc-taxi-iguazio/transform_dataset_nyc-taxi-dataset-transformed:8fef1948f92946ccbf6b2b2ce4779edc'}

## Train Model

In [62]:
# Run the `train_model` handler on the transformed dataset artifact; the run
# logs the trained model as the 'FareModel' artifact.
train_model_run = taxi_func.run(name='train_model',
                                handler='train_model',
                                inputs={'input_ds': transform_dataset_run.outputs['nyc-taxi-dataset-transformed']})

> 2021-05-27 11:12:41,075 [info] starting run train_model uid=1e3e08e3f9d24d35980947d1796b4309 DB=http://mlrun-api:8080
> 2021-05-27 11:12:41,272 [info] Job is running in the background, pod: train-model-cp7d2
> 2021-05-27 11:12:47,007 [info] Begin training
> 2021-05-27 11:12:47,007 [info] LGBM version is 3.2.1
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 23961
[LightGBM] [Info] Number of data points in the train set: 879294, number of used features: 16
[LightGBM] [Info] Start training from score 12.418691
Training until validation scores don't improve for 500 rounds
[500]	valid_0's rmse: 3.01302
[1000]	valid_0's rmse: 2.98594
[1500]	valid_0's rmse: 2.98293
[2000]	valid_0's rmse: 2.98666
Early stopping, best iteration is:
[1551]	valid_0's rmse: 2.98227
> 2021-05-27 11:13:28,994 [info] End training
> 2021-05-27 11:13:29,048 [info] run executed, status=completed
Found `num_rounds` in

project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
nyc-taxi-iguazio,...796b4309,0,May 27 11:12:46,completed,train_model,v3io_user=iguaziokind=jobowner=iguaziohost=train-model-cp7d2,input_ds,,,FareModel


to track results use .show() or .logs() or in CLI: 
!mlrun get run 1e3e08e3f9d24d35980947d1796b4309 --project nyc-taxi-iguazio , !mlrun logs 1e3e08e3f9d24d35980947d1796b4309 --project nyc-taxi-iguazio
> 2021-05-27 11:13:32,748 [info] run executed, status=completed


In [63]:
train_model_run.outputs

{'FareModel': 'store://artifacts/nyc-taxi-iguazio/train_model_FareModel:1e3e08e3f9d24d35980947d1796b4309'}

## Serving

The model serving class is in model-serving.ipynb.

In [64]:
# Create a function from the model-serving notebook and apply the auto-mount modifier.
# Relies on `path` and `auto_mount` imported in earlier cells.
serving = mlrun.code_to_function(filename=path.abspath('model-serving.ipynb')).apply(auto_mount())

# Use the 'LGBMModel' class as the default model class, register the trained
# model under the name 'taxi-serving', then deploy the function.
# NOTE(review): `LGBMModel` is expected to be defined in model-serving.ipynb - not visible here.
serving.spec.default_class = 'LGBMModel'
serving.add_model('taxi-serving', train_model_run.outputs['FareModel'])
serving_address = serving.deploy()

> 2021-05-27 11:13:54,077 [info] Starting remote function deploy
2021-05-27 11:13:54  (info) Deploying function
2021-05-27 11:13:54  (info) Building
2021-05-27 11:13:54  (info) Staging files and preparing base images
2021-05-27 11:13:54  (info) Building processor image
2021-05-27 11:13:55  (info) Build complete
2021-05-27 11:14:03  (info) Function deploy complete
> 2021-05-27 11:14:04,031 [info] function deployed, address=default-tenant.app.achikar-dev.iguazio-cd1.com:30817


In [65]:
# Sample request body: one input row with 16 values.
# NOTE(review): presumably these are the 16 training features in column order
# (the training log reports 16 used features) - confirm against the transformed dataset.
my_data = '''{"inputs":[[1,0.80,0.711950,-1.291073,0.712059,1.290988,13,1,1,1,2019,47.274013,40.386065,16.975747,26.587155,18.925788]]}'''
# Send the request to the predict endpoint of the 'taxi-serving' model.
serving.invoke('/v2/models/taxi-serving/predict', my_data)

{'id': 'bee77ccc-9e51-4192-b165-8cb208371bfc',
 'model_name': 'taxi-serving',
 'outputs': [9.52302976897415]}

## Kubeflow Pipeline

### Create Project Object

In [66]:
# Create the project object, using the local 'conf' directory as its context
# (the workflow file is written into the same directory further down).
project_path = path.abspath('conf')
project = mlrun.new_project(project_name_base,
                            context=project_path,
                            init_git=True,
                            user_project=True)

# Register the two functions used by the workflow, referenced by their db:// URIs.
# NOTE(review): this assumes both 'taxi' and 'model-serving' were already
# saved to the MLRun DB by the earlier deploy/run cells - confirm.
project.set_function(f'db://{project.name}/taxi')
project.set_function(f'db://{project.name}/model-serving')

<mlrun.runtimes.serving.ServingRuntime at 0x7f133c1da2d0>

### Create the Workflow

In [67]:
%%writefile {path.join(project_path, 'workflow.py')}
from kfp import dsl
from mlrun.platforms import auto_mount
import os

# Function objects used by the pipeline, looked up by name in `kfpipeline`.
# NOTE(review): presumably filled in by MLRun when the workflow is loaded - confirm.
funcs = {}

# Honor SAMPLE_DATA_SOURCE_URL_PREFIX when it is set and fall back to the
# public bucket otherwise. The variable is only read here, never overwritten,
# so the workflow uses the same data source as the notebook cells.
url_prefix = os.environ.get('SAMPLE_DATA_SOURCE_URL_PREFIX', 'https://s3.wasabisys.com/iguazio/')

# Strip any trailing "/" so the paths are valid whether or not the prefix ends with one.
taxi_records_csv_path = f'{url_prefix.rstrip("/")}/data/Taxi/yellow_tripdata_2019-01_subset.csv'
zones_csv_path = f'{url_prefix.rstrip("/")}/data/Taxi/taxi_zones.csv'

# init_functions is used to configure function resources and local settings
def init_functions(functions: dict, project=None, secrets=None):
    """Apply the ``auto_mount()`` modifier to every function in ``functions``.

    ``project`` and ``secrets`` are accepted but not used.
    """
    for fn in functions.values():
        fn.apply(auto_mount())

@dsl.pipeline(
    name="NYC Taxi Demo",
    description="Convert ML script to MLRun"
)
def kfpipeline():
    # Pipeline: build image -> fetch data -> transform -> train -> deploy serving.
    taxi_fn = funcs['taxi']

    # Build the function's container image (skipped when already deployed)
    image_build = taxi_fn.deploy_step(skip_deployed=True)

    # Ingest the source data using the image produced by the build step
    fetch_step = taxi_fn.as_step(
        name="fetch_data",
        handler='fetch_data',
        image=image_build.outputs['image'],
        inputs={
            'taxi_records_csv_path': taxi_records_csv_path,
            'zones_csv_path': zones_csv_path,
        },
        outputs=['nyc-taxi-dataset', 'zones-dataset'],
    )

    # Join and transform the data sets
    transform_step = taxi_fn.as_step(
        name="transform_dataset",
        handler='transform_dataset',
        inputs={
            "taxi_records_csv_path": fetch_step.outputs['nyc-taxi-dataset'],
            "zones_csv_path": fetch_step.outputs['zones-dataset'],
        },
        outputs=['nyc-taxi-dataset-transformed'],
    )

    # Train the model on the transformed data set
    train_step = taxi_fn.as_step(
        name="train",
        handler="train_model",
        inputs={"input_ds": transform_step.outputs['nyc-taxi-dataset-transformed']},
        outputs=['FareModel'],
    )

    # Deploy the serving function with the freshly trained model
    funcs["model-serving"].deploy_step(
        models={"taxi-serving_v1": train_step.outputs['FareModel']},
        tag='v2',
    )

Writing /User/demos/howto/converting-to-mlrun/conf/workflow.py


In [68]:
# Register workflow.py (written into the project context directory above) as the 'main' workflow.
# NOTE(review): embed=True presumably stores the workflow code inside the project spec - confirm.
project.set_workflow('main', 'workflow.py', embed=True)

In [69]:
# Persist the project definition, including the registered functions and workflow.
project.save()

### Run the Workflow

In [70]:
# Artifact location for the pipeline run; the '{{workflow.uid}}' placeholder
# is passed through literally.
# NOTE(review): presumably it is substituted with the workflow run ID at run time - confirm.
# NOTE(review): this rebinds `artifact_path`, which was first set by
# `mlrun.set_environment` at the top of the notebook.
artifact_path = path.abspath('./pipe/{{workflow.uid}}')
# Run the 'main' workflow and wait for it to finish (watch=True).
# NOTE(review): dirty=True presumably allows running with uncommitted git changes - confirm.
run_id = project.run(
    'main',
    arguments={}, 
    artifact_path=artifact_path, 
    dirty=True,
    watch=True)

> 2021-05-27 11:14:23,879 [info] using in-cluster config.


> 2021-05-27 11:14:24,412 [info] Pipeline run id=dddd739a-28fa-42d7-88cd-ba918bda0f65, check UI or DB for progress
> 2021-05-27 11:14:24,414 [info] waiting for pipeline run completion


uid,start,state,name,results,artifacts
...2abe98c4,May 27 11:16:35,completed,train,,FareModel
...de6901ba,May 27 11:15:14,completed,transform_dataset,,nyc-taxi-dataset-transformed
...f259deb8,May 27 11:14:39,completed,fetch_data,,nyc-taxi-datasetzones-dataset
