# Stream to Parquet

In [1]:
import nuclio

In [39]:
# Environment variables for the deployed function; init_context reads them with os.getenv:
#   window       - threshold on the number of buffered records before a Parquet file is written
#   features     - comma-separated feature column names
#   predictions  - comma-separated prediction column names
#   save_to      - directory the Parquet files are written to
#   base_dataset - stored on the context as `base_dataset`
%nuclio env window = 10
%nuclio env features = a,b
%nuclio env predictions = prediction
%nuclio env save_to = /bigdata/inference_pq/
%nuclio env base_dataset = /bigdata/migdal/data/selected_features.parquet

%nuclio: setting 'window' environment variable
%nuclio: setting 'features' environment variable
%nuclio: setting 'predictions' environment variable
%nuclio: setting 'save_to' environment variable


In [63]:
# Define function spec
%nuclio config kind = "nuclio"
%nuclio config spec.build.baseImage = "mlrun/ml-models:0.4.7"

%nuclio: setting kind to 'nuclio'
%nuclio: setting spec.image to 'mlrun/ml-models:0.4.6'
mounting volume path /bigdata as /bigdata


In [58]:
%%nuclio config
spec.triggers.labeled_stream.kind = "v3ioStream"
spec.triggers.labeled_stream.url = "http://v3io-webapi:8081/bigdata/network-operations/inference_stream@s2p"
spec.triggers.labeled_stream.attributes.partitions = [0]
spec.triggers.labeled_stream.username = "admin"
spec.triggers.labeled_stream.password = "24tango"
spec.triggers.labeled_stream.attributes.pollingIntervalMs = 500
spec.triggers.labeled_stream.attributes.seekTo = "earliest"
spec.triggers.labeled_stream.attributes.readBatchSize = 64
spec.triggers.labeled_stream.maxWorkers = 1

%nuclio: setting spec.triggers.labeled_stream.kind to 'v3ioStream'
%nuclio: setting spec.triggers.labeled_stream.url to 'http://v3io-webapi:8081/bigdata/migdal/inference_stream_1@group_2'
%nuclio: setting spec.triggers.labeled_stream.attributes.partitions to [1]
%nuclio: setting spec.triggers.labeled_stream.username to 'admin'
%nuclio: setting spec.triggers.labeled_stream.password to '24tango'
%nuclio: setting spec.triggers.labeled_stream.attributes.pollingIntervalMs to 500
%nuclio: setting spec.triggers.labeled_stream.attributes.seekTo to 'earliest'
%nuclio: setting spec.triggers.labeled_stream.attributes.readBatchSize to 64
%nuclio: setting spec.triggers.labeled_stream.maxWorkers to 1


In [6]:
# nuclio: start-code

In [81]:
import os
import pandas as pd
import numpy as np
import json
import datetime
import mlrun

In [71]:
def record_to_features(record):
    """Flatten one inference record into a single flat row.

    The row is laid out as ``[when, <features of the first instance>, <resp items>]``.
    Only the first entry of ``record['request']['instances']`` is read, and
    ``record['resp']`` is concatenated as a list.

    :param record: dict with 'request' -> 'instances', 'when' and 'resp' keys
    :returns:      list starting with the timestamp, then features, then predictions
    """
    first_instance = record['request']['instances'][0]
    when = record['when']
    response = record['resp']

    row = [when]
    row.extend(first_instance)
    return row + response

In [5]:
def _env_list(name):
    """Read a comma-separated environment variable as a list of non-empty, stripped names."""
    return [item.strip() for item in os.getenv(name, '').split(',') if item.strip()]


def init_context(context):
    """Prepare the per-worker state used by ``handler``.

    Reads the ``window``, ``features``, ``predictions``, ``save_to`` and
    ``base_dataset`` environment variables and stores on ``context``:

    * ``batch``            - empty list that buffers incoming rows
    * ``window``           - integer batch threshold (default 10)
    * ``features``         - list of feature column names (empty when unset)
    * ``columns``          - ``['timestamp'] + features + predictions``
    * ``save_to``          - output directory, created if missing
    * ``virtual_drift_fn`` - the imported ``hub://virtual_drift`` function with
      the ``/bigdata`` v3io mount applied
    * ``base_dataset``     - value of the ``base_dataset`` variable ('' when unset)

    :param context: nuclio function context
    """
    context.batch = []
    context.window = int(os.getenv('window', 10))

    # Unset or empty variables yield empty lists, so the column list can always
    # be built (concatenating None here used to raise TypeError).
    features = _env_list('features')
    predictions = _env_list('predictions')
    context.features = features
    context.columns = ['timestamp'] + features + predictions

    context.save_to = os.getenv('save_to', '/bigdata/inference_pq/')
    os.makedirs(context.save_to, exist_ok=True)

    # Resolve hub:// function URLs against the local functions directory
    mlrun.mlconf.hub_url = '/User/functions/{name}/function.yaml'
    virtual_drift_fn = mlrun.import_function('hub://virtual_drift')
    virtual_drift_fn.apply(mlrun.mount_v3io(name='bigdata', mount_path='/bigdata', remote='/bigdata'))
    context.virtual_drift_fn = virtual_drift_fn
    context.base_dataset = os.getenv('base_dataset', '')

In [6]:
def handler(context, event):
    """Buffer one stream record and flush the buffer to Parquet when it is full.

    Each event body is parsed as JSON, flattened with ``record_to_features`` and
    appended to ``context.batch``. Once the buffer holds ``context.window`` rows
    it is written to a timestamped ``.pq`` file under ``context.save_to``, a
    ``drift_magnitude`` task comparing that file with the base dataset is
    submitted through ``context.virtual_drift_fn`` (without waiting for it), and
    the buffer is reset.

    :param context: nuclio function context prepared by ``init_context``
    :param event:   nuclio event whose ``body`` is a JSON-encoded inference record
    """
    context.logger.info(f'Adding {event.body}')
    context.batch.append(record_to_features(json.loads(event.body)))

    # Flush as soon as `window` rows are buffered; a strict ">" would hold
    # window + 1 rows per file.
    if len(context.batch) >= context.window:
        context.logger.info(context.batch)
        df = pd.DataFrame(data=context.batch,
                          columns=context.columns)

        # Microsecond resolution keeps two flushes within the same second from
        # overwriting a file the previous (unwatched) drift run may still read.
        file_name = f"{datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S.%f')}.pq"
        df_path = os.path.join(context.save_to, file_name)
        df.to_parquet(df_path)

        # Prefer the configured base dataset; fall back to the previous
        # hard-coded location when it is unset or empty.
        base_dataset = (getattr(context, 'base_dataset', '')
                        or '/bigdata/migdal/data/selected_features.parquet')

        task = mlrun.NewTask(name='drift_magnitude',
                             handler='drift_magnitude',
                             params={'label_col': 'is_error',
                                     'results_tsdb_container': 'bigdata',
                                     'results_tsdb_table': 'drift_magnitude'},
                             inputs={'t': base_dataset,
                                     'u': df_path},
                             artifact_path=os.path.abspath('/bigdata/data'))

        context.virtual_drift_fn.run(task,
                                     watch=False)

        context.batch = []

In [10]:
# nuclio: end-code

## Testing

In [23]:
# Sample inference-stream record for exercising record_to_features by hand.
# That function reads only the 'request', 'resp' and 'when' keys.
a = {"request": {"instances": [[0.33849388495276456, 271.0561056056432]]},
     "resp": [False], 
     "class": "ClassifierModel", 
     "model": "predictor", 
     "host": "migdal-server-568f7c9b4b-nfjdt", 
     "when": "2020-04-20 13:06:46.992705", 
     "microsec": 7661}
a

{'instances': [[0.33849388495276456, 271.0561056056432]],
 'resp': [False],
 'class': 'ClassifierModel',
 'model': 'predictor',
 'host': 'migdal-server-568f7c9b4b-nfjdt',
 'when': '2020-04-20 13:06:46.992705',
 'microsec': 7661}

In [36]:
record_to_features(a)

['2020-04-20 13:06:46.992705', 0.33849388495276456, 271.0561056056432, False]

In [None]:
record_to_features

## Deploy

In [13]:
%nuclio deploy -p network-operations -n s2p_v2

[nuclio] 2020-05-21 05:40:42,533 (error) Failed to deploy. Details:
Failed to create/update function


%nuclio: error: cannot deploy 


In [12]:
%nuclio show

%nuclio: notebook stream-to-parquet exported
Config:
apiVersion: nuclio.io/v1
kind: nuclio
metadata:
  annotations:
    nuclio.io/generated_by: function generated from 21-05-2020 by admin
  labels: {}
  name: stream-to-parquet
spec:
  build:
    baseImage: mlrun/ml-models:0.4.7
    commands: []
    functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlcgoKaW1wb3J0IG9zCmltcG9ydCBwYW5kYXMgYXMgcGQKaW1wb3J0IG51bXB5IGFzIG5wCmltcG9ydCBqc29uCmltcG9ydCBkYXRldGltZQppbXBvcnQgbWxydW4KCmRlZiByZWNvcmRfdG9fZmVhdHVyZXMocmVjb3JkKToKICAgIGZlYXR1cmVzID0gcmVjb3JkWydyZXF1ZXN0J11bJ2luc3RhbmNlcyddWzBdCiAgICB0aW1lc3RhbXAgPSByZWNvcmRbJ3doZW4nXQogICAgcHJlZGljdGlvbiA9IHJlY29yZFsncmVzcCddCiAgICAKICAgIHJldHVybiBbdGltZXN0YW1wXSArIFtmZWF0dXJlIGZvciBmZWF0dXJlIGluIGZlYXR1cmVzXSArIHByZWRpY3Rpb24KCmRlZiBpbml0X2NvbnRleHQoY29udGV4dCk6CiAgICBzZXRhdHRyKGNvbnRleHQsICdiYXRjaCcsIFtdKQogICAgc2V0YXR0cihjb250ZXh0LCAnd2luZG93JywgaW50KG9zLmdldGVudignd2luZG93JywgMTApKSkKICAgIAogICAgZmVhdHVyZXMgPSBvcy5nZXRlbnYo

## Save to function yaml

In [1]:
from mlrun import mlconf, code_to_function
# Keep an already-configured MLRun DB path; otherwise fall back to 'http://mlrun-api:8080'.
mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'

In [7]:
# Build an MLRun function object from this notebook's code
fn = code_to_function("stream_to_parquet")

# Add metadata (for templates and reuse), then write the spec next to the notebook
fn.spec.default_handler = "handler"
fn.spec.description = "Saves a stream to Parquet and can launch a drift detection task on it"
fn.metadata.categories = ["ml", "serve"]
fn.metadata.labels = {"author": "orz"}
fn.export("function.yaml")

[mlrun] 2020-05-20 14:03:16,199 function spec saved to path: function.yaml


<mlrun.runtimes.function.RemoteRuntime at 0x7fe59807c0b8>