# Stream to Parquet

In [2]:
import nuclio

In [3]:
# Define function spec
# Both settings are applied through the `%nuclio config` line magic; the cell
# output echoes each value as it is set.
# Function kind used when this notebook is converted to a function.
%nuclio config kind = "nuclio"
# Base image for the function build.
%nuclio config spec.build.baseImage = "mlrun/ml-models"

%nuclio: setting kind to 'nuclio'
%nuclio: setting spec.build.baseImage to 'mlrun/ml-models:0.4.7'


In [6]:
# nuclio: start-code

In [6]:
import os
import pandas as pd
import numpy as np
import json
import datetime
import mlrun

In [6]:
def record_to_features(record):
    """Flatten one inference record into a single flat row.

    The row is laid out as: the record's 'when' value, then every item of the
    first entry in record['request']['instances'], then every item of
    record['resp'].

    :param record: mapping with 'request' (holding an 'instances' sequence),
                   'when' and 'resp' keys. 'resp' must be a list because it is
                   list-concatenated onto the row.
    :returns: a new list: [when, *first_instance, *resp]
    :raises KeyError: if one of the expected keys is missing.
    :raises TypeError: if 'resp' is not a list.
    """
    first_instance = record['request']['instances'][0]
    when = record['when']
    response = record['resp']

    row = [when]
    row.extend(first_instance)
    return row + response

In [17]:
def init_context(context):
    """Populate the function context from environment variables.

    Environment variables read (default in parentheses):
      window (10)                       cast to int, stored as context.window
      features / predictions / label_col (unset)
                                        comma-separated names; each is split on
                                        ',' and appended, in that order, after
                                        'timestamp' to build context.columns
      save_to ('/bigdata/inference_pq/') output directory, created if missing
      hub_url (unset)                   when present, assigned to mlrun.mlconf.hub_url
      mount_path ('~/'), mount_remote ('/User')
                                        forwarded to mlrun.mount_v3io
      base_dataset ('')                 stored as context.base_dataset
      results_tsdb_container, results_tsdb_table (None)

    It also starts context.batch as an empty list, imports the
    'hub://virtual_drift' function, applies the v3io mount to it and keeps it
    on context.virtual_drift_fn.

    :param context: the object that receives the attributes listed above.
    """
    def _split_env(name):
        # Comma-separated env value -> list of names; None when the variable is unset.
        raw = os.getenv(name, None)
        return None if raw is None else raw.split(',')

    context.batch = []
    context.window = int(os.getenv('window', 10))

    feature_names = _split_env('features')
    context.features = feature_names
    prediction_names = _split_env('predictions')
    label_col = _split_env('label_col')

    # Column order: timestamp, then features, predictions and labels (unset groups are skipped).
    all_columns = ['timestamp']
    for group in (feature_names, prediction_names, label_col):
        if group is not None:
            all_columns.extend(group)
    context.columns = all_columns

    context.save_to = os.getenv('save_to', '/bigdata/inference_pq/')
    os.makedirs(context.save_to, exist_ok=True)

    # Keep an already-configured API address, otherwise use the fallback URL.
    mlrun.mlconf.dbpath = mlrun.mlconf.dbpath or 'http://mlrun-api:8080'
    hub_url = os.environ.get('hub_url')
    if hub_url is not None:
        mlrun.mlconf.hub_url = hub_url

    drift_fn = mlrun.import_function('hub://virtual_drift')
    v3io_mount = mlrun.mount_v3io(name='vfn_mount',
                                  mount_path=os.getenv('mount_path', '~/'),
                                  remote=os.getenv('mount_remote', '/User'))
    drift_fn.apply(v3io_mount)
    context.virtual_drift_fn = drift_fn
    context.base_dataset = os.getenv('base_dataset', '')

    context.label_col = label_col
    context.results_tsdb_container = os.getenv('results_tsdb_container', None)
    context.results_tsdb_table = os.getenv('results_tsdb_table', None)

In [12]:
def handler(context, event):
    """Buffer one inference record; flush the buffer to Parquet when it is full.

    The event body is parsed as JSON, flattened with record_to_features and
    appended to context.batch. Once the batch holds context.window rows it is
    written to a timestamp-named ``.pq`` file under context.save_to, a
    'drift_magnitude' task comparing context.base_dataset (input 't') with the
    new file (input 'u') is submitted on context.virtual_drift_fn without
    waiting for it, and the batch is cleared.

    :param context: function context prepared by init_context.
    :param event:   incoming event; event.body must be a JSON document
                    accepted by record_to_features.
    :returns: None
    """
    context.logger.info(f'Adding {event.body}')
    context.batch.append(record_to_features(json.loads(event.body)))

    # Flush as soon as the buffer holds `window` rows. `>=` (rather than `>`)
    # keeps each file at exactly `window` rows instead of `window + 1`.
    if len(context.batch) >= context.window:
        context.logger.info(context.batch)
        df = pd.DataFrame(data=context.batch,
                          columns=context.columns)
        # File name is the current local time at one-second resolution.
        df_path = os.path.join(context.save_to, f"{datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S')}.pq")
        df.to_parquet(df_path)

        task = mlrun.NewTask(name='drift_magnitude',
                             handler='drift_magnitude',
                             params={'label_col': context.label_col,
                                     'results_tsdb_container': context.results_tsdb_container,
                                     'results_tsdb_table': context.results_tsdb_table},
                             inputs={'t': context.base_dataset,
                                     'u': df_path},
                             artifact_path=mlrun.mlconf.artifact_path)

        # watch=False: submit the drift task and return without following it.
        context.virtual_drift_fn.run(task,
                                     watch=False)

        # Start a fresh batch only after the file is written and the task submitted.
        context.batch = []

In [10]:
# nuclio: end-code

## Save to function yaml

In [20]:
# Notebook-side helpers used below to build, mount and deploy the function.
from mlrun import mlconf, code_to_function, mount_v3io
# Keep an already-configured API address; otherwise fall back to 'http://mlrun-api:8080'.
mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'

In [21]:
# create the function object from the notebook code
# (its kind comes from the `%nuclio config kind` setting at the top of the notebook)
fn = code_to_function("stream_to_parquet")

# add metadata (for templates and reuse)
fn.spec.default_handler = "handler"
fn.spec.description = "Saves a stream to Parquet and can launch drift detection task on it"
fn.metadata.categories = ["ml", "serve"]
fn.metadata.labels = {"author": "orz"}
# write the function spec to function.yaml; export() is the cell's last
# expression, so its return value is what the notebook displays
fn.export("function.yaml")

> 2020-07-23 08:30:40,000 [info] function spec saved to path: function.yaml


<mlrun.runtimes.function.RemoteRuntime at 0x7f27bc7d3780>

In [30]:
# Attach a V3IO stream trigger named 'labeled_stream' that points at the
# network-operations inference stream.
# NOTE(review): the '@vd' URL suffix presumably selects a consumer group — confirm.
fn.add_trigger('labeled_stream', nuclio.triggers.V3IOStreamTrigger(url='http://v3io-webapi:8081/bigdata/network-operations/inference_stream@vd'))

<mlrun.runtimes.function.RemoteRuntime at 0x7f42faa36588>

In [None]:
# Runtime configuration; init_context reads each of these keys with os.getenv.
# 'features', 'predictions' and 'label_col' are comma-separated column names.
# Path values must be exact: a stray trailing ',' or '}' would make
# 'base_dataset' / 'mount_path' point at locations that do not exist.
fn.set_envs({'window': 10,
             'features': 'a,b',
             'predictions': 'prediction',
             'label_col': 'is_error',
             'save_to': '/bigdata/inference_pq/',
             'base_dataset': '/bigdata/demo-network-operations/data/selected_features.parquet',
             'results_tsdb_container': 'users',
             'results_tsdb_table': 'admin/demo_network_operations/streaming/drift_magnitude',
             'mount_remote': '/bigdata',
             'mount_path': '/bigdata'})

In [31]:
# Mount remote '/bigdata' at '/bigdata' inside the function, which is where the
# default 'save_to' directory ('/bigdata/inference_pq/') lives.
fn.apply(mount_v3io('bigdata', remote='/bigdata', mount_path='/bigdata'))

<mlrun.runtimes.function.RemoteRuntime at 0x7f42faa36588>

In [32]:
# Apply a second mount using mount_v3io's default arguments.
# NOTE(review): the default name/remote/mount path are not visible here — confirm
# they do not clash with the 'bigdata' mount applied in the previous cell.
fn.apply(mount_v3io())

<mlrun.runtimes.function.RemoteRuntime at 0x7f42faa36588>

In [33]:
# Build and deploy the function under the 'network-operations' project.
# deploy() is the cell's last expression, so its return value (the function
# address in the recorded output) is displayed.
fn.deploy(project='network-operations')

[mlrun] 2020-06-02 14:06:56,852 deploy started
[nuclio] 2020-06-02 14:07:16,097 (info) Build complete
[nuclio] 2020-06-02 14:07:40,163 (info) Function deploy complete
[nuclio] 2020-06-02 14:07:40,191 done updating network-operations-stream-to-parquet, function address: 192.168.224.209:31872


'http://192.168.224.209:31872'