# Stream to Parquet

In [6]:
import nuclio

In [7]:
# Define function spec: the function kind is "nuclio" and it is built on top of
# the "mlrun/ml-models" base image.
%nuclio config kind = "nuclio"
%nuclio config spec.build.baseImage = "mlrun/ml-models"

%nuclio: setting kind to 'nuclio'
%nuclio: setting spec.build.baseImage to 'mlrun/ml-models'


In [8]:
# nuclio: start-code

In [9]:
import os
import pandas as pd
import numpy as np
import json
import datetime
import mlrun
from ast import literal_eval

In [60]:
def record_to_features(context, record):
    """Flatten one inference record into a row aligned with ``context.columns``.

    Args:
        context: object exposing ``columns``, the ordered list of tracked
            column names.
        record: dict with the feature mapping of the first instance under
            ``record['request']['instances'][0]`` and the event time under
            ``record['when']``.

    Returns:
        list: ``record['when']`` followed by the value of every tracked column,
        in ``context.columns`` order.

    Raises:
        KeyError: if the record lacks ``'request'``, ``'instances'``, ``'when'``
            or one of the tracked columns.
    """
    instance = record['request']['instances'][0]
    row = [record['when']]
    for name in context.columns:
        # Empty names are ignored; the 'timestamp' slot is already filled from 'when'.
        if not name or name == 'timestamp':
            continue
        row.append(instance[name])
    return row

In [61]:
def _parse_name_list(raw):
    """Parse a list-valued environment variable into a list of names.

    Accepts either a list literal (``'["a", "b"]'``) or a comma-separated
    string (``'a, b'``). Whitespace around names is stripped and empty
    entries are dropped in the comma-separated form.
    """
    raw = raw.strip()
    if raw.startswith('[') and raw.endswith(']'):
        return list(literal_eval(raw))
    return [name.strip() for name in raw.split(',') if name.strip()]


def init_context(context):
    """Populate ``context`` with the state and settings used by ``handler``.

    Everything is read from environment variables:

    * ``window`` (default 10): batch-size threshold used by ``handler``.
    * ``features``, ``predictions_col``, ``label_col``: tracked column names,
      each either a list literal or a comma-separated string.
    * ``save_to`` (default ``/bigdata/inference_pq/``): output directory for
      the parquet files; created if missing.
    * ``hub_url`` (optional): overrides ``mlrun.mlconf.hub_url``.
    * ``mount_path`` / ``mount_remote``: arguments of the v3io mount applied
      to the imported drift function.
    * ``base_dataset``, ``results_tsdb_container``, ``results_tsdb_table``:
      stored on the context and forwarded to the drift task by ``handler``.

    Attributes set on ``context``: ``batch``, ``window``, ``features``,
    ``columns``, ``save_to``, ``virtual_drift_fn``, ``base_dataset``,
    ``label_col``, ``results_tsdb_container``, ``results_tsdb_table``.
    ``features`` and ``label_col`` are parsed lists, or ``None`` when the
    corresponding variable is unset.
    """
    context.batch = []
    context.window = int(os.getenv('window', 10))

    context.logger.info('Setting columns for tracking')
    columns = []
    features = os.getenv('features', None)
    if features is not None:
        # Always keep the parsed list (not the raw env string) so that
        # context.features has the same type for both accepted formats.
        features = _parse_name_list(features)
        columns += features
    context.features = features

    predictions = os.getenv('predictions_col', None)
    if predictions is not None:
        predictions = _parse_name_list(predictions)
        columns += predictions

    label_col = os.getenv('label_col', None)
    if label_col is not None:
        label_col = _parse_name_list(label_col)
        columns += label_col
    # 'timestamp' always leads; record_to_features() fills it from record['when'].
    context.columns = ['timestamp'] + columns
    context.logger.info(f'Tracking {columns}')

    context.save_to = os.getenv('save_to', '/bigdata/inference_pq/')
    os.makedirs(context.save_to, exist_ok=True)

    context.logger.info('Setting drift function')
    mlrun.mlconf.dbpath = mlrun.mlconf.dbpath or 'http://mlrun-api:8080'
    if 'hub_url' in os.environ:
        mlrun.mlconf.hub_url = os.environ['hub_url']
    virtual_drift_fn = mlrun.import_function('hub://virtual_drift')
    virtual_drift_fn.apply(mlrun.mount_v3io(name='vfn_mount',
                                            mount_path=os.getenv('mount_path', '~/'),
                                            remote=os.getenv('mount_remote', '/User')))
    context.virtual_drift_fn = virtual_drift_fn
    context.base_dataset = os.getenv('base_dataset', '')

    context.label_col = label_col
    context.results_tsdb_container = os.getenv('results_tsdb_container', None)
    context.results_tsdb_table = os.getenv('results_tsdb_table', None)
    context.logger.info('Successfuly assigned all function parameters')

In [91]:
def handler(context, event):
    """Buffer one inference record and, once the batch is full, persist it and
    launch a drift-magnitude run on it.

    Args:
        context: context prepared by ``init_context`` (uses ``batch``,
            ``window``, ``columns``, ``save_to``, ``label_col``,
            ``results_tsdb_container``, ``results_tsdb_table``,
            ``base_dataset``, ``virtual_drift_fn`` and ``logger``).
        event: event whose ``body`` is either an already-decoded record or its
            JSON encoding as ``str``/``bytes``/``bytearray``.

    Returns:
        None.
    """
    context.logger.info(f'Adding {event.body}')
    event_body = event.body
    # json.loads accepts str, bytes and bytearray; decoding all of them keeps
    # a raw (non-decoded) body from reaching record_to_features, which indexes
    # the record as a dict.
    if isinstance(event_body, (str, bytes, bytearray)):
        event_body = json.loads(event_body)
    context.batch.append(record_to_features(context, event_body))

    # NOTE(review): the flush happens when the batch *exceeds* the window,
    # i.e. at window + 1 records — confirm this is the intended batch size.
    if len(context.batch) > context.window:
        context.logger.info(context.batch)
        df = pd.DataFrame(data=context.batch,
                          columns=context.columns)
        # File name is the current local time at one-second resolution.
        df_path = os.path.join(context.save_to, f"{datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S')}.pq")
        df.to_parquet(df_path)

        # Compare the saved batch ('u') against the base dataset ('t').
        task = mlrun.NewTask(name='drift_magnitude',
                        handler='drift_magnitude',
                        params={'label_col': context.label_col,
                                'results_tsdb_container': context.results_tsdb_container,
                                'results_tsdb_table': context.results_tsdb_table},
                        inputs={'t': context.base_dataset,
                                'u': df_path},
                        artifact_path=mlrun.mlconf.artifact_path)

        # watch=False: do not follow the run from inside the handler.
        context.virtual_drift_fn.run(task,
                                     watch=False)

        # Start a fresh batch only after the run was submitted.
        context.batch = []

In [68]:
# nuclio: end-code

In [50]:
%%nuclio env
features = ["cpu_utilization", "throughput", "packet_loss", "latency"]
label_col = is_error
predictions_col = prediction
save_to = /User/demo-network-operations/streaming/inference_pq/
base_dataset = /User/demo-network-operations/artifacts/test_set_preds.parquet
results_tsdb_container = users
results_tsdb_table = admin/demo_network_operations/streaming/drift_magnitude
mount_remote = /User
mount_path = /users/admin/

%nuclio: setting 'features' environment variable
%nuclio: setting 'label_col' environment variable
%nuclio: setting 'predictions_col' environment variable
%nuclio: setting 'save_to' environment variable
%nuclio: setting 'base_dataset' environment variable
%nuclio: setting 'results_tsdb_container' environment variable
%nuclio: setting 'results_tsdb_table' environment variable
%nuclio: setting 'mount_remote' environment variable
%nuclio: setting 'mount_path' environment variable


In [69]:
# Local test: initialise the context inside the notebook.
# NOTE(review): `json` is already imported in the function's import cell above,
# and `context` is not defined in any visible cell — presumably it is provided
# by the nuclio notebook extension; confirm on a fresh kernel.
import json
init_context(context)

Python> 2020-08-06 12:24:32,203 [info] Setting columns for tracking
Python> 2020-08-06 12:24:32,204 [info] Tracking ['cpu_utilization', 'throughput', 'packet_loss', 'latency', 'prediction', 'is_error']
Python> 2020-08-06 12:24:32,205 [info] Setting drift function
Python> 2020-08-06 12:24:32,356 [info] Successfuly assigned all function parameters


In [None]:
# Local smoke test: feed identical JSON-encoded records through the handler.
fv = {"request": {"instances": [{"cpu_utilization": 10, 'throughput': 100, 'latency': 5, 'packet_loss': 0, 'is_error': '0', 'prediction': 0}]}, "when": '1238429382', "resp": ['1']}
print(f'sending {fv}')
# handler() flushes once the batch holds more than `context.window` records,
# so window + 1 events are enough to trigger a flush from an empty batch.
# handler() returns None, so there is no result to capture or display.
for _ in range(context.window + 1):
    handler(context, nuclio.Event(body=json.dumps(fv)))

In [None]:
# Send the same sample record to the deployed function over HTTP.
# NOTE(review): the address is hard-coded and equals the one printed by
# fn.deploy() further down, so this cell only works after that deployment and
# must be updated whenever the function address changes.
import requests
requests.post(url='http://3.128.191.176:31589', json=fv)

## Save to function yaml

In [92]:
from mlrun import mlconf, code_to_function, mount_v3io
# Keep an already-configured mlrun DB path; otherwise fall back to the
# 'http://mlrun-api:8080' address.
mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'

In [93]:
# create the function object from the notebook code (the displayed repr below
# shows it is a RemoteRuntime, i.e. a Nuclio function rather than a job)
fn = code_to_function("stream_to_parquet")

# add metadata (for templates and reuse)
fn.spec.default_handler = "handler"
fn.spec.description = "Saves a stream to Parquet and can launch drift detection task on it"
fn.metadata.categories = ["ml", "serve"]
fn.metadata.labels = {"author": "orz"}
# write the function spec to function.yaml in the working directory
fn.export("function.yaml")

> 2020-08-06 12:49:27,601 [info] function spec saved to path: function.yaml


<mlrun.runtimes.function.RemoteRuntime at 0x7f99da8cdf50>

In [94]:
# Attach a V3IO stream trigger named 'labeled_stream' so the function is fed
# from the labels stream.
# NOTE(review): the '@t142' suffix of the URL is presumably the consumer group
# name — confirm against the V3IOStreamTrigger documentation.
fn.add_trigger('labeled_stream', nuclio.triggers.V3IOStreamTrigger(url='http://v3io-webapi:8081/users/admin/demo-network-operations/streaming/labels_stream_v1@t142'))

<mlrun.runtimes.function.RemoteRuntime at 0x7f99da8cdf50>

In [95]:
# Runtime configuration of the deployed function. init_context() reads these
# values with os.getenv(), so every key must match the name it looks up
# (in particular 'predictions_col', not 'predictions').
# NOTE(review): 'window' and 'features' are passed as non-string values;
# presumably set_envs() stringifies them (init_context() parses a '[...]'
# list literal for 'features') — confirm.
fn.set_envs({'window': 10,
             'features': ["cpu_utilization", "throughput", "packet_loss", "latency"],
             'predictions_col': 'prediction',
             'label_col': 'is_error',
             'save_to': '/User/demo-network-operations/streaming/inference_pq/',
             'base_dataset': '/User/demo-network-operations/artifacts/test_set_preds.parquet',
             'results_tsdb_container': 'users',
             'results_tsdb_table': 'admin/demo_network_operations/streaming/drift_magnitude',
             'mount_remote': '/User',
             'mount_path': '/users/admin/'})

<mlrun.runtimes.function.RemoteRuntime at 0x7f99da8cdf50>

In [96]:
# Apply the v3io mount modifier with its default arguments to the function
# (presumably so the '/User/...' paths configured above resolve inside the
# deployed container — confirm the default mount target).
fn.apply(mount_v3io())

<mlrun.runtimes.function.RemoteRuntime at 0x7f99da8cdf50>

In [97]:
# Build and deploy the function under the 'network-operations' project; the
# call returns the address of the deployed function (shown in the output below).
fn.deploy(project='network-operations')

> 2020-08-06 12:49:28,936 [info] deploy started
[nuclio] 2020-08-06 12:49:33,034 (info) Build complete
[nuclio] 2020-08-06 12:49:41,106 (info) Function deploy complete
[nuclio] 2020-08-06 12:49:41,112 done updating network-operations-stream-to-parquet, function address: 3.128.191.176:31589


'http://3.128.191.176:31589'

In [88]:
# Inspect one saved batch. The file name is the timestamp generated by
# handler() at flush time, so this hard-coded path only exists for that
# specific run — replace it with a file from the current run when re-executing.
pd.read_parquet('/User/demo-network-operations/streaming/inference_pq/2020-08-06T12:24:32.pq')

Unnamed: 0,timestamp,cpu_utilization,throughput,packet_loss,latency,prediction,is_error
0,1238429382,10,100,0,5,0,0
1,1238429382,10,100,0,5,0,0
2,1238429382,10,100,0,5,0,0
3,1238429382,10,100,0,5,0,0
4,1238429382,10,100,0,5,0,0
5,1238429382,10,100,0,5,0,0
6,1238429382,10,100,0,5,0,0
7,1238429382,10,100,0,5,0,0
8,1238429382,10,100,0,5,0,0
9,1238429382,10,100,0,5,0,0
