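# Stream to Parquet

This notebook defines a nuclio function that reads model-inference events from a V3IO stream, buffers them, writes each full buffer to a Parquet file, and then starts a `drift_magnitude` run that compares the new file with the base dataset.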

In [1]:
import nuclio

In [39]:
# Runtime settings for the function; init_context reads each one with os.getenv:
#   window       - converted to int and compared with the buffer length in handler
#   features     - comma-separated feature column names
#   predictions  - comma-separated prediction column names
#   save_to      - directory the Parquet files are written to (created if missing)
#   base_dataset - path stored on the context as `base_dataset`
%nuclio env window = 10
%nuclio env features = a,b
%nuclio env predictions = prediction
%nuclio env save_to = /bigdata/inference_pq/
%nuclio env base_dataset = /bigdata/migdal/data/selected_features.parquet

%nuclio: setting 'window' environment variable
%nuclio: setting 'features' environment variable
%nuclio: setting 'predictions' environment variable
%nuclio: setting 'save_to' environment variable


In [63]:
# Define function spec: a function of kind "nuclio" built on the mlrun/ml-models image
%nuclio config kind = "nuclio"
%nuclio config spec.image = "mlrun/ml-models:0.4.6"

# Add V3IO Mount
# NOTE(review): `%v3io` presumably copies the V3IO connection settings into the
# function's environment - confirm against the nuclio-jupyter documentation.
%nuclio env %v3io
# %nuclio mount /User ~/
# Expose the /bigdata volume inside the function at /bigdata
%nuclio mount /bigdata /bigdata/

%nuclio: setting kind to 'nuclio'
%nuclio: setting spec.image to 'mlrun/ml-models:0.4.6'
mounting volume path /bigdata as /bigdata


In [58]:
%%nuclio config
spec.triggers.labeled_stream.kind = "v3ioStream"
spec.triggers.labeled_stream.url = "http://v3io-webapi:8081/bigdata/migdal/inference_stream_1@group_2"
spec.triggers.labeled_stream.attributes.partitions = [1]
spec.triggers.labeled_stream.username = "admin"
spec.triggers.labeled_stream.password = "24tango"
spec.triggers.labeled_stream.attributes.pollingIntervalMs = 500
spec.triggers.labeled_stream.attributes.seekTo = "earliest"
spec.triggers.labeled_stream.attributes.readBatchSize = 64
spec.triggers.labeled_stream.maxWorkers = 1

%nuclio: setting spec.triggers.labeled_stream.kind to 'v3ioStream'
%nuclio: setting spec.triggers.labeled_stream.url to 'http://v3io-webapi:8081/bigdata/migdal/inference_stream_1@group_2'
%nuclio: setting spec.triggers.labeled_stream.attributes.partitions to [1]
%nuclio: setting spec.triggers.labeled_stream.username to 'admin'
%nuclio: setting spec.triggers.labeled_stream.password to '24tango'
%nuclio: setting spec.triggers.labeled_stream.attributes.pollingIntervalMs to 500
%nuclio: setting spec.triggers.labeled_stream.attributes.seekTo to 'earliest'
%nuclio: setting spec.triggers.labeled_stream.attributes.readBatchSize to 64
%nuclio: setting spec.triggers.labeled_stream.maxWorkers to 1


In [54]:
%%nuclio cmd -c
python -m pip install numpy
python -m pip install pandas
python -m pip install pyarrow
python -m pip install mlrun

In [5]:
from mlrun import code_to_function, mount_v3io, run_local

In [6]:
# nuclio: start-code

In [81]:
import os
import pandas as pd
import numpy as np
import json
import datetime
import mlrun

In [71]:
def record_to_features(record):
    """Flatten one inference record into a single row.

    The row is laid out as ``[when, *values of the first instance, *resp]``.
    Only the first entry of ``record['request']['instances']`` is used, and
    ``record['resp']`` must be a list because it is concatenated onto the row.
    """
    first_instance = record['request']['instances'][0]
    row = [record['when']]
    row.extend(first_instance)
    return row + record['resp']

In [69]:
def _env_list(name):
    """Return the comma-separated environment variable ``name`` as a list.

    Items are stripped of surrounding whitespace and empty items are dropped.
    An unset or empty variable yields an empty list.
    """
    raw = os.getenv(name)
    if not raw:
        return []
    return [item.strip() for item in raw.split(',') if item.strip()]


def init_context(context):
    """Attach the state used by ``handler`` to the nuclio ``context``.

    Sets the following attributes, reading their values from environment
    variables (defaults in parentheses):

    * ``batch`` - empty list that buffers incoming rows.
    * ``window`` - ``int`` from ``window`` (10).
    * ``features`` - list of names from ``features`` (empty list).
    * ``columns`` - ``['timestamp'] + features + predictions``.
    * ``save_to`` - output directory from ``save_to``
      (``/bigdata/inference_pq/``); created if it does not exist.
    * ``virtual_drift_fn`` - function imported from
      ``/bigdata/functions/virtual_drift.yaml`` with ``/bigdata`` mounted.
    * ``base_dataset`` - path from ``base_dataset``
      (``/bigdata/migdal/data/selected_features.parquet``).
    """
    context.batch = []
    context.window = int(os.getenv('window', 10))

    # Unset variables become empty lists, so building `columns` below cannot
    # fail with "can only concatenate list (not NoneType) to list".
    features = _env_list('features')
    predictions = _env_list('predictions')
    context.features = features
    context.columns = ['timestamp'] + features + predictions

    context.save_to = os.getenv('save_to', '/bigdata/inference_pq/')
    os.makedirs(context.save_to, exist_ok=True)

    virtual_drift_fn = mlrun.import_function('/bigdata/functions/virtual_drift.yaml')
    virtual_drift_fn.apply(mlrun.mount_v3io(name='bigdata', mount_path='/bigdata', remote='/bigdata'))
    context.virtual_drift_fn = virtual_drift_fn
    context.base_dataset = os.getenv('base_dataset', '/bigdata/migdal/data/selected_features.parquet')

In [80]:
def handler(context, event):
    """Buffer one inference event and flush the buffer once it is full.

    The JSON body of ``event`` is flattened with ``record_to_features`` and
    appended to ``context.batch``. When the buffer holds ``context.window``
    rows it is written to a Parquet file under ``context.save_to``, a
    ``drift_magnitude`` run is started on ``context.virtual_drift_fn`` with
    the base dataset as input ``t`` and the new file as input ``u``, and the
    buffer is reset. Returns ``None``.
    """
    context.logger.info(f'Adding {event.body}')
    context.batch.append(record_to_features(json.loads(event.body)))

    # Flush at exactly `window` rows; a strict `>` comparison would only
    # flush once the buffer had grown to window + 1 rows.
    if len(context.batch) < context.window:
        return

    context.logger.info(context.batch)
    df = pd.DataFrame(data=context.batch,
                      columns=context.columns)

    # Microsecond resolution in the file name keeps two flushes that happen
    # within the same second from overwriting each other.
    file_name = f"{datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S.%f')}.pq"
    df_path = os.path.join(context.save_to, file_name)
    df.to_parquet(df_path)

    # Use the base dataset configured in init_context; fall back to the
    # previous hardcoded path if the attribute is missing.
    base_dataset = getattr(context, 'base_dataset',
                           '/bigdata/migdal/data/selected_features.parquet')

    task = mlrun.NewTask(name='drift_magnitude',
                         handler='drift_magnitude',
                         params={'label_col': 'is_error',
                                 'results_tsdb_container': 'bigdata',
                                 'results_tsdb_table': 'drift_magnitude'},
                         inputs={'t': base_dataset,
                                 'u': df_path},
                         artifact_path=os.path.abspath('/bigdata/data'))

    # watch=False is passed so the run is started without watching it here.
    context.virtual_drift_fn.run(task,
                                 watch=False)

    context.batch = []

In [10]:
# nuclio: end-code

## Testing

In [23]:
# Sample inference event for exercising record_to_features.
# It carries the keys that function reads: request.instances, resp and when.
a = {"request": {"instances": [[0.33849388495276456, 271.0561056056432]]},
     "resp": [False], 
     "class": "ClassifierModel", 
     "model": "predictor", 
     "host": "migdal-server-568f7c9b4b-nfjdt", 
     "when": "2020-04-20 13:06:46.992705", 
     "microsec": 7661}
a

{'instances': [[0.33849388495276456, 271.0561056056432]],
 'resp': [False],
 'class': 'ClassifierModel',
 'model': 'predictor',
 'host': 'migdal-server-568f7c9b4b-nfjdt',
 'when': '2020-04-20 13:06:46.992705',
 'microsec': 7661}

In [36]:
# The flattened row must be: timestamp, the first instance's values, then the predictions.
features_row = record_to_features(a)
assert features_row == ['2020-04-20 13:06:46.992705', 0.33849388495276456, 271.0561056056432, False]
features_row

['2020-04-20 13:06:46.992705', 0.33849388495276456, 271.0561056056432, False]

In [None]:
record_to_features

## Deploy

In [None]:
# Deploy the function defined between the `# nuclio: start-code` / `# nuclio: end-code` markers.
# NOTE(review): presumably -p selects the project ("drift") and -n the function
# name ("s2p_v2") - confirm against the nuclio-jupyter documentation.
%nuclio deploy -p drift -n s2p_v2