# Concept Drift Streaming

In [1]:
import nuclio

In [2]:
from pprint import pprint

In [3]:
%%nuclio cmd -c
python -m pip install scikit-multiflow==0.4.1
python -m pip install v3io_frames

In [4]:
# Define function spec
%nuclio config kind = "nuclio"
%nuclio config spec.build.baseImage = "mlrun/ml-models"

# Add V3IO Mount
# %nuclio env %v3io

%nuclio: setting kind to 'nuclio'
%nuclio: setting spec.build.baseImage to 'mlrun/ml-models'


In [5]:
# nuclio: ignore
# Notebook-side function configuration (this cell is excluded from the
# deployed code by the `# nuclio: ignore` marker above).
env = {'label_col': 'resp',
       'prediction_col': 'prediction',
       'drift_stream': '/bigdata/network-operations/drift_stream',
       'tsdb_table': 'network-operations/drift_tsdb',
       'pagehinkley_threshold': 10,
       'models': ['pagehinkley', 'ddm', 'eddm'],
       'window_size': 10}

# Kept under its own name so `config` is only ever the ConfigSpec object.
config_settings = {'kind': 'nuclio',
                   'spec.build.baseImage': 'mlrun/ml-models'}

# Same scikit-multiflow pin as the `%%nuclio cmd` cell so both build paths
# install the same version.
cmd = ['python -m pip install scikit-multiflow==0.4.1',
       'python -m pip install v3io_frames']

# The flag is passed inline instead of through a global called `v3io`:
# that name is also the `v3io` package used by init_context(), and
# re-running this cell after the import cell would replace the module
# with a bool.
config = nuclio.ConfigSpec(env=env,
                           config=config_settings,
                           cmd=cmd,
                           v3io=True)

In [7]:
# nuclio: start-code

In [6]:
import skmultiflow.drift_detection
import numpy as np
import pandas as pd
import os
import json
import v3io.dataplane
import v3io_frames as v3f
import requests
from cloudpickle import load

# For testing
import random

In [11]:
def split_path(mntpath=''):
    """Split a mount-style path into ``(container, subpath)``.

    A single leading ``/`` is ignored. The container is the first path
    component; the subpath is everything after it, including its leading
    ``/``, or ``''`` when there is nothing after the container.

    Examples:
        ``'/bigdata/a/b'`` -> ``('bigdata', '/a/b')``
        ``'bigdata'``      -> ``('bigdata', '')``
        ``''``             -> ``('', '')``

    :param mntpath: path of the form ``[/]container[/sub/path]``
    :return: tuple ``(container, subpath)``
    """
    # startswith() is safe on the empty string (the default argument),
    # where indexing mntpath[0] would raise IndexError.
    trimmed = mntpath[1:] if mntpath.startswith('/') else mntpath
    container, separator, remainder = trimmed.partition('/')
    return container, separator + remainder


def create_stream(context, path, shards=1):
    """Create a stream at ``path`` through ``context.v3io_client``.

    :param context: function context exposing ``logger`` and ``v3io_client``
    :param path:    stream path of the form ``/container/sub/path``
    :param shards:  shard count requested for the stream (default 1)

    The client call is made with ``RaiseForStatus.never`` and the status is
    then checked explicitly, passing 409 and 204 as the accepted codes
    (presumably "already exists" and "created" - confirm against the v3io
    client documentation).
    """
    container, stream_path = split_path(path)
    context.logger.info(f'Creating stream in Container: {container} & Path {stream_path}')

    request_args = dict(container=container,
                        path=stream_path,
                        shard_count=shards,
                        raise_for_status=v3io.dataplane.RaiseForStatus.never)
    response = context.v3io_client.create_stream(**request_args)

    accepted_statuses = [409, 204]
    response.raise_for_status(accepted_statuses)
    
    
def push_to_stream(context, stream_path, data):
    """Write each item of ``data`` to the stream as one JSON record.

    :param context:     function context exposing ``v3io_client``
    :param stream_path: stream path of the form ``/container/sub/path``
    :param data:        iterable of JSON-serialisable objects
    """
    # Serialise first, then resolve the target, then send.
    records = []
    for item in data:
        records.append({'data': json.dumps(item)})

    container, stream_path = split_path(stream_path)
    context.v3io_client.put_records(container=container,
                                    path=stream_path,
                                    records=records)


def construct_record(record):
    """Build the flat record used for drift detection from a raw event.

    The input must contain the keys ``when``, ``class``, ``model``, ``resp``
    and ``request``; any other keys are ignored and the input dict itself is
    not modified.

    :param record: decoded event dict
    :return: new dict with the keys ``class``, ``model``, ``resp``,
             ``feature_vector`` (first entry of ``request['instances']``),
             ``timestamp`` (the value of ``when``) and ``prediction``
             (first entry of ``resp``)
    :raises KeyError: if one of the required keys is missing
    """
    # The former label_col / prediction_col environment lookups were never
    # used in this function and have been dropped.
    when, class_name, model, resp, request = (
        record[key] for key in ('when', 'class', 'model', 'resp', 'request'))

    res = {'class': class_name,
           'model': model,
           'resp': resp,
           'feature_vector': request['instances'][0],
           'timestamp': when}
    res['prediction'] = resp[0]
    return res

In [12]:
def init_context(context):
    """Attach to ``context`` the state that ``handler`` relies on.

    Attributes set on ``context``:
      * ``v3io_client``    - v3io dataplane client
      * ``v3f``            - v3io_frames client
      * ``window``         - list buffering per-algorithm results
      * ``window_size``    - env ``window_size`` (default 10)
      * ``tsdb_table``     - env ``tsdb_table``
      * ``callbacks``      - env ``callbacks`` split on commas
      * ``drift_stream``   - env ``drift_stream``
      * ``models``         - dict name -> model, loaded from the files named
                             by the ``<name>_model_path`` env variables
      * ``label_col`` / ``prediction_col`` - record keys compared by handler

    TSDB table and stream creation are attempted here as well.
    """
    # create a v3io context object
    v3io_client = v3io.dataplane.Client()
    setattr(context, "v3io_client", v3io_client)

    # Setup windowing for TSDB writer
    v3f_client = v3f.Client('framesd:8081', container='bigdata')
    setattr(context, "v3f", v3f_client)
    window = []
    setattr(context, 'window', window)
    setattr(context, 'window_size', int(os.getenv('window_size', 10)))
    setattr(context, 'tsdb_table', os.getenv('tsdb_table', 'concept_drift_tsdb_1'))
    try:
        context.v3f.create('tsdb', context.tsdb_table, rate='1/s', if_exists=1)
    except Exception as e:
        # Retry with the rate passed through `attrs` instead of as a keyword.
        context.logger.info(f'Creating context with rate= faile for {e}')
        context.v3f.create('tsdb', context.tsdb_table, attrs={'rate': '1/s'}, if_exists=1)

    # Setup callbacks
    callbacks = [callback.strip() for callback in os.getenv('callbacks', '').split(',')]
    setattr(context, 'callbacks', callbacks)

    # Setup drift stream
    setattr(context, 'drift_stream', os.getenv('drift_stream', '/bigdata/drift_stream'))
    try:
        create_stream(context, context.drift_stream, int(os.getenv('drift_stream_shards', 1)))
    except Exception as exc:
        # Best effort: keep initialising, but report the real failure.
        # create_stream() already accepts the "already exists" status, so
        # anything reaching this point is a different problem.
        context.logger.info(f'Could not create {context.drift_stream}: {exc}')

    # Load models
    models = {}
    # 'pagehinkley' is the spelling used by the rest of the configuration.
    # The earlier misspelling 'pagehinkely' is still accepted so deployments
    # that set `pagehinkely_model_path` keep working exactly as before.
    model_types = ['pagehinkley', 'pagehinkely', 'ddm', 'eddm']
    path_suffix = '_model_path'
    for model in model_types:
        model_env = f'{model}{path_suffix}'
        if model_env in os.environ:
            # SECURITY NOTE: unpickling executes arbitrary code; the paths
            # supplied through these env variables must point to trusted files.
            with open(os.environ[model_env], 'rb') as f:
                models[model] = load(f)
    setattr(context, 'models', models)

    # Columns to check
    setattr(context, 'label_col', os.getenv('label_col', 'label'))
    setattr(context, 'prediction_col', os.getenv('prediction_col', 'prediction'))

In [13]:
def handler(context, event):
    """Run every loaded drift detector on one incoming event.

    Steps:
      1. Decode ``event.body`` (JSON) and flatten it with ``construct_record``.
      2. Compare ``record[context.label_col]`` with
         ``record[context.prediction_col]`` to get the error indicator.
      3. Feed the indicator to each model in ``context.models``; store the
         warning-zone / change flags both in the TSDB window and on the event
         as ``<name>_warning_zone`` / ``<name>_drift``.
      4. Push the annotated event to ``context.drift_stream`` and POST it to
         every configured callback URL.
      5. Once the window holds ``context.window_size`` results per model,
         write it to the TSDB table and reset it.

    :param context: function context prepared by ``init_context``
    :param event:   event whose ``body`` is a JSON document
    :return: None
    """
    # Construct event
    context.logger.info(f'event: {event.body}')
    full_event = json.loads(event.body)
    record = construct_record(full_event)

    # Is our prediction wrong?
    # NOTE(review): construct_record() sets 'prediction' to resp[0]. If
    # label_col is 'resp' and prediction_col is 'prediction', this compares
    # the whole response list with its first element, which differ for any
    # list value - confirm the intended label column.
    is_error = record[context.label_col] != record[context.prediction_col]
    context.logger.info(f'Adding {is_error}')

    # Process the error indicator with each algorithm
    for name, model in context.models.items():
        # Add element
        results = {'timestamp': record['timestamp'],
                   'algorithm': name}
        model.add_element(is_error)

        # Detect warning zone (if applicable to the algorithm)
        if hasattr(model, 'detected_warning_zone') and model.detected_warning_zone():
            context.logger.info(f'{name}\tWarning zone detected')
            warning_zone = 1
        else:
            warning_zone = 0
        results['warning_zone'] = warning_zone
        full_event[f'{name}_warning_zone'] = warning_zone

        # Detect drift
        if model.detected_change():
            context.logger.info('Change Detected')
            change_detected = 1
        else:
            change_detected = 0
        results['change_detected'] = change_detected
        full_event[f'{name}_drift'] = change_detected

        context.window.append(results)

    # Write to stream
    push_to_stream(context, context.drift_stream, [full_event])

    # Add to callbacks. Blank entries (an unset `callbacks` variable yields
    # [''], and "a,,b" yields one in the middle) are skipped instead of being
    # posted to an empty URL.
    for callback in context.callbacks:
        if callback:
            requests.post(url=callback,
                          json=full_event)

    # Flush the window to TSDB once it holds window_size results per model.
    # With no models loaded nothing is ever buffered, and the division below
    # would raise ZeroDivisionError, so that case is skipped.
    model_count = len(context.models)
    if model_count and (len(context.window) / model_count) >= context.window_size:
        df = pd.DataFrame(context.window)
        df['timestamp'] = pd.to_datetime(df['timestamp'])
        df = df.set_index(['timestamp', 'algorithm'])
        context.v3f.write('tsdb', context.tsdb_table, df)
        context.window = []

In [10]:
# nuclio: end-code

## Test 

In [None]:
# Local smoke test: initialise the context, then feed the handler one
# hand-built event.
init_context(context)

sample_body = {'prediction': 0,
               'when': 'now',
               'class': 'ClassModel',
               'model': 'tester_v1',
               'resp': [0],
               'request': {'instances': [[1, 1.2, 3]]}}
event = nuclio.Event(body=json.dumps(sample_body))

# handler() has no return statement, so `out` is None and nothing is shown;
# its effects are the stream / TSDB writes and the log lines.
out = handler(context, event)
out

## Cluster

In [None]:
%nuclio deploy -n network-operations-concept-drift -p network-operations

## Save function yaml

In [14]:
from os import path
from mlrun import run_local, NewTask, mlconf, import_function, mount_v3io, code_to_function, get_run_db
# Keep an already configured DB path; otherwise fall back to 'http://mlrun-api:8080'.
mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'

In [16]:
# create job function object from notebook code
# Build a nuclio function object named "concept_drift_streaming" from this notebook's code
fn = code_to_function("concept_drift_streaming", kind='nuclio')

# add metadata (for templates and reuse)
fn.spec.default_handler = "handler"  # name of the entry-point function defined in this notebook
fn.spec.description = "Deploy a streaming Concept Drift detector on a labeled stream. the nuclio part of the concept_drift function"
fn.metadata.categories = ["ml", "serve"]
fn.metadata.labels = {"author": "orz", "framework": "sklearn"}
# NOTE(review): hard-coded absolute path under /User; a later cell exports to
# the relative "function.yaml" instead - confirm which location is intended.
fn.export("/User/functions/concept_drift_streaming/function.yaml")

[mlrun] 2020-07-14 13:49:22,720 function spec saved to path: /User/functions/concept_drift_streaming/function.yaml


<mlrun.runtimes.function.RemoteRuntime at 0x7fb5a7de3e80>

In [120]:
# V3IO stream trigger for the function's input stream.
# NOTE(review): the '@cd2' suffix on the URL is presumably a consumer-group name - confirm.
stream_trigger = nuclio.triggers.V3IOStreamTrigger(url='/bigdata/network-operations/inference_stream@cd2')

In [121]:
# Register the stream trigger on the function under the name 'labeled_stream'.
fn.add_trigger('labeled_stream', stream_trigger)

<mlrun.runtimes.function.RemoteRuntime at 0x7fa1dc063780>

In [122]:
# Apply the mount_v3io() modifier, then call with_v3io() on the function
# (presumably the V3IO volume mount and access settings - confirm against the mlrun docs).
fn.apply(mount_v3io()).with_v3io()

<mlrun.runtimes.function.RemoteRuntime at 0x7fa1dc063780>

In [None]:
# Export the current function spec to "function.yaml" (relative path).
fn.export("function.yaml")

## Stream testing

In [40]:
# Rebind `fn` to the function loaded from ./function.yaml; from here on `fn`
# is that loaded object rather than the one built by code_to_function().
fn = import_function('./function.yaml')

In [None]:
# Deploy the function under the 'network-operations' project.
fn.deploy(project='network-operations')