# Concept Drift

In [2]:
import nuclio

In [3]:
from pprint import pprint

In [4]:
%%nuclio cmd -c
python -m pip install scikit-multiflow pandas v3io
python -m pip install pyarrow
python -m pip install mlrun
python -m pip install v3io_frames

In [5]:
# Define function spec
%nuclio config kind = "nuclio"
%nuclio config spec.image = "mlrun/ml-models:0.4.6"

# Add V3IO Mount
%nuclio env %v3io
%nuclio mount /User ~/

%nuclio: setting kind to 'nuclio'
%nuclio: setting spec.image to 'mlrun/ml-models:0.4.6'
mounting volume path /User as ~/


In [6]:
%%nuclio config
spec.triggers.labeled_stream.kind = "v3ioStream"
spec.triggers.labeled_stream.url = "http://v3io-webapi:8081/bigdata/migdal/inference_stream_1@cd"
spec.triggers.labeled_stream.attributes.partitions = [1]
spec.triggers.labeled_stream.attributes.pollingIntervalMs = 500
spec.triggers.labeled_stream.attributes.seekTo = "earliest"
spec.triggers.labeled_stream.attributes.readBatchSize = 64
spec.triggers.labeled_stream.maxWorkers = 1

%nuclio: setting spec.triggers.labeled_stream.kind to 'v3ioStream'
%nuclio: setting spec.triggers.labeled_stream.url to 'http://v3io-webapi:8081/bigdata/migdal/inference_stream_1@cd'
%nuclio: setting spec.triggers.labeled_stream.attributes.partitions to [1]
%nuclio: setting spec.triggers.labeled_stream.username to 'admin'
%nuclio: setting spec.triggers.labeled_stream.password to '<redacted>'
%nuclio: setting spec.triggers.labeled_stream.attributes.pollingIntervalMs to 500
%nuclio: setting spec.triggers.labeled_stream.attributes.seekTo to 'earliest'
%nuclio: setting spec.triggers.labeled_stream.attributes.readBatchSize to 64
%nuclio: setting spec.triggers.labeled_stream.maxWorkers to 1


In [7]:
# Streams
%nuclio env test_set = /User/migdal/pipeline/selected_features.parquet
%nuclio env test_set_label_col = is_error
%nuclio env label_col = label
%nuclio env prediction_col = prediction
%nuclio env drift_stream = /bigdata/migdal/drift_stream
%nuclio env tsdb_table = migdal/drift_tsdb

# Algorithms
%nuclio env pagehinkley_threshold = 10
%nuclio env models = pagehinkley, ddm, eddm

# Configurations
# %nuclio env callbacks = 
%nuclio env window_size = 10

%nuclio: setting 'test_set' environment variable
%nuclio: setting 'test_set_label_col' environment variable
%nuclio: setting 'label_col' environment variable
%nuclio: setting 'prediction_col' environment variable
%nuclio: setting 'drift_stream' environment variable
%nuclio: setting 'tsdb_table' environment variable
%nuclio: setting 'pagehinkley_threshold' environment variable
%nuclio: setting 'models' environment variable
%nuclio: setting 'window_size' environment variable


In [8]:
# nuclio: start-code

In [9]:
import skmultiflow.drift_detection
import numpy as np
import pandas as pd
import os
import json
import v3io.dataplane
import v3io_frames as v3f
import requests

# For testing
import random

In [10]:
def split_path(mntpath=''):
    """Split a container-rooted path into ``(container, subpath)``.

    A single leading ``/`` is ignored. The first path segment is returned as
    the container; everything after it, including its leading ``/``, is
    returned as the subpath.

    Examples:
        ``'/bigdata/a/b'`` -> ``('bigdata', '/a/b')``
        ``'bigdata'``      -> ``('bigdata', '')``
        ``''``             -> ``('', '')``

    :param mntpath: path whose first segment is the container name
    :return: ``(container, subpath)`` tuple of strings
    """
    # startswith() is safe on the empty string (this function's default
    # argument); indexing mntpath[0] raised IndexError in that case.
    if mntpath.startswith('/'):
        mntpath = mntpath[1:]
    # partition() hands back the separator, so the subpath keeps its leading
    # '/' and is empty when the path has no second segment.
    container, separator, remainder = mntpath.partition('/')
    return container, separator + remainder


def create_stream(context, path, shards=1):
    """Create a V3IO stream at ``path`` with ``shards`` shards (1 by default).

    ``path`` is split into container and stream path with ``split_path``.
    The request is sent with ``RaiseForStatus.never`` and the response is
    then checked explicitly, passing status codes 409 and 204 to
    ``raise_for_status`` (409 presumably meaning the stream already exists -
    confirm against the v3io client documentation).

    :param context: function context exposing ``logger`` and ``v3io_client``
    :param path:    container-rooted stream path, e.g. ``/bigdata/my_stream``
    :param shards:  number of shards to create the stream with
    """
    container, stream_path = split_path(path)
    context.logger.info(f'Creating stream in Container: {container} & Path {stream_path}')

    # Status handling is deferred to the explicit raise_for_status() call below.
    create_args = {
        'container': container,
        'path': stream_path,
        'shard_count': shards,
        'raise_for_status': v3io.dataplane.RaiseForStatus.never,
    }
    creation_response = context.v3io_client.create_stream(**create_args)
    creation_response.raise_for_status([409, 204])
    
    
def push_to_stream(context, stream_path, data):
    """Write every item of ``data`` to the V3IO stream at ``stream_path``.

    Each item is JSON-serialised and wrapped as a ``{'data': ...}`` record
    before all records are sent in a single ``put_records`` call.

    :param context:     function context exposing ``v3io_client``
    :param stream_path: container-rooted stream path
    :param data:        iterable of JSON-serialisable items
    """
    # Serialise first so a non-serialisable item fails before any path handling.
    records = []
    for item in data:
        records.append({'data': json.dumps(item)})

    container, stream_path = split_path(stream_path)
    context.v3io_client.put_records(container=container,
                                    path=stream_path,
                                    records=records)


def construct_record(record):
    """Build the flat record used for drift detection from a model-server event.

    Reads the ``when``, ``class``, ``model``, ``resp`` and ``request`` keys of
    ``record`` (the input dict itself is not modified) and returns a new dict
    holding:

    * ``class``, ``model``, ``resp`` - copied as-is
    * ``feature_vector`` - the first entry of ``request['instances']``
    * ``timestamp`` - the value of ``when``
    * the prediction column - the first entry of ``resp``
    * the label column - a randomly generated stand-in label (testing only)

    The prediction and label keys are taken from the ``prediction_col`` and
    ``label_col`` environment variables (defaults ``'prediction'`` and
    ``'label'``). The handler looks the record up by those same configured
    names, so hard-coding the keys here broke any non-default configuration.

    :param record: decoded event dict
    :return: new dict as described above
    :raises KeyError: if one of the required keys is missing from ``record``
    """
    label_col = os.getenv('label_col', 'label')
    prediction_col = os.getenv('prediction_col', 'prediction')

    res = {key: record[key] for key in ('when', 'class', 'model', 'resp', 'request')}
    res['feature_vector'] = res.pop('request')['instances'][0]
    res['timestamp'] = res.pop('when')
    res[prediction_col] = res['resp'][0]

    ## For Testing - Start
    # No ground-truth label is available in the event, so one is simulated:
    # a prediction of 0 is labelled 1 with probability 0.3, any other
    # prediction is labelled 1 with probability 0.2.
    label_chance = random.random()
    if res[prediction_col] == 0:
        res[label_col] = 0 if label_chance > 0.3 else 1
    else:
        res[label_col] = 1 if label_chance > 0.8 else 0
    ## For Testing - End
    return res

In [11]:
def init_context(context):
    """Prepare the function context: clients, TSDB table, stream and detectors.

    Attributes set on ``context``:

    * ``v3io_client``  - V3IO data-plane client
    * ``v3f``          - v3io_frames client (``framesd:8081``, container ``bigdata``)
    * ``window``       - list buffering per-algorithm results for the TSDB writer
    * ``tsdb_table``   - TSDB table name (env ``tsdb_table``)
    * ``callbacks``    - callback URLs (env ``callbacks``, comma separated)
    * ``drift_stream`` - output stream path (env ``drift_stream``)
    * ``models``       - ``{name: detector}`` dict of drift detectors
    * ``label_col``, ``prediction_col``, ``window_size`` - from the env vars of
      the same names

    The detectors are selected with the ``models`` env var (comma separated;
    known names are ``eddm``, ``pagehinkley`` and ``ddm``). Unknown names are
    logged and skipped; if no known name remains, all detectors are used.
    Each selected detector is first fed the label column (env
    ``test_set_label_col``) of the parquet file named by env ``test_set``.
    """
    # V3IO data-plane client used by the stream helpers
    setattr(context, 'v3io_client', v3io.dataplane.Client())

    # Setup windowing for TSDB writer
    setattr(context, 'v3f', v3f.Client('framesd:8081', container='bigdata'))
    setattr(context, 'window', [])
    setattr(context, 'tsdb_table', os.getenv('tsdb_table', 'concept_drift_tsdb_1'))
    try:
        context.v3f.create('tsdb', context.tsdb_table, attrs={'rate': '1/s'}, if_exists=1)
    except Exception:
        # Retry with the rate passed as a plain keyword argument
        # (presumably the calling convention of a different v3io_frames
        # version - TODO confirm).
        context.v3f.create('tsdb', context.tsdb_table, rate='1/s', if_exists=1)

    # Setup callbacks. Blank entries (including the unset/empty variable) are
    # dropped so no request is ever sent to an empty URL.
    raw_callbacks = os.getenv('callbacks', '').split(',')
    callbacks = [url.strip() for url in raw_callbacks if url.strip()]
    setattr(context, 'callbacks', callbacks)

    # Setup drift stream (best effort: a failure here must not stop startup)
    setattr(context, 'drift_stream', os.getenv('drift_stream', '/bigdata/drift_stream'))
    try:
        create_stream(context, context.drift_stream, int(os.getenv('drift_stream_shards', 1)))
    except Exception as exc:
        context.logger.info(
            f'Could not create {context.drift_stream}, assuming it already exists: {exc}')

    # Load test dataset
    base_df = pd.read_parquet(os.getenv('test_set', ''))
    base_predictions_stream = base_df.loc[:, os.getenv('test_set_label_col', 'label')].values
    base_size = len(base_predictions_stream)

    # Models: factories, so only the requested detectors get constructed.
    # The fallback values mirror the scikit-multiflow constructor defaults.
    available_models = {
        'eddm': lambda: skmultiflow.drift_detection.EDDM(),
        'pagehinkley': lambda: skmultiflow.drift_detection.PageHinkley(
            min_instances=base_size,
            threshold=float(os.getenv('pagehinkley_threshold', 50))),
        'ddm': lambda: skmultiflow.drift_detection.DDM(
            min_num_instances=base_size,
            warning_level=float(os.getenv('ddm_warning_level', 2)),
            out_control_level=float(os.getenv('ddm_out_control_level', 3))),
    }
    requested = {name.strip()
                 for name in os.getenv('models', 'pagehinkley, ddm, eddm').split(',')}
    unknown = sorted(name for name in requested if name and name not in available_models)
    if unknown:
        context.logger.info(f'Ignoring unknown drift models: {unknown}')
    # Fall back to every detector when nothing valid was requested, so the
    # handler never runs with an empty model set.
    selected = [name for name in available_models if name in requested] or list(available_models)
    models = {name: available_models[name]() for name in selected}
    setattr(context, 'models', models)

    # Run initial dataset through every selected detector
    for element in base_predictions_stream:
        for model in models.values():
            model.add_element(element)

    setattr(context, 'label_col', os.getenv('label_col', 'label'))
    setattr(context, 'prediction_col', os.getenv('prediction_col', 'prediction'))
    setattr(context, 'window_size', int(os.getenv('window_size', 10)))

In [12]:
def handler(context, event):
    """Process one prediction event through every configured drift detector.

    Steps:

    1. Decode ``event.body`` as JSON and build the working record with
       ``construct_record``.
    2. Compute ``is_error`` (label differs from prediction) and feed it to
       each detector in ``context.models``.
    3. Add ``<name>_warning_zone`` and ``<name>_drift`` flags (0/1) to the
       decoded event and append a per-algorithm row to ``context.window``.
    4. Push the enriched event to ``context.drift_stream`` and POST it to
       every non-empty URL in ``context.callbacks``.
    5. Once the window holds ``context.window_size`` rows per model, write it
       to the TSDB table and reset it.

    Callback requests use a timeout (env ``callback_timeout``, seconds,
    default 10). A failing callback is logged and does not prevent the TSDB
    write. Returns ``None``.
    """
    # Construct event
    context.logger.info(f'event: {event.body}')
    full_event = json.loads(event.body)
    record = construct_record(full_event)

    # Is our prediction wrong?
    is_error = record[context.label_col] != record[context.prediction_col]
    context.logger.info(f'Adding {is_error}')

    # Process the {is_error} element with our algorithms
    for name, model in context.models.items():
        model.add_element(is_error)

        # Detect warning zone (if applicable to the algorithm)
        in_warning_zone = hasattr(model, 'detected_warning_zone') and model.detected_warning_zone()
        if in_warning_zone:
            context.logger.info(f'{name}\tWarning zone detected')
        warning_flag = 1 if in_warning_zone else 0

        # Detect drift
        change_detected = model.detected_change()
        if change_detected:
            context.logger.info('Change Detected')
        drift_flag = 1 if change_detected else 0

        full_event[f'{name}_warning_zone'] = warning_flag
        full_event[f'{name}_drift'] = drift_flag
        context.window.append({'timestamp': record['timestamp'],
                               'algorithm': name,
                               'warning_zone': warning_flag,
                               'change_detected': drift_flag})

    # Write to stream
    push_to_stream(context, context.drift_stream, [full_event])

    # Add to callbacks. Without a timeout a stalled endpoint would block this
    # worker indefinitely; blank entries (e.g. the [''] produced by an empty
    # env var) are skipped.
    callback_timeout = float(os.getenv('callback_timeout', 10))
    for callback in context.callbacks:
        if not callback:
            continue
        try:
            requests.post(url=callback,
                          json=full_event,
                          timeout=callback_timeout)
        except requests.RequestException as exc:
            context.logger.warn(f'Callback {callback} failed: {exc}')

    # Flush once there are window_size rows per model. Multiplying instead of
    # dividing avoids a ZeroDivisionError when no models are configured.
    if context.models and len(context.window) >= context.window_size * len(context.models):
        df = pd.DataFrame(context.window)
        df['timestamp'] = pd.to_datetime(df['timestamp'])
        df = df.set_index(['timestamp', 'algorithm'])
        context.v3f.write('tsdb', context.tsdb_table, df)
        context.window = []

In [13]:
# nuclio: end-code

## Test 

In [None]:
# Local smoke test: initialise the context, then push one hand-written
# event through the handler.
init_context(context)

sample_payload = {
    'prediction': 0,
    'when': 'now',
    'class': 'ClassModel',
    'model': 'tester_v1',
    'resp': [0],
    'request': {'instances': [[1, 1.2, 3]]},
}
event = nuclio.Event(body=json.dumps(sample_payload))
out = handler(context, event)
out

## Cluster

In [66]:
%nuclio deploy -n labeled-drift-deployed -p migdal

[nuclio] 2020-05-11 10:27:40,842 (info) Build complete
[nuclio] 2020-05-11 10:28:07,389 (info) Function deploy complete
[nuclio] 2020-05-11 10:28:07,434 done updating labeled-drift-deployed, function address: 192.168.224.209:31726
%nuclio: function deployed


## Stream testing

In [38]:
v3io_client = v3io.dataplane.Client()
def sum_stream(path, shard='0', seek_type='EARLIEST', models=('pagehinkley', 'eddm', 'ddm')):
    """Sum the drift and warning flags of the records read from one shard.

    Seeks ``shard`` of the stream at ``path`` using ``seek_type``, issues a
    single ``get_records`` call from that location and, for every name in
    ``models``, sums the ``<name>_drift`` and ``<name>_warning_zone`` fields
    of the returned records.

    :param path:      container-rooted stream path
    :param shard:     shard id, as a string
    :param seek_type: seek type passed to ``seek_shard``
    :param models:    algorithm names whose flags are summed
    :return: dict with ``<name>_change_detected`` and ``<name>_warning`` totals
    :raises KeyError: if a record lacks one of the expected flag fields
    """
    # seek the shard to the requested position
    container, stream_path = split_path(path)
    shard_path = os.path.join(stream_path, shard)
    response = v3io_client.seek_shard(container=container,
                                      path=shard_path,
                                      seek_type=seek_type)
    response.raise_for_status()

    # get records, starting from the location we got from seek
    response = v3io_client.get_records(container=container,
                                       path=shard_path,
                                       location=response.output.location)
    response.raise_for_status()

    # Decode each record once rather than once per model and per flag.
    decoded = [json.loads(record.data) for record in response.output.records]

    results = {}
    for model in models:
        results[f'{model}_change_detected'] = sum(rec[f'{model}_drift'] for rec in decoded)
        results[f'{model}_warning'] = sum(rec[f'{model}_warning_zone'] for rec in decoded)
    return results

In [57]:
''.split(',') == ['']

True

In [None]:
## Test live endpoint with model_tester

In [2]:
from os import path
from mlrun import run_local, NewTask, mlconf, import_function, mount_v3io, code_to_function
mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'

In [24]:
tester = import_function('hub://model_server_tester').apply(mount_v3io())

In [3]:
cmd = code_to_function(filename='/User/functions/model_server_tester/model_server_tester.ipynb', kind='local', code_output='./tester.py')

In [31]:
import pandas as pd
table = '/v3io/bigdata/concept_drift_ex/tests/feature_change.pq'
df = pd.read_parquet(table)
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,packet_loss,throughput,is_error
timestamp,company,data_center,device,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-03-29 19:22:10.106,Johnson-Morgan,Glenn_Port,6625659405376,30,50,False
2020-03-29 19:22:10.106,Johnson-Morgan,Glenn_Port,306839395881,30,50,False
2020-03-29 19:22:10.106,Johnson-Morgan,Baker_Locks,9686333640344,30,50,False
2020-03-29 19:22:10.106,Johnson-Morgan,Baker_Locks,6135824620701,30,50,False
2020-03-29 19:22:10.106,Romero-Perry,Kim_Locks,9598503476170,30,50,False


In [28]:
t = pd.read_parquet('/User/v3io/bigdata/concept_drift_ex/selected_features.parquet')

In [29]:
t['packet_loss'] = 30
t['throughput'] = 50
t.to_parquet('/v3io/bigdata/concept_drift_ex/tests/feature_change.pq')

In [None]:
# Run the tester function locally, ten times, against the endpoint below
addr = 'http://192.168.224.209:32418'

# Pick the input table by (un)commenting exactly one of the following lines
table = '/User/v3io/bigdata/concept_drift_ex/selected_features.parquet' # Base dataset
# table = '/User/v3io/bigdata/concept_drift_ex/tests/test_set_true.pq' # All labels = True
# table = '/User/v3io/bigdata/concept_drift_ex/tests/test_set_false.pq' # All labels = False
# table = '/v3io/bigdata/concept_drift_ex/tests/feature_change.pq' # Feature change

for i in range(10):
    # Fresh dicts on every iteration, so each run gets its own arguments
    tester_params = {'addr': addr,
                     'model': 'predictor',
                     'label_column': 'is_error'}
    tester_inputs = {'table': table}
    cmd.run(name='model_server_tester',
            handler='model_server_tester',
            params=tester_params,
            inputs=tester_inputs)