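# Concept Drift

Deploy a streaming concept drift detector on a labeled stream: warm up the drift detectors (DDM, EDDM, Page-Hinkley) on a base dataset of labels and predictions, log them as models, and deploy the `concept_drift_streaming` function with the warm models.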

In [4]:
import nuclio

In [5]:
from pprint import pprint

In [6]:
%%nuclio cmd -c
python -m pip install scikit-multiflow==0.4.1
python -m pip install v3io_frames
python -m pip install git+https://github.com/nuclio/nuclio-jupyter.git

In [7]:
# Define function spec
%nuclio config spec.build.baseImage = "mlrun/ml-models:0.4.7"

%nuclio: setting spec.build.baseImage to 'mlrun/ml-models:0.4.7'


In [5]:
# nuclio: start-code

In [1]:
import skmultiflow.drift_detection
import numpy as np
import pandas as pd
import os
from cloudpickle import dumps, load, dump

from nuclio.triggers import NuclioTrigger
from mlrun import DataItem, import_function, mlconf, MLClientCtx, mount_v3io

# For testing
import random

In [2]:
# Keep until nuclio-jupyter is updated in the mlrun images
class V3IOStreamTrigger(NuclioTrigger):
    """Nuclio trigger description for a v3io stream.

    The constructor only assembles the ``_struct`` dictionary
    (``kind``, ``url``, ``attributes`` and credentials); it performs no I/O.
    """
    kind = 'v3ioStream'

    def __init__(self, url: str, seekTo: str = 'earliest',
                 partitions: list = [0], pollingIntervalMS: int = 250,
                 readBatchSize: int = 64, maxWorkers: int = 1, 
                 username: str = None, password: str = None):
        """Build the trigger struct.

        Falsy values of ``maxWorkers``, ``seekTo``, ``partitions``,
        ``readBatchSize`` and ``pollingIntervalMS`` are omitted from the struct.

        :param url:               stream URL, stored under ``url``
        :param seekTo:            stored under ``attributes.seekTo``
        :param partitions:        stored (as a copy) under ``attributes.partitions``
        :param pollingIntervalMS: stored under ``attributes.pollingIntervalMs``
        :param readBatchSize:     stored under ``attributes.readBatchSize``
        :param maxWorkers:        stored under ``maxWorkers``
        :param username:          falls back to the ``V3IO_USERNAME`` env var
        :param password:          falls back to the ``V3IO_ACCESS_KEY`` env var
        :raises KeyError: when a credential is not given and its
                          environment variable is not set
        """
        self._struct = {'kind': self.kind,
                        'url': url,
                        'attributes': {}}
        if maxWorkers:
            self._struct['maxWorkers'] = maxWorkers

        attributes = self._struct['attributes']
        if seekTo:
            attributes['seekTo'] = seekTo
        if partitions:
            # Copy so the struct never aliases the shared default list
            # (or a list the caller keeps mutating).
            attributes['partitions'] = list(partitions)
        if readBatchSize:
            attributes['readBatchSize'] = readBatchSize
        if pollingIntervalMS:
            attributes['pollingIntervalMs'] = pollingIntervalMS

        # Only `os` is imported in this notebook; the bare name `environ`
        # used previously raised NameError whenever a credential was omitted.
        user = username if username else os.environ['V3IO_USERNAME']
        self._struct['username'] = user
        access_key = password if password else os.environ['V3IO_ACCESS_KEY']
        self._struct['password'] = access_key

In [3]:
def concept_drift_deployer(context: MLClientCtx, base_dataset:DataItem, 
                           input_stream:str, output_stream:str, output_tsdb:str, tsdb_batch_size:int, callbacks:list, 
                           models:list=['ddm', 'eddm', 'pagehinkley'], models_dest='models',
                           pagehinkley_threshold:float=10, ddm_warning_level:float=2, ddm_out_control_level:float=3,
                           label_col='label', prediction_col='prediction', hub_url:str=mlconf.hub_url, fn_tag:str='master'):
    '''Warm up concept drift detectors on a base dataset and deploy the streaming detector.

    The base dataset is turned into a 0/1 error stream (1 where the prediction
    differs from the label), the requested detectors are fed with it, logged as
    model artifacts, and the ``concept_drift_streaming`` hub function is
    deployed with the model paths and settings passed as environment variables.

    :param context:               MLRun context (logging, artifacts, project name)
    :param base_dataset:          dataset holding ``label_col`` and ``prediction_col``
    :param input_stream:          URL of the labeled stream that triggers the deployed function
    :param output_stream:         passed to the streaming function as env ``drift_stream``
    :param output_tsdb:           passed to the streaming function as env ``tsdb_table``
    :param tsdb_batch_size:       accepted for interface compatibility; not used by this deployer
    :param callbacks:             accepted for interface compatibility; not used by this deployer
    :param models:                detector names to create, any of ``ddm``, ``eddm``,
                                  ``pagehinkley`` (a comma separated string is also accepted)
    :param models_dest:           sub-directory of the artifact path for the model files
    :param pagehinkley_threshold: ``threshold`` of the PageHinkley detector
    :param ddm_warning_level:     ``warning_level`` of the DDM detector
    :param ddm_out_control_level: ``out_control_level`` of the DDM detector
    :param label_col:             label column name
    :param prediction_col:        prediction column name
    :param hub_url:               assigned to ``mlconf.hub_url`` before importing the function
    :param fn_tag:                accepted for interface compatibility; not used by this deployer
                                  NOTE(review): the hub import does not select a tag - confirm intent
    :raises ValueError: when ``models`` is empty or names an unsupported detector
    '''
    # Set the streaming function
    mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'
    mlconf.hub_url = hub_url
    fn = import_function(url='hub://concept_drift_streaming')
    
    # Load test dataset
    context.logger.info('Loading base dataset')
    base_df = base_dataset.as_df()
    error_stream = np.where(base_df[prediction_col].values==base_df[label_col].values, 0, 1)
    base_size = len(error_stream)
    
    # Models
    # Previously the `models` argument was overwritten (first from an env var,
    # then by a dict of all detectors), so the caller's selection was ignored.
    context.logger.info('Creating models')
    model_factories = {
        'eddm': lambda: skmultiflow.drift_detection.EDDM(),
        'pagehinkley': lambda: skmultiflow.drift_detection.PageHinkley(min_instances=base_size,
                                                                       threshold=pagehinkley_threshold),
        'ddm': lambda: skmultiflow.drift_detection.DDM(min_num_instances=base_size,
                                                       warning_level=ddm_warning_level,
                                                       out_control_level=ddm_out_control_level)}
    if isinstance(models, str):
        models = models.split(',')
    requested = [name.strip() for name in (models or [])]
    unknown = sorted(set(requested) - set(model_factories))
    if unknown or not requested:
        raise ValueError(f'models must be a non-empty selection of {sorted(model_factories)}, '
                         f'got {models!r}')
    # NOTE(review): confirm the streaming function copes with a subset of detectors.
    detectors = {name: model_factories[name]() for name in requested}
    
    # Run initial dataset
    context.logger.info('Streaming data to models')
    for error in error_stream:
        for detector in detectors.values():
            detector.add_element(error)
            
    # Save warm models
    context.logger.info('Logging ready models')
    for name, detector in detectors.items():
        data = dumps(detector)
        model_file = f'{name}.pkl'
        context.log_model(f'{name}_concept_drift', body=data, labels=['concept_drift'],
                          model_file=model_file, model_dir=models_dest)
        # Tell the streaming function where to load this warm model from
        fn.set_envs({f'{name}_model_path': os.path.join(context.artifact_path, models_dest, model_file)})
            
    # Deploy streaming concept drift function
    # with the warm models
    context.logger.info('Deploying Concept Drift Streaming function')
    fn.set_envs({'label_col': label_col,
                 'prediction_col': prediction_col, 
                 'drift_stream': output_stream,
                 'tsdb_table': output_tsdb,
                 'pagehinkley_threshold': pagehinkley_threshold,
                 'ddm_warning_level': ddm_warning_level,
                 'ddm_out_control': ddm_out_control_level})    
    fn.add_trigger('labeled_stream', V3IOStreamTrigger(url=input_stream))
    fn.apply(mount_v3io())
    print(fn.to_yaml())
    fn.deploy(project=context.project)

In [8]:
# nuclio: end-code

In [9]:
from mlrun import run_local, NewTask

In [13]:
# Task spec for the deployer run; all values point at the network-operations demo.
task = NewTask(
    name='concept_drift_deployer',
    project='network-operations',
    handler=concept_drift_deployer,
    artifact_path='/User/demo-network-operations/artifacts/',
    # Dataset with labels and predictions used to warm up the detectors
    inputs=dict(base_dataset='store://network-operations/test_test_set_preds'),
    params=dict(
        models=['ddm', 'eddm', 'pagehinkley'],
        label_col='is_error',
        prediction_col='yscore',
        hub_url='/User/functions/{name}/function.yaml',
        output_tsdb='/bigdata/network-operations/drift_tsdb',
        input_stream='http://v3io-webapi:8081/bigdata/network-operations/inference_stream@cd2',
        output_stream='/bigdata/network-operations/drift_stream',
    ),
)

In [14]:
# Execute the task with run_local; the log below shows the warm models being
# logged and the streaming function being deployed.
run_local(task)

[mlrun] 2020-06-02 14:09:36,252 starting run concept_drift_deployer uid=f87a18e1f6344f6da4f36f5f461dd1c1  -> http://10.192.65.32:8080
[nuclio] 2020-06-02 14:09:42,994 (info) Build complete
[nuclio] 2020-06-02 14:10:05,062 (info) Function deploy complete
[nuclio] 2020-06-02 14:10:05,104 done updating network-operations-concept-drift-streaming, function address: 192.168.224.209:30400
[mlrun] 2020-06-02 14:09:36,910 Loading base dataset
[mlrun] 2020-06-02 14:09:36,971 Creating models
[mlrun] 2020-06-02 14:09:36,972 Streaming data to models
[mlrun] 2020-06-02 14:09:37,018 Logging ready models
[mlrun] 2020-06-02 14:09:37,095 log artifact eddm_concept_drift at /User/demo-network-operations/artifacts/models/, size: 409, db: Y
[mlrun] 2020-06-02 14:09:37,174 log artifact pagehinkley_concept_drift at /User/demo-network-operations/artifacts/models/, size: 390, db: Y
[mlrun] 2020-06-02 14:09:37,273 log artifact ddm_concept_drift at /User/demo-network-operations/artifacts/models/, size: 455, db: Y

project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
network-operations,...461dd1c1,0,Jun 02 14:09:36,completed,concept_drift_deployer,v3io_user=adminkind=handlerowner=adminhost=jupyter-7b6c4d46d6-8nxbv,base_dataset,"models=['ddm', 'eddm', 'pagehinkley']label_col=is_errorprediction_col=yscorehub_url=/User/functions/{name}/function.yamloutput_tsdb=/bigdata/network-operations/drift_tsdbinput_stream=http://v3io-webapi:8081/bigdata/network-operations/inference_stream@cd2output_stream=/bigdata/network-operations/drift_stream",,eddm_concept_driftpagehinkley_concept_driftddm_concept_drift


to track results use .show() or .logs() or in CLI: 
!mlrun get run f87a18e1f6344f6da4f36f5f461dd1c1 --project network-operations , !mlrun logs f87a18e1f6344f6da4f36f5f461dd1c1 --project network-operations
[mlrun] 2020-06-02 14:10:05,483 run executed, status=completed


<mlrun.model.RunObject at 0x7f32a0733550>

## Save function yaml

In [17]:
from os import path
from mlrun import run_local, NewTask, mlconf, import_function, mount_v3io, code_to_function
# Keep an already configured DB path; otherwise fall back to this address.
mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'

In [22]:
# create job function object from notebook code
fn = code_to_function("concept_drift", kind='job')

# add metadata (for templates and reuse)
# The only handler defined between the nuclio start-code/end-code markers is
# `concept_drift_deployer`; the previous default ("handler") named a function
# that does not exist in the exported code.
fn.spec.default_handler = "concept_drift_deployer"
fn.spec.description = "Deploy a streaming Concept Drift detector on a labeled stream"
fn.metadata.categories = ["ml", "serve"]
fn.metadata.labels = {"author": "orz", "framework": "sklearn"}
fn.export("function.yaml")

[mlrun] 2020-06-02 17:15:02,341 function spec saved to path: function.yaml


<mlrun.runtimes.kubejob.KubejobRuntime at 0x7f3209b9a320>

In [23]:
# Apply the v3io mount modifier to the job function.
fn.apply(mount_v3io())

<mlrun.runtimes.kubejob.KubejobRuntime at 0x7f3209b9a320>

## Stream testing

In [None]:
# Deploy the job function (this cell has no recorded execution in the saved notebook).
fn.deploy()

In [None]:
# Run the deployer task through the job function `fn` (run_local was used above).
fn.run(task)

In [17]:
# This cell used `json` and `v3io` without importing them anywhere in the
# notebook; import everything it needs so it runs on a fresh kernel.
import json
import os

import v3io.dataplane

# NOTE(review): `split_path` is not defined or imported in any visible cell -
# bring it into scope before calling sum_stream.
v3io_client = v3io.dataplane.Client()


def sum_stream(path, shard='0', seek_type='EARLIEST'):
    """Sum the drift and warning flags found in one shard of a drift stream.

    Only the records returned by a single ``get_records`` call are counted.

    :param path:      stream path, split into container and stream path by ``split_path``
    :param shard:     shard name appended to the stream path
    :param seek_type: seek type passed to ``seek_shard``
    :return: dict with ``<model>_change_detected`` and ``<model>_warning`` sums
             for ``pagehinkley``, ``eddm`` and ``ddm``
    """
    # seek the shard to the requested location
    container, stream_path = split_path(path)
    shard_path = os.path.join(stream_path, shard)
    response = v3io_client.seek_shard(container=container,
                                      path=shard_path, 
                                      seek_type=seek_type)
    response.raise_for_status()

    # get records, starting from the location we got from seek
    response = v3io_client.get_records(container=container,
                                       path=shard_path, 
                                       location=response.output.location)
    response.raise_for_status()

    # Decode every record once instead of once per model and flag
    payloads = [json.loads(record.data) for record in response.output.records]

    results = {}
    for model in ('pagehinkley', 'eddm', 'ddm'):
        results[f'{model}_change_detected'] = sum(payload[f'{model}_drift'] for payload in payloads)
        results[f'{model}_warning'] = sum(payload[f'{model}_warning_zone'] for payload in payloads)
    return results

In [None]:
## Test live endpoint with model_tester

In [19]:
# Import the tester function from the hub and apply the v3io mount.
# NOTE(review): `tester` is not referenced by the later visible cells, which use `cmd`.
tester = import_function('hub://model_server_tester').apply(mount_v3io())

In [3]:
# Build a local-kind function from the tester notebook; generated code goes to ./tester.py.
cmd = code_to_function(filename='/User/functions/model_server_tester/model_server_tester.ipynb', kind='local', code_output='./tester.py')

In [31]:
import pandas as pd
# Preview the feature-change test table. This file is written by the
# `t.to_parquet(...)` cell further down, so that cell has to run first.
table = '/v3io/bigdata/concept_drift_ex/tests/feature_change.pq'
df = pd.read_parquet(table)
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,packet_loss,throughput,is_error
timestamp,company,data_center,device,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-03-29 19:22:10.106,Johnson-Morgan,Glenn_Port,6625659405376,30,50,False
2020-03-29 19:22:10.106,Johnson-Morgan,Glenn_Port,306839395881,30,50,False
2020-03-29 19:22:10.106,Johnson-Morgan,Baker_Locks,9686333640344,30,50,False
2020-03-29 19:22:10.106,Johnson-Morgan,Baker_Locks,6135824620701,30,50,False
2020-03-29 19:22:10.106,Romero-Perry,Kim_Locks,9598503476170,30,50,False


In [28]:
# Load the base dataset (the same file is the default tester input below).
# NOTE(review): this path starts with /User/v3io while the test tables use /v3io - confirm both mounts exist.
t = pd.read_parquet('/User/v3io/bigdata/concept_drift_ex/selected_features.parquet')

In [29]:
# Overwrite two feature columns with constants and save the frame as the
# "feature change" test table.
t['packet_loss'] = 30
t['throughput'] = 50
t.to_parquet('/v3io/bigdata/concept_drift_ex/tests/feature_change.pq')

In [None]:
# Run the local tester function against the model server ten times.
addr = 'http://192.168.224.209:32418'

# Choose one input table; the commented alternatives cover the other test scenarios.
# table = '/User/v3io/bigdata/concept_drift_ex/tests/test_set_true.pq' # All labels = True
# table = '/User/v3io/bigdata/concept_drift_ex/tests/test_set_false.pq' # All labels = False
# table = '/v3io/bigdata/concept_drift_ex/tests/feature_change.pq' # Feature change
table = '/User/v3io/bigdata/concept_drift_ex/selected_features.parquet' # Base dataset

for i in range(10):
    cmd.run(
        name='model_server_tester',
        handler='model_server_tester',
        inputs=dict(table=table),
        params=dict(addr=addr, model='predictor', label_column='is_error'),
    )