# Virtual Drift

Drift magnitude metrics
       Computes drift magnitude metrics between base dataset t and dataset u.  

Metrics:
- TVD (Total Variation Distance)
- Helinger
- KL Divergence

## Environment setup

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import nuclio

In [2]:
from mlrun import code_to_function, mount_v3io, run_local

In [None]:
%nuclio config kind = "job"
%nuclio config spec.image = "mlrun/ml-models"

In [None]:
%%nuclio cmd -c
pip install v3io_frames
# pip install seaborn
pip install scikit-learn
pip install scipy

In [None]:
# %nuclio cmd -c python -m pip install v3io_frames

## Function

In [5]:
# nuclio: start-code

In [7]:
import os
import pandas as pd
import numpy as np
import scipy as sp
import pickle
import datetime

import v3io_frames as v3f

import matplotlib.pyplot as plt
# import seaborn as sns
from sklearn.preprocessing import KBinsDiscretizer

In [8]:
def to_observations(context, t, u, key):
    # Create density
    t = t.apply(lambda row: f"{'_'.join([str(row[val]) for val in t.columns])}", axis=1).value_counts().sort_index()
    u = u.apply(lambda row: f"{'_'.join([str(row[val]) for val in u.columns])}", axis=1).value_counts().sort_index()

    # Add 0s if needed
    joined_uniques = pd.DataFrame([t, u]).T.fillna(0).sort_index()
    joined_uniques.columns = ['t', 'u']

    t_obs = joined_uniques.loc[:, 't']
    u_obs = joined_uniques.loc[:, 'u']

    t_pdf = t_obs/t_obs.sum()
    u_pdf = u_obs/u_obs.sum()

    context.log_dataset(f'{key}_t_pdf', pd.DataFrame(t_pdf), format='parquet')
    context.log_dataset(f'{key}_u_pdf', pd.DataFrame(u_pdf), format='parquet')
    return t_pdf, u_pdf

In [9]:
def tvd(t, u):
    return sum(abs(t - u)) / 2

def helinger(t, u):
    return (np.sqrt(np.sum(np.power(np.sqrt(t) - np.sqrt(u), 2))))/np.sqrt(2)

def kl_divergence(t, u):
    t_u = np.sum(np.where(t != 0, t * np.log(t / u), 0))
    u_t = np.sum(np.where(u != 0, u * np.log(u / t), 0))
    return t_u + u_t

def all_metrics(t, u):
    return tvd(t, u), helinger(t, u), kl_divergence(t, u)

In [40]:
def drift_magnitude(context, t: pd.DataFrame, u: pd.DataFrame, 
         label_col=None, prediction_col=None, 
         discretizers: dict = None, n_bins=5,
         stream_name: str = 'some_stream',
         results_tsdb_container: str = 'bigdata',
         results_tsdb_table: str = 'concept_drift/drift_magnitude'):
    """Drift magnitude metrics
       Computes drift magnitude metrics between base dataset t and dataset u.
       Metrics:
        - TVD (Total Variation Distance)
        - Helinger
        - KL Divergence
        
    :param context: MLRun context
    :param t: Base dataset for the drift metrics
    :param u: Test dataset for the drift metrics
    :param label_col: Label colum in t and u
    :param prediction_col: Predictions column in t and u
    :param discritizers: Dictionary of dicsritizers for the features if available
                         (Created automatically if not provided)
    :param n_bins: Number of bins to be used for histrogram creation from continuous variables
    :param stream_name: Output stream to push metrics to
    :param results_tsdb_container: TSDB table container to push metrics to
    :param results_tsdb_table: TSDB table to push metrics to
    """
    
    # Setup v3io connection and TSDB table
    v3io_client = v3f.Client('framesd:8081', container=results_tsdb_container)
    try:
        v3io_client.create('tsdb', results_tsdb_table, if_exists=1, rate='1/s')
    except:
        v3io_client.create('tsdb', results_tsdb_table, if_exists=1, attrs={'rate': '1/s'})
    
    # Get input DFs
    df_t = t.as_df()
    df_u = u.as_df()
    
    # Get feature cols
    
    drop_columns = []
    if label_col is not None:
        drop_columns.append(label_col)
    if prediction_col is not None:
        drop_columns.append(prediction_col)
    
    
    # Discretize continuous featuers
    continuous_features = df_t.select_dtypes(['float'])
    if discretizers is None:
        discretizers = {}
        for feature in continuous_features.columns:
            context.logger.info(f'Fitting discretizer for {feature}')
            # Need to train a new discretizer
            discretizer = KBinsDiscretizer(n_bins=n_bins,
                                           encode='ordinal',
                                           strategy='uniform')

            discretizer.fit(continuous_features.loc[:, feature].values.reshape(-1, 1))
            discretizers[feature] = discretizer
    os.makedirs(context.artifact_path, exist_ok=True)
    discretizers_path = os.path.abspath(f'{context.artifact_path}/discritizer.pkl')
    with open(discretizers_path, 'wb') as f:
        pickle.dump(discretizers, f)
    context.log_artifact('discritizers', target_path=discretizers_path)
    context.logger.info('Discretizing featuers')
    for feature, discretizer in discretizers.items():
        df_t[feature] = discretizer.transform(df_t.loc[:, feature].values.reshape(-1, 1))
        df_u[feature] = discretizer.transform(df_u.loc[:, feature].values.reshape(-1, 1))
        df_t[feature] = df_t[feature].astype('int')
        df_u[feature] = df_u[feature].astype('int')
    context.log_dataset('t_discrete', df_t, format='parquet')
    context.log_dataset('u_discrete', df_u, format='parquet')
    
    # Estimate probabilities 
    # P(X), P(y), P(X|y), P(y|X) for t and u
    
    context.logger.info('Compute prior metrics')
    
    results = {}
    t_prior, u_prior = to_observations(context, df_t.drop(drop_columns, axis=1), 
                                       df_u.drop(drop_columns, axis=1), 'features')
    results['prior_tvd'], results['prior_helinger'], results['prior_kld'] = all_metrics(t_prior, u_prior)
    
    if prediction_col is not None:
        context.logger.info('Compute prediction metrics')
        t_predictions = pd.DataFrame(df_t.loc[:, prediction_col])
        u_predictions = pd.DataFrame(df_u.loc[:, prediction_col])
        t_class, u_class = to_observations(context, t_predictions,
                                           u_predictions, 'prediction')
        results['prediction_shift_tvd'], results['prediction_shift_helinger'], results['prediction_shift_kld'] = all_metrics(t_class, u_class)
        
    if label_col is not None:
        context.logger.info('Compute class metrics')
        t_labels = pd.DataFrame(df_t.loc[:, label_col])
        u_labels = pd.DataFrame(df_u.loc[:, label_col])
        t_class, u_class = to_observations(context, t_labels,
                                           u_labels, 'class')
        results['class_shift_tvd'], results['class_shift_helinger'], results['class_shift_kld'] = all_metrics(t_class, u_class)
    
    for key, value in results.items():
        if value == float('inf'):
            context.logger.info(f'value: {value}')
            results[key]=10
    # Log results
    for key, result in results.items():
        context.log_result(key, round(result, 3))
        
    # Push results to TSDB
    now = pd.to_datetime(str(datetime.datetime.now()))
    now
    
    results['timestamp'] = pd.to_datetime(str((datetime.datetime.now())))
    context.logger.info(f"Timestamp: {results['timestamp']}")
    results['stream'] = stream_name
    results_df = pd.DataFrame(data=[list(results.values())],
                              columns=list(results.keys()))
    results_df = results_df.set_index(['timestamp', 'stream'])
    v3io_client.write('tsdb', results_tsdb_table, dfs=results_df)
#     context.log_dataset('results', results_df, format='pq')

In [46]:
# nuclio: end-code

# Test

In [11]:
import random

## Wine dataset

In [30]:
from sklearn.datasets import load_wine

In [31]:
from mlrun import NewTask

In [32]:
wine = load_wine()

In [41]:
df_wine = pd.DataFrame(data=wine['data'],
                       columns=wine['feature_names'])
df_wine['y'] = wine['target']
df_wine['prediction'] = wine['target']
df_wine.to_parquet('data/wine_t.pq')
df_wine.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,y,prediction
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0,0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0,0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0,0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,0,0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0,0


In [42]:
u = df_wine.sample(frac=0.5).copy()
# change_feature = [random.choice(wine['feature_names']), random.choice(wine['feature_names'])]
# u[change_feature] = 1
u.to_parquet('data/wine_u.pq')

In [6]:
# create job function object from notebook code
fn = code_to_function("virtual_drift", 
                      kind='job', 
                      image='mlrun/ml-models')

# add metadata (for templates and reuse)
fn.spec.default_handler = "drift_magnitude"
fn.spec.description = "Compute drift magnitude between Time-Samples T and U"
fn.metadata.categories = ["ml", "serve", "concept-drift"]
fn.metadata.labels = {"author": "orz"}
fn.export("function.yaml")

[mlrun] 2020-07-14 13:58:50,586 function spec saved to path: function.yaml


<mlrun.runtimes.kubejob.KubejobRuntime at 0x7f23c8ab3f28>

In [None]:
fn.deploy()

In [45]:
fn.apply(mount_v3io())

<mlrun.runtimes.kubejob.KubejobRuntime at 0x7f9b09fb0e80>

In [46]:
task = NewTask(name='drift_magnitude',
               handler='drift_magnitude',
               params={'label_col': 'y',
                       'results_tsdb_container': 'bigdata',
                       'results_tsdb_table': 'drift_magnitude'},
               inputs={'t': '/User/functions/virtual_drift/data/wine_t.pq',
                       'u': '/User/functions/virtual_drift/data/wine_u.pq'},
               artifact_path=os.path.abspath('/User/functions/virtual_drift/artifacts'))

In [37]:
fn.with_code().run(task)

[mlrun] 2020-06-02 12:56:25,352 starting run drift_magnitude uid=a20c78ddc72e45119ac4684bc4b32876  -> http://10.192.65.32:8080
[mlrun] 2020-06-02 12:56:26,121 Job is running in the background, pod: drift-magnitude-xqb5r
[mlrun] 2020-06-02 12:56:44,171 starting local run: main.py # drift_magnitude
[mlrun] 2020-06-02 12:56:48,652 Fitting discretizer for alcohol
[mlrun] 2020-06-02 12:56:48,655 Fitting discretizer for malic_acid
[mlrun] 2020-06-02 12:56:48,657 Fitting discretizer for ash
[mlrun] 2020-06-02 12:56:48,658 Fitting discretizer for alcalinity_of_ash
[mlrun] 2020-06-02 12:56:48,660 Fitting discretizer for magnesium
[mlrun] 2020-06-02 12:56:48,662 Fitting discretizer for total_phenols
[mlrun] 2020-06-02 12:56:48,663 Fitting discretizer for flavanoids
[mlrun] 2020-06-02 12:56:48,664 Fitting discretizer for nonflavanoid_phenols
[mlrun] 2020-06-02 12:56:48,666 Fitting discretizer for proanthocyanins
[mlrun] 2020-06-02 12:56:48,668 Fitting discretizer for color_intensity
[mlrun] 2020-

project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...c4b32876,0,Jun 02 12:56:48,completed,drift_magnitude,v3io_user=adminkind=jobowner=adminhost=drift-magnitude-xqb5r,tu,label_col=yresults_tsdb_container=bigdataresults_tsdb_table=drift_magnitude,prior_tvd=0.5prior_helinger=0.541prior_kld=10class_shift_tvd=0.028class_shift_helinger=0.02class_shift_kld=0.003,discritizerst_discreteu_discretefeatures_t_pdffeatures_u_pdfclass_t_pdfclass_u_pdf


to track results use .show() or .logs() or in CLI: 
!mlrun get run a20c78ddc72e45119ac4684bc4b32876  , !mlrun logs a20c78ddc72e45119ac4684bc4b32876 
[mlrun] 2020-06-02 12:56:57,590 run executed, status=completed


<mlrun.model.RunObject at 0x7f9b0d967128>