# Virtual Drift

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import nuclio

In [4]:
from mlrun import code_to_function, mount_v3io, run_local

In [None]:
%nuclio config kind = "job"
%nuclio config spec.base_image = "mlrun/ml-models"

In [None]:
%%nuclio cmd -c
pip install v3io_frames
pip install seaborn
pip install scikit-learn
pip install scipy

In [None]:
# %nuclio cmd -c python -m pip install v3io_frames

In [5]:
# nuclio: start-code

In [8]:
import os
import pandas as pd
import numpy as np
import scipy as sp
import pickle
import datetime

import v3io_frames as v3f

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import KBinsDiscretizer

In [7]:
def to_observations(context, t, u, key):
    # Create density
    t = t.apply(lambda row: f"{'_'.join([str(row[val]) for val in t.columns])}", axis=1).value_counts().sort_index()
    u = u.apply(lambda row: f"{'_'.join([str(row[val]) for val in u.columns])}", axis=1).value_counts().sort_index()

    # Add 0s if needed
    joined_uniques = pd.DataFrame([t, u]).T.fillna(0).sort_index()
    joined_uniques.columns = ['t', 'u']

    t_obs = joined_uniques.loc[:, 't']
    u_obs = joined_uniques.loc[:, 'u']

    t_pdf = t_obs/t_obs.sum()
    u_pdf = u_obs/u_obs.sum()

    context.log_dataset(f'{key}_t_pdf', pd.DataFrame(t_pdf), format='parquet')
    context.log_dataset(f'{key}_u_pdf', pd.DataFrame(u_pdf), format='parquet')
    return t_pdf, u_pdf

In [15]:
def tvd(t, u):
    return sum(abs(t - u)) / 2

def helinger(t, u):
    return (np.sqrt(np.sum(np.power(np.sqrt(t) - np.sqrt(u), 2))))/np.sqrt(2)

def kl_divergence(t, u):
    t_u = np.sum(np.where(t != 0, t * np.log(t / u), 0))
    u_t = np.sum(np.where(u != 0, u * np.log(u / t), 0))
    return t_u + u_t

def all_metrics(t, u):
    return tvd(t, u), helinger(t, u), kl_divergence(t, u)

In [26]:
def drift_magnitude(context, t: pd.DataFrame, u: pd.DataFrame, 
         label_col=None, prediction_col=None, 
         discretizers: dict = None, n_bins=5,
         stream_name: str = 'some_stream',
         results_tsdb_container: str = 'bigdata',
         results_tsdb_table: str = 'concept_drift/drift_magnitude'):
    
    # Setup v3io connection and TSDB table
    v3io_client = v3f.Client('framesd:8081', container=results_tsdb_container)
    try:
        v3io_client.create('tsdb', results_tsdb_table, if_exists=1, rate='1/s')
    except:
        v3io_client.create('tsdb', results_tsdb_table, if_exists=1, attrs={'rate': '1/s'})
    
    # Get input DFs
    df_t = t.as_df()
    df_u = u.as_df()
    
    # Get feature cols
    
    drop_columns = []
    if label_col is not None:
        drop_columns.append(label_col)
    if prediction_col is not None:
        drop_columns.append(prediction_col)
    
    
    # Discretize continuous featuers
    continuous_features = df_t.select_dtypes(['float'])
    if discretizers is None:
        discretizers = {}
        for feature in continuous_features.columns:
            context.logger.info(f'Fitting discretizer for {feature}')
            # Need to train a new discretizer
            discretizer = KBinsDiscretizer(n_bins=n_bins,
                                           encode='ordinal',
                                           strategy='uniform')

            discretizer.fit(continuous_features.loc[:, feature].values.reshape(-1, 1))
            discretizers[feature] = discretizer
    os.makedirs(context.artifact_path, exist_ok=True)
    discretizers_path = os.path.abspath(f'{context.artifact_path}/discritizer.pkl')
    with open(discretizers_path, 'wb') as f:
        pickle.dump(discretizers, f)
    context.log_artifact('discritizers', target_path=discretizers_path)
    context.logger.info('Discretizing featuers')
    for feature, discretizer in discretizers.items():
        df_t[feature] = discretizer.transform(df_t.loc[:, feature].values.reshape(-1, 1))
        df_u[feature] = discretizer.transform(df_u.loc[:, feature].values.reshape(-1, 1))
        df_t[feature] = df_t[feature].astype('int')
        df_u[feature] = df_u[feature].astype('int')
    context.log_dataset('t_discrete', df_t, format='parquet')
    context.log_dataset('u_discrete', df_u, format='parquet')
    
    # Estimate probabilities 
    # P(X), P(y), P(X|y), P(y|X) for t and u
    
    context.logger.info('Compute prior metrics')
    
    results = {}
    t_prior, u_prior = to_observations(context, df_t.drop(drop_columns, axis=1), 
                                       df_u.drop(drop_columns, axis=1), 'features')
    results['prior_tvd'], results['prior_helinger'], results['prior_kld'] = all_metrics(t_prior, u_prior)
    
#     if prediciton_col is not None:
        
        
    if label_col is not None:
        context.logger.info('Compute class metrics')
        t_labels = pd.DataFrame(df_t.loc[:, label_col])
        u_labels = pd.DataFrame(df_u.loc[:, label_col])
        t_class, u_class = to_observations(context, t_labels,
                                           u_labels, 'class')
        results['class_shift_tvd'], results['class_shift_helinger'], results['class_shift_kld'] = all_metrics(t_class, u_class)
    
    for key, value in results.items():
        if value == float('inf'):
            context.logger.info(f'value: {value}')
            results[key]=10
    # Log results
    for key, result in results.items():
        context.log_result(key, round(result, 3))
        
    # Push results to TSDB
    now = pd.to_datetime(str(datetime.datetime.now()))
    now
    
    results['timestamp'] = pd.to_datetime(str((datetime.datetime.now())))
    context.logger.info(f"Timestamp: {results['timestamp']}")
    results['stream'] = stream_name
    results_df = pd.DataFrame(data=[list(results.values())],
                              columns=list(results.keys()))
    results_df = results_df.set_index(['timestamp', 'stream'])
    v3io_client.write('tsdb', results_tsdb_table, dfs=results_df)
#     context.log_dataset('results', results_df, format='pq')

In [46]:
# nuclio: end-code

# Test

In [11]:
import random

## Winde dataset

In [12]:
from sklearn.datasets import load_wine

In [9]:
from mlrun import NewTask

In [13]:
wine = load_wine()

In [414]:
df_wine = pd.DataFrame(data=wine['data'],
                       columns=wine['feature_names'])
df_wine['y'] = wine['target']
df_wine.to_parquet('data/wine_t.pq')
df_wine.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,y
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0


In [606]:
u = df_wine.sample(frac=0.5).copy()
# change_feature = [random.choice(wine['feature_names']), random.choice(wine['feature_names'])]
# u[change_feature] = 1
u.to_parquet('data/wine_u.pq')

In [10]:
fn = code_to_function().apply(mount_v3io(name='bigdata', mount_path='/bigdata', remote='/bigdata'))
print(fn.to_yaml())

kind: job
metadata:
  name: virtual-drift
  tag: ''
  project: ''
  categories: []
spec:
  command: ''
  args: []
  volumes:
  - flexVolume:
      driver: v3io/fuse
      options:
        accessKey: ad9e9902-4dd2-4104-8873-070dcc858ca6
        container: bigdata
        subPath: ''
    name: bigdata
  volume_mounts:
  - mountPath: /bigdata
    name: bigdata
  env:
  - name: V3IO_API
    value: v3io-webapi.default-tenant.svc:8081
  - name: V3IO_USERNAME
    value: admin
  - name: V3IO_ACCESS_KEY
    value: ad9e9902-4dd2-4104-8873-070dcc858ca6
  default_handler: ''
  entry_points:
    to_observations:
      name: to_observations
      doc: ''
      parameters:
      - name: context
      - name: t
      - name: u
      - name: key
      outputs: []
      lineno: 16
    tvd:
      name: tvd
      doc: ''
      parameters:
      - name: t
      - name: u
      outputs: []
      lineno: 33
    helinger:
      name: helinger
      doc: ''
      parameters:
      - name: t
      - name: u
   

In [13]:
fn.deploy()

[mlrun] 2020-05-13 07:52:52,718 starting remote build, image: .mlrun/func-default-virtual-drift-latest
[36mINFO[0m[0000] Resolved base name mlrun/mlrun:0.4.7 to mlrun/mlrun:0.4.7 
[36mINFO[0m[0000] Resolved base name mlrun/mlrun:0.4.7 to mlrun/mlrun:0.4.7 
[36mINFO[0m[0000] Retrieving image manifest mlrun/mlrun:0.4.7  
[36mINFO[0m[0003] Retrieving image manifest mlrun/mlrun:0.4.7  
[36mINFO[0m[0005] Built cross stage deps: map[]                
[36mINFO[0m[0005] Retrieving image manifest mlrun/mlrun:0.4.7  
[36mINFO[0m[0006] Retrieving image manifest mlrun/mlrun:0.4.7  
[36mINFO[0m[0007] Unpacking rootfs as cmd RUN pip install v3io_frames requires it. 
[36mINFO[0m[0127] Taking snapshot of full filesystem...        
[36mINFO[0m[0130] Resolving paths                              
[36mINFO[0m[0156] RUN pip install v3io_frames                  
[36mINFO[0m[0156] cmd: /bin/sh                                 
[36mINFO[0m[0156] args: [-c pip install v3io_frames]     

True

In [11]:
task = NewTask(name='drift_magnitude',
               handler='drift_magnitude',
               params={'label_col': 'is_error',
                       'results_tsdb_container': 'bigdata',
                       'results_tsdb_table': 'drift_magnitude'},
               inputs={'t': '/bigdata/concept_drift/data/selected_features.parquet',
                       'u': '/bigdata/inference_pq/2020-05-12T11:20:33.pq'},
               artifact_path=os.path.abspath('/bigdata/wine'))

In [None]:
fn.with_code().run(task)