# Pre-Processor

In [1]:
import nuclio

In [42]:
%nuclio config kind = "nuclio"

%nuclio: setting kind to 'nuclio'


In [177]:
%%nuclio env -c

aggregate_fn_url = /User/functions/aggregate/function.yaml
METRICS_TABLE = /User/demo-network-operations/data
FEATURES_TABLE = /User/demo-network-operations/features

keys = timestamp,company,data_center,device
metrics = cpu_utilization,latency,packet_loss,throughput
metric_aggs = mean,max
suffix = daily
window = 3
center = 0
inplace = 1
drop_na = 1

is_save_to_tsdb = 0

%nuclio: setting 'aggregate_fn_url' environment variable
%nuclio: setting 'METRICS_TABLE' environment variable
%nuclio: setting 'FEATURES_TABLE' environment variable
%nuclio: setting 'keys' environment variable
%nuclio: setting 'metrics' environment variable
%nuclio: setting 'metric_aggs' environment variable
%nuclio: setting 'suffix' environment variable
%nuclio: setting 'window' environment variable
%nuclio: setting 'center' environment variable
%nuclio: setting 'inplace' environment variable
%nuclio: setting 'drop_na' environment variable
%nuclio: setting 'is_save_to_tsdb' environment variable


In [174]:
# %%nuclio env -c

# aggregate_fn_url = hub://aggregate
# METRICS_TABLE = network-operations/metrics
# FEATURES_TABLE = network-operations/features

# keys = company,data_center,device
# metrics = timestamp, cpu_utilization,latency,packet_loss,throughput
# metric_aggs = mean,max
# suffix = daily
# window = 3
# center = 0
# inplace = 1
# drop_na = 1

# is_save_to_tsdb = 0

## Function

In [2]:
# nuclio: start-code

In [208]:
import os
import pandas as pd
from mlrun.datastore import DataItem

from typing import Union
from mlrun import mlconf, import_function, mount_v3io, NewTask, function_to_module, get_or_create_ctx

In [82]:
def get_data_tsdb(context):
    """Read the last two hours of metrics from TSDB through ``context.v3f``.

    The query selects the metric columns and ``is_error`` from
    ``context.metrics_table`` and asks for a multi-index result, which is then
    passed through ``format_df_from_tsdb`` before being returned.

    NOTE(review): ``format_df_from_tsdb`` is not defined in the visible part of
    this notebook -- confirm it is available to the deployed function.
    """
    query = f'select cpu_utilization, latency, packet_loss, throughput, is_error from {context.metrics_table}'
    raw_metrics = context.v3f.read(backend='tsdb', query=query,
                                   start='now-2h', end='now', multi_index=True)
    return format_df_from_tsdb(context, raw_metrics)

In [83]:
def get_data_parquet(context):
    """Load the most recently modified file found in ``context.metrics_table``.

    Args:
        context: object whose ``metrics_table`` attribute is the path of the
            directory holding the metrics parquet files.

    Returns:
        The DataFrame read from the entry with the newest modification time.

    Raises:
        ValueError: if the metrics directory has no entries at all.
    """
    metrics_dir = context.metrics_table

    # Get parquet files
    candidates = [os.path.join(metrics_dir, name) for name in os.listdir(metrics_dir)]
    if not candidates:
        # max() on an empty list raises an opaque ValueError; say what is missing
        raise ValueError(f"No metrics files found in '{metrics_dir}'")

    # Get latest filename
    latest = max(candidates, key=os.path.getmtime)

    # Load parquet
    return pd.read_parquet(latest)

In [84]:
def save_to_tsdb(context, features: pd.DataFrame):
    """Write ``features`` to the TSDB table named by ``context.features_table``.

    The write goes through the client stored on ``context.v3f``; nothing is
    returned.
    """
    frames_client = context.v3f
    frames_client.write('tsdb', context.features_table, features)

In [180]:
def save_to_parquet(context, df: pd.DataFrame):
    """Write ``df`` as a parquet file inside ``context.features_table``.

    The file is named ``<first>-<last>.parquet`` where first/last are the
    timestamps of the first and last rows formatted as ``%Y%m%dT%H%M%S``.
    Before writing, the frame is re-indexed by ``context.keys``.

    Args:
        context: object providing ``keys`` (index column names) and
            ``features_table`` (target directory).
        df: features frame; must expose a ``timestamp`` column or index level.

    Returns:
        None. An empty frame is skipped and no file is written.
    """
    print('Saving features to Parquet')

    df = df.reset_index()
    if df.empty:
        # No rows means no first/last timestamp to build the file name from
        print('No features to save, skipping')
        return

    # Need to fix timestamps from ns to ms if we write to parquet
    df['timestamp'] = df.loc[:, 'timestamp'].astype('datetime64[ms]')

    # Take the range from the timestamp column itself instead of index[..][0],
    # so the name does not depend on 'timestamp' being the first of several keys
    first_timestamp = df['timestamp'].iloc[0].strftime('%Y%m%dT%H%M%S')
    last_timestamp = df['timestamp'].iloc[-1].strftime('%Y%m%dT%H%M%S')

    # Fix indexes
    df = df.set_index(context.keys)

    # Save parquet
    filename = first_timestamp + '-' + last_timestamp + '.parquet'
    filepath = os.path.join(context.features_table, filename)
    with open(filepath, 'wb+') as f:
        df.to_parquet(f)

In [159]:
def _split_env_list(name, default=''):
    """Return env var ``name`` split on commas, items stripped and blanks dropped."""
    return [item.strip() for item in os.getenv(name, default).split(',') if item.strip()]


def init_context(context):
    """Populate ``context`` with the settings and I/O callables used by ``handler``.

    Reads these environment variables:
        aggregate_fn_url  -- function to import (default ``hub://aggregate``)
        METRICS_TABLE / FEATURES_TABLE -- source and target locations
        keys, metrics, metric_aggs -- comma-separated lists
        suffix, window, center, inplace, drop_na -- aggregation options
        is_save_to_tsdb  -- ``1`` to read/write TSDB, otherwise parquet files
                            (``save_to_tsdb`` is still honoured as a fallback)

    Sets on ``context``: ``aggregate``, ``metrics_table``, ``features_table``,
    ``keys``, ``metrics``, ``metric_aggs``, ``suffix``, ``window``, ``center``,
    ``inplace``, ``drop_na``, ``read``, ``write`` and, in TSDB mode, ``v3f``.
    """
    mlconf.dbpath = 'http://mlrun-api:8080'

    # Setup aggregate function
    aggregate_fn = import_function(os.getenv('aggregate_fn_url', 'hub://aggregate'))
    mod = function_to_module(aggregate_fn)
    setattr(context, 'aggregate', mod.aggregate)

    # Set vars from env
    setattr(context, 'metrics_table', os.getenv('METRICS_TABLE', 'netops_metrics'))
    setattr(context, 'features_table', os.getenv('FEATURES_TABLE', 'netops_features'))
    setattr(context, 'keys', _split_env_list('keys'))
    setattr(context, 'metrics', _split_env_list('metrics'))
    setattr(context, 'metric_aggs', _split_env_list('metric_aggs'))
    setattr(context, 'suffix', os.getenv('suffix', '_agg'))
    setattr(context, 'window', int(os.getenv('window', '3')))
    setattr(context, 'center', bool(int(os.getenv('center', '0'))))
    setattr(context, 'inplace', bool(int(os.getenv('inplace', '0'))))
    setattr(context, 'drop_na', bool(int(os.getenv('drop_na', '1'))))

    # Save to TSDB
    # The notebook config exports `is_save_to_tsdb`; the previously read name
    # `save_to_tsdb` is kept as a fallback so existing deployments keep working.
    is_save_to_tsdb = bool(int(os.getenv('is_save_to_tsdb', os.getenv('save_to_tsdb', '0'))))
    if is_save_to_tsdb:
        # Imported here because the frames client is only needed in TSDB mode
        import v3io_frames as v3f

        # Create our DB client
        v3io_client = v3f.Client(address='framesd:8081', container='bigdata')
        setattr(context, 'v3f', v3io_client)

        # Create features table if needed
        context.v3f.create('tsdb', context.features_table, attrs={'rate': '1/s'}, if_exists=1)

        # Set TSDB reading function
        setattr(context, 'read', get_data_tsdb)

        # Set TSDB saving function
        setattr(context, 'write', save_to_tsdb)

    # Save to Parquet
    else:
        # Create saving directory if needed
        os.makedirs(context.features_table, exist_ok=True)

        # Set Parquet reading function
        setattr(context, 'read', get_data_parquet)

        # Set Parquet saving function
        setattr(context, 'write', save_to_parquet)

In [171]:
def handler(context, event):
    """Run one pre-processing pass: read metrics, aggregate them, persist the result.

    The reader, writer, aggregate callable and aggregation options are all
    taken from attributes of ``context``. ``event`` is not inspected and
    nothing is returned.
    """
    # Fetch the input frame through the reader stored on the context
    metrics_df = context.read(context)

    # Collect the aggregate call arguments, running it under its own MLRun context
    aggregate_kwargs = dict(
        context=get_or_create_ctx('aggregate'),
        df_artifact=metrics_df,
        save_to=context.features_table,
        keys=context.keys,
        metrics=context.metrics,
        metric_aggs=context.metric_aggs,
        suffix=context.suffix,
        window=context.window,
        center=context.center,
        inplace=context.inplace,
        drop_na=context.drop_na,
    )
    features = context.aggregate(**aggregate_kwargs)

    # Persist through the writer stored on the context
    context.write(context, features)


In [146]:
# nuclio: end-code

## Local test

In [181]:
# Populate the local `context` with the settings and read/write callables used by handler.
# NOTE(review): `context` is not assigned in any visible cell -- presumably provided by
# the nuclio Jupyter extension; confirm before relying on Restart & Run All.
init_context(context)

In [182]:
# Invoke the handler once locally with an empty-bodied event
event = nuclio.Event(body='')
out = handler(context, event)
# handler has no return statement, so `out` is None and this displays nothing
out

[mlrun] 2020-06-14 13:23:35,924 logging run results to: http://mlrun-api:8080
[mlrun] 2020-06-14 13:23:35,945 <class 'pandas.core.frame.DataFrame'>
[mlrun] 2020-06-14 13:23:35,945 Aggregating from Buffer
Saving features to Parquet


## Test

In [187]:
from mlrun import code_to_function, mount_v3io

In [241]:
# Build a Nuclio-kind function object named 'nuclio-preprocessor' in the
# 'network-operations' project
fn = code_to_function('nuclio-preprocessor',
                      kind='nuclio',
                      project='network-operations')
# Pin the base image used when building the function
fn.spec.base_spec['spec']['build']['baseImage'] = 'mlrun/ml-models:0.4.10'
# Apply the v3io mount modifier to the function
fn.apply(mount_v3io())
# Add a cron trigger with a one-minute interval
fn.add_trigger('cron', nuclio.triggers.CronTrigger(interval='1m'))

<mlrun.runtimes.function.RemoteRuntime at 0x7f35a0e9b7b8>

In [242]:
# Save the function object (presumably to the MLRun DB -- confirm)
fn.save()

In [213]:
# Deploy the function under the 'network-operations' project
fn.deploy(project='network-operations')

[mlrun] 2020-06-15 06:29:38,112 deploy started
[nuclio] 2020-06-15 06:29:44,284 (info) Build complete
[nuclio] 2020-06-15 06:29:54,409 (info) Function deploy complete
[nuclio] 2020-06-15 06:29:54,416 done updating network-operations-pre-processor, function address: 192.168.224.209:30358


'http://192.168.224.209:30358'

In [220]:
# Load the selected-features artifact from the artifacts directory
sf = pd.read_parquet('../artifacts/selected_features.parquet')

In [227]:
# Column names of the selected-features frame, as a plain list
selected_features = list(sf.columns)
selected_features

['cpu_utilization',
 'latency',
 'packet_loss',
 'throughput',
 'throughput_sum_daily',
 'throughput_min_daily',
 'is_error']

In [233]:
suffix = '_daily'
# Aggregated columns look like '<base>_<aggregation><suffix>'. Strip the suffix and
# split on the LAST underscore so base names that themselves contain underscores
# (e.g. 'cpu_utilization', 'packet_loss') are kept whole.
aggregated_features = [feature[:-len(suffix)].rsplit('_', 1)
                       for feature in selected_features if feature.endswith(suffix)]
# Ignore names that have no '<base>_<aggregation>' part before the suffix
aggregated_features = [parts for parts in aggregated_features if len(parts) == 2]
base_features = {base for base, agg in aggregated_features}
aggregations = {agg for base, agg in aggregated_features}
base_features, aggregations

({'throughput'}, {'min', 'sum'})