# Pre-Processor

In [2]:
import nuclio

In [3]:
%nuclio config kind = "nuclio"
%nuclio config spec.build.baseImage = "mlrun/ml-models"

%nuclio: setting kind to 'nuclio'


In [None]:
import os

# Resolve the project folders relative to the notebook's parent directory.
# The individual variables are kept because later cells interpolate them.
base_path = os.path.abspath('../')
data_path, features_path, artifacts_path = (
    os.path.join(base_path, folder)
    for folder in ('data', 'features', 'artifacts')
)

# Publish the same locations through the process environment.
os.environ.update(
    data_path=data_path,
    features_path=features_path,
    artifacts_path=artifacts_path,
)

In [1]:
# Define input and output tables
%nuclio env METRICS_TABLE = {data_path}
%nuclio env FEATURES_TABLE = {features_path}

# Base dataset link, so we can deduce the relevant features
%nuclio env base_dataset = {artifacts_path}/selected_features.parquet

# Define number of batches to keep the demo running for (-1 will run forever)
%nuclio env BATCHES_TO_GENERATE = 20

In [None]:
%%nuclio env
keys = timestamp,company,data_center,device
metrics = ["cpu_utilization", "throughput", "packet_loss", "latency"]
metric_aggs = ["mean", "sum", "std", "var", "min", "max", "median"]
suffix = daily
window = 3
center = 0
inplace = 0
drop_na = 1
files_to_select = 1
label_col = is_error
is_save_to_tsdb = 0

## Function

In [6]:
# nuclio: start-code

In [7]:
import os
import pandas as pd
from mlrun.datastore import DataItem
import ast

from typing import Union
from mlrun import mlconf, import_function, mount_v3io, NewTask, function_to_module, get_or_create_ctx
from mlrun.run import get_dataitem

In [8]:
def get_data_tsdb(context):
    """Query the metrics TSDB table for the last two hours of samples.

    Selects the four metric columns plus ``is_error`` from
    ``context.metrics_table`` through the ``context.v3f`` client and passes the
    result through ``format_df_from_tsdb`` before returning it.
    """
    # NOTE(review): format_df_from_tsdb is not defined in the visible cells of
    # this notebook - confirm it is provided before enabling the TSDB path.
    metrics_query = f'select cpu_utilization, latency, packet_loss, throughput, is_error from {context.metrics_table}'
    raw_frame = context.v3f.read(
        backend='tsdb',
        query=metrics_query,
        start='now-2h',
        end='now',
        multi_index=True,
    )
    return format_df_from_tsdb(context, raw_frame)

In [9]:
def get_data_parquet(context):
    """Load the most recently modified Parquet files from the metrics folder.

    Files in ``context.metrics_table`` whose names end with ``parquet`` or
    ``pq`` are ordered newest-first by modification time; the first
    ``context.files_to_select`` of them are read and concatenated into a
    single DataFrame.
    """
    source_dir = context.metrics_table

    # Collect the candidate files, keeping the directory listing order
    candidates = []
    for entry in os.listdir(source_dir):
        if entry.endswith(('parquet', 'pq')):
            candidates.append(os.path.join(source_dir, entry))

    # Newest files first
    files_by_updated = sorted(candidates, key=os.path.getmtime, reverse=True)
    context.logger.info(files_by_updated)

    latest = files_by_updated[:context.files_to_select]
    context.logger.info(f'Aggregating {latest}')

    frames = [pd.read_parquet(path) for path in latest]
    return pd.concat(frames)

In [10]:
def save_to_tsdb(context, features: pd.DataFrame):
    """Write the feature frame to the features table through the TSDB backend.

    Uses the client stored on ``context.v3f`` and the table name stored on
    ``context.features_table``.
    """
    frames_client = context.v3f
    target_table = context.features_table
    frames_client.write('tsdb', target_table, features)

In [11]:
def save_to_parquet(context, df: pd.DataFrame):
    """Write a batch of features to a Parquet file named after its time range.

    The frame's index is reset, the ``timestamp`` column is cast to
    millisecond resolution, and the frame is re-indexed by ``context.keys``.
    The file is created inside ``context.features_table`` and is named
    ``<first timestamp>-<last timestamp>.parquet``.

    An empty batch is skipped: it has no first/last timestamp to build a file
    name from, and previously crashed with an ``IndexError``.

    Args:
        context: object exposing ``keys`` (index column names, which must
            include ``timestamp``) and ``features_table`` (target directory).
        df: features to persist; ``timestamp`` must be an index level or column.

    Returns:
        None.
    """
    print('Saving features to Parquet')

    # Nothing to persist for an empty batch
    if len(df) == 0:
        print('No features to save, skipping Parquet write')
        return

    # Need to fix timestamps from ns to ms if we write to parquet
    df = df.reset_index()
    df['timestamp'] = df.loc[:, 'timestamp'].astype('datetime64[ms]')

    # Fix indexes
    df = df.set_index(context.keys)

    # Look the timestamp level up by name so this works both for a single-key
    # index and when `timestamp` is not the first key.
    timestamps = df.index.get_level_values('timestamp')
    first_timestamp = timestamps[0].strftime('%Y%m%dT%H%M%S')
    last_timestamp = timestamps[-1].strftime('%Y%m%dT%H%M%S')

    # Save parquet
    filename = first_timestamp + '-' + last_timestamp + '.parquet'
    filepath = os.path.join(context.features_table, filename)
    with open(filepath, 'wb+') as f:
        df.to_parquet(f)

In [12]:
def init_context(context):
    """Prepare the function context before events are handled.

    Steps, in order:
      * point mlrun at the API service and load the ``aggregate`` function
        (from ``aggregate_fn_url`` or ``hub://aggregate``) as a module;
      * copy the run configuration from environment variables onto ``context``;
      * read the base dataset named by the ``base_dataset`` env var to work out
        which feature columns the handler must keep;
      * pick the TSDB or Parquet read/write pair and store it on ``context``
        as ``read`` / ``write``.

    Raises:
        KeyError: if the ``base_dataset`` environment variable is not set.
    """

    mlconf.dbpath = 'http://mlrun-api:8080'

    # Setup aggregate function
    aggregate_fn = import_function(os.getenv('aggregate_fn_url', 'hub://aggregate'))
    mod = function_to_module(aggregate_fn)
    setattr(context, 'aggregate', mod.aggregate)

    ag_context = get_or_create_ctx('aggregate')
    setattr(context, 'mlrun_ctx', ag_context)

    # How many batches to create? (-1 will run forever)
    batches_to_generate = int(os.getenv('BATCHES_TO_GENERATE', 20))
    setattr(context, 'batches_to_generate', batches_to_generate)
    setattr(context, 'batches_generated', 0)

    # Set vars from env
    setattr(context, 'metrics_table', os.getenv('METRICS_TABLE', 'netops_metrics'))
    setattr(context, 'features_table', os.getenv('FEATURES_TABLE', 'netops_features'))

    # Drop blanks so an unset `keys` yields [] rather than ['']
    keys = [key.strip() for key in os.getenv('keys', '').split(',') if key.strip()]
    setattr(context, 'keys', keys)

    # `or '[]'` keeps literal_eval from raising on an unset or empty variable
    setattr(context, 'metrics', ast.literal_eval(os.getenv('metrics') or '[]'))
    setattr(context, 'metric_aggs', ast.literal_eval(os.getenv('metric_aggs') or '[]'))
    setattr(context, 'suffix', os.getenv('suffix', '_agg'))
    setattr(context, 'window', int(os.getenv('window', '3')))
    setattr(context, 'center', bool(int(os.getenv('center', '0'))))
    setattr(context, 'inplace', bool(int(os.getenv('inplace', '0'))))
    setattr(context, 'drop_na', bool(int(os.getenv('drop_na', '1'))))
    setattr(context, 'files_to_select', int(os.getenv('files_to_select', 1)))

    # Every column of the base dataset except the label is a feature to keep
    sample_dataset = get_dataitem(os.environ['base_dataset']).as_df()
    label_col = os.getenv('label_col', '')
    selected_features = [col for col in sample_dataset.columns if col != label_col]

    # Split aggregated columns into base metric and aggregation.
    # Assumes columns are named <metric>_<aggregation>_<suffix> - TODO confirm
    # against the aggregate function. Metric names may contain underscores
    # (e.g. cpu_utilization), so the aggregation is taken as the last token
    # before the suffix rather than the second token of the name.
    base_features, aggregations = set(), set()
    for feature in selected_features:
        if not feature.endswith(context.suffix):
            continue
        stem = feature[:len(feature) - len(context.suffix)].rstrip('_')
        base, separator, aggregation = stem.rpartition('_')
        if separator and base:
            base_features.add(base)
            aggregations.add(aggregation)

    setattr(context, 'features', selected_features)
    setattr(context, 'base_features', base_features)
    setattr(context, 'aggregations', aggregations)

    # Save to TSDB
    # The notebook configuration sets `is_save_to_tsdb`; the older
    # `save_to_tsdb` name is still honoured as a fallback.
    tsdb_flag = os.getenv('is_save_to_tsdb', os.getenv('save_to_tsdb', '0'))
    is_save_to_tsdb = bool(int(tsdb_flag))
    if is_save_to_tsdb:
        # Create our DB client
        # NOTE(review): `v3f` is not imported in the visible cells (presumably
        # `import v3io_frames as v3f`) - confirm before enabling TSDB mode.
        v3io_client = v3f.Client(address='framesd:8081', container='bigdata')
        setattr(context, 'v3f', v3io_client)

        # Create features table if needed
        context.v3f.create('tsdb', context.features_table, attrs={'rate': '1/s'}, if_exists=1)

        # Set TSDB reading function
        setattr(context, 'read', get_data_tsdb)

        # Set TSDB saving function
        setattr(context, 'write', save_to_tsdb)

    # Save to Parquet
    else:
        # Create saving directory if needed
        os.makedirs(context.features_table, exist_ok=True)

        # Set Parquet reading function
        setattr(context, 'read', get_data_parquet)

        # Set Parquet saving function
        setattr(context, 'write', save_to_parquet)

In [15]:
def handler(context, event):
    """Process one batch: read raw metrics, aggregate them and save the features.

    The invocation is a no-op once ``context.batches_generated`` has reached
    ``context.batches_to_generate`` (a limit of -1 disables the cap). The
    previous ``<=`` comparison let one batch more than the configured limit
    through.

    Args:
        context: object prepared by ``init_context`` (read/write callables,
            aggregate function and its parameters, batch counters, logger).
        event: triggering event; its content is not used.

    Returns:
        None.
    """
    # Limit the number of generated batches to save cluster resources
    # for people forgetting the demo running
    unlimited = context.batches_to_generate == -1
    if not unlimited and context.batches_generated >= context.batches_to_generate:
        return

    # Get latest raw metrics from the configured backend
    df = context.read(context)

    # Call aggregate
    res = context.aggregate(context=context.mlrun_ctx,
                            df_artifact=df,
                            save_to=context.features_table,
                            keys=context.keys,
                            metrics=context.metrics,
                            metric_aggs=context.metric_aggs,
                            suffix=context.suffix,
                            window=context.window,
                            center=context.center,
                            inplace=context.inplace,
                            drop_na=context.drop_na)

    context.logger.info(f'res.columns: {res.columns}')
    context.logger.info(f'context.columns: {context.features}')

    # Keep only the columns selected from the base dataset
    res = res[context.features]

    # Save
    context.write(context, res)

    # Update batches count
    context.batches_generated += 1

In [16]:
# nuclio: end-code

## Local test

In [223]:
# Run the initialisation locally so `context` carries the settings and the
# read/write callables the handler expects.
# NOTE(review): `context` is not created in the visible cells - presumably it
# is supplied by the nuclio notebook extension; confirm on a fresh kernel.
init_context(context)

[mlrun] 2020-07-08 12:47:04,161 logging run results to: http://mlrun-api:8080


In [None]:
# Invoke the handler once with an empty event body and show what it returned
event = nuclio.Event(body='')
out = handler(context, event)
out

## Test

In [19]:
from mlrun import code_to_function, mount_v3io

In [20]:
# Convert the notebook code into a nuclio function in the network-operations project
fn = code_to_function('nuclio-preprocessor',
                      kind='nuclio',
                      project='network-operations')
# Build on the same base image that the %nuclio config cell selects
fn.spec.base_spec['spec']['build']['baseImage'] = 'mlrun/ml-models'
# Apply the v3io mount to the function
fn.apply(mount_v3io())
# Add a cron trigger with a one-minute interval
fn.add_trigger('cron', nuclio.triggers.CronTrigger(interval='1m'))

<mlrun.runtimes.function.RemoteRuntime at 0x7fb7401972b0>

In [21]:
# Save the function and export its spec to a YAML file under ../src
fn.save()
fn.export('../src/preprocessor.yaml')

> 2020-07-27 06:47:07,600 [debug] saving function: nuclio-preprocessor, tag: 
> 2020-07-27 06:47:07,659 [info] function spec saved to path: ../src/preprocessor.yaml


<mlrun.runtimes.function.RemoteRuntime at 0x7fb7401972b0>

In [202]:
# Deploy the function to the network-operations project
fn.deploy(project='network-operations')

[mlrun] 2020-07-08 12:37:26,608 deploy started
[nuclio] 2020-07-08 12:37:28,726 (info) Build complete
[nuclio] 2020-07-08 12:37:40,864 (info) Function deploy complete
[nuclio] 2020-07-08 12:37:40,870 done updating network-operations-nuclio-preprocessor, function address: 192.168.224.209:31857


'http://192.168.224.209:31857'