# Pre-Processor

In [1]:
import nuclio

In [2]:
import os

# Working directories, resolved relative to the parent of the notebook's folder.
base_path = os.path.abspath('../')
data_path = os.path.join(base_path, 'data')
features_path = os.path.join(base_path, 'features')
artifacts_path = os.path.join(base_path, 'artifacts')

# Sample dataset whose columns init_context() reads to decide which features to keep.
# os.listdir() returns names in arbitrary order, so sort before taking the last
# entry to make the choice deterministic across runs and file systems.
# (To sample the selected-features artifact instead, use
#  artifacts_path + '/selected_features.parquet'.)
base_dataset = data_path + '/' + sorted(os.listdir(data_path))[-1]

# Environment consumed by init_context(); every value must be a string.
os.environ.update({
    # Locations
    'data_path': data_path,
    'features_path': features_path,
    'artifacts_path': artifacts_path,
    'METRICS_TABLE': data_path,
    'FEATURES_TABLE': features_path,
    'base_dataset': base_dataset,
    # Run limits
    'BATCHES_TO_GENERATE': '20',
    'files_to_select': '1',
    # Aggregation settings
    'keys': 'timestamp,company,data_center,device',
    'metrics': '["cpu_utilization", "throughput", "packet_loss", "latency"]',
    'metric_aggs': '["mean", "sum", "std", "var", "min", "max", "median"]',
    'suffix': 'daily',
    'window': '3',
    'center': '0',
    'inplace': '0',
    'drop_na': '1',
    'label_col': 'is_error',
    # Output target: '0' keeps the Parquet path
    'is_save_to_tsdb': '0',
})

## Function

In [3]:
# nuclio: start-code

In [4]:
import os
import pandas as pd
from mlrun.datastore import DataItem
import ast

from typing import Union
from mlrun import mlconf, import_function, mount_v3io, NewTask, function_to_module, get_or_create_ctx
from mlrun.run import get_dataitem

In [5]:
def get_data_tsdb(context):
    """Read the metrics of the last two hours from the TSDB metrics table.

    Uses the frames client stored on ``context.v3f`` to query
    ``context.metrics_table`` and passes the result through
    ``format_df_from_tsdb`` before returning it.

    NOTE(review): ``format_df_from_tsdb`` is not defined in the visible part of
    this notebook - confirm it is provided elsewhere.
    """
    query = f'select cpu_utilization, latency, packet_loss, throughput, is_error from {context.metrics_table}'
    raw_metrics = context.v3f.read(
        backend='tsdb',
        query=query,
        start='now-2h',
        end='now',
        multi_index=True,
    )
    return format_df_from_tsdb(context, raw_metrics)

In [6]:
def get_data_parquet(context):
    """Load the most recently modified Parquet files from the metrics directory.

    Lists ``context.metrics_table``, keeps the entries whose name ends with
    ``parquet`` or ``pq``, orders them newest-first by modification time and
    concatenates the first ``context.files_to_select`` of them.

    Returns:
        A single DataFrame built from the selected files.
    """
    folder = context.metrics_table
    candidates = []
    for name in os.listdir(folder):
        if name.endswith(('parquet', 'pq')):
            candidates.append(os.path.join(folder, name))

    # Newest first, judged by file modification time
    candidates.sort(key=os.path.getmtime, reverse=True)
    context.logger.info(candidates)

    newest = candidates[:context.files_to_select]
    context.logger.info(f'Aggregating {newest}')

    frames = [pd.read_parquet(path) for path in newest]
    return pd.concat(frames)

In [7]:
def save_to_tsdb(context, features: pd.DataFrame):
    """Write the features frame to the TSDB features table.

    Args:
        context: object exposing the frames client as ``v3f`` and the target
            table name as ``features_table``.
        features: the DataFrame to write.
    """
    frames_client = context.v3f
    target_table = context.features_table
    frames_client.write('tsdb', target_table, features)

In [8]:
def save_to_parquet(context, df: pd.DataFrame):
    """Write the features frame to a Parquet file in the features directory.

    The file is named ``<first>-<last>.parquet`` where both parts are the
    ``timestamp`` index level of the first and last rows, formatted as
    ``%Y%m%dT%H%M%S``.

    Args:
        context: object exposing the index columns as ``keys`` and the target
            directory as ``features_table``.
        df: features frame; after ``reset_index`` it must contain a
            ``timestamp`` column plus every column listed in ``context.keys``.
    """
    print('Saving features to Parquet')

    # Timestamps are cast from ns to ms before writing to parquet
    flat = df.reset_index()
    flat['timestamp'] = flat.loc[:, 'timestamp'].astype('datetime64[ms]')

    # Restore the key columns as the index
    indexed = flat.set_index(context.keys)

    # Name the file after the first and last row timestamps (first index level)
    stamp_format = '%Y%m%dT%H%M%S'
    first_key = indexed.index[0]
    last_key = indexed.index[-1]
    filename = '-'.join([
        first_key[0].strftime(stamp_format),
        last_key[0].strftime(stamp_format),
    ]) + '.parquet'

    filepath = os.path.join(context.features_table, filename)
    with open(filepath, 'wb+') as f:
        indexed.to_parquet(f)

In [9]:
def _split_aggregated_features(features, suffix):
    """Parse ``<base>_<aggregation>_<suffix>`` names into (base names, aggregation names)."""
    base_features, aggregations = set(), set()
    for feature in features:
        if not feature.endswith(suffix):
            continue
        # Drop the suffix and the underscore joining it to the rest of the name
        stem = feature[:len(feature) - len(suffix)].rstrip('_')
        # Split on the LAST underscore so multi-word metrics such as
        # 'cpu_utilization' stay intact
        base, separator, aggregation = stem.rpartition('_')
        if separator and base and aggregation:
            base_features.add(base)
            aggregations.add(aggregation)
    return base_features, aggregations


def init_context(context):
    """Populate the function context from environment variables.

    Attributes set on ``context``:
        aggregate, mlrun_ctx: the ``aggregate`` callable loaded from the
            function at ``aggregate_fn_url`` (default ``hub://aggregate``) and
            an MLRun context to call it with.
        batches_to_generate, batches_generated: batch limit (``-1`` means no
            limit) and the running counter, starting at 0.
        metrics_table, features_table, keys, metrics, metric_aggs, suffix,
        window, center, inplace, drop_na, files_to_select: aggregation
            settings read from the environment variables of the same name
            (``METRICS_TABLE`` / ``FEATURES_TABLE`` for the two tables).
        features, base_features, aggregations: column selection derived from
            the dataset at ``base_dataset`` (every column except ``label_col``).
        read, write: the TSDB or Parquet reader/writer pair, chosen by the
            ``is_save_to_tsdb`` environment variable.

    Raises:
        KeyError: if the ``base_dataset`` environment variable is not set.
    """
    mlconf.dbpath = 'http://mlrun-api:8080'

    # Setup aggregate function
    aggregate_fn = import_function(os.getenv('aggregate_fn_url', 'hub://aggregate'))
    context.aggregate = function_to_module(aggregate_fn).aggregate
    context.mlrun_ctx = get_or_create_ctx('aggregate')

    # How many batches to create? (-1 will run forever)
    context.batches_to_generate = int(os.getenv('BATCHES_TO_GENERATE', 20))
    context.batches_generated = 0

    # Set vars from env
    context.metrics_table = os.getenv('METRICS_TABLE', 'netops_metrics')
    context.features_table = os.getenv('FEATURES_TABLE', 'netops_features')
    # Ignore empty entries so an unset/blank variable yields [] rather than ['']
    context.keys = [key.strip() for key in os.getenv('keys', '').split(',') if key.strip()]
    # Default to an empty list literal: literal_eval('') raises SyntaxError
    context.metrics = ast.literal_eval(os.getenv('metrics', '[]'))
    context.metric_aggs = ast.literal_eval(os.getenv('metric_aggs', '[]'))
    context.suffix = os.getenv('suffix', '_agg')
    context.window = int(os.getenv('window', '3'))
    context.center = bool(int(os.getenv('center', '0')))
    context.inplace = bool(int(os.getenv('inplace', '0')))
    context.drop_na = bool(int(os.getenv('drop_na', '1')))
    context.files_to_select = int(os.getenv('files_to_select', 1))

    # Derive the feature selection from a sample dataset's columns
    sample_dataset = get_dataitem(os.environ['base_dataset']).as_df()
    label_col = os.getenv('label_col', '')
    selected_features = [col for col in sample_dataset.columns if col != label_col]
    base_features, aggregations = _split_aggregated_features(selected_features, context.suffix)
    context.features = selected_features
    context.base_features = base_features
    context.aggregations = aggregations

    # The notebook configures 'is_save_to_tsdb'; 'save_to_tsdb' is the name this
    # function originally read, so it is still honoured for compatibility.
    is_save_to_tsdb = any(
        bool(int(os.getenv(name, '0'))) for name in ('is_save_to_tsdb', 'save_to_tsdb')
    )

    # Save to TSDB
    if is_save_to_tsdb:
        # Imported here because only the TSDB path needs the frames client
        # and the module is not imported at the top of the file.
        import v3io_frames as v3f

        # Create our DB client
        context.v3f = v3f.Client(address='framesd:8081', container='bigdata')

        # Create features table if needed
        context.v3f.create('tsdb', context.features_table, attrs={'rate': '1/s'}, if_exists=1)

        # Set TSDB reading and saving functions
        context.read = get_data_tsdb
        context.write = save_to_tsdb

    # Save to Parquet
    else:
        # Create saving directory if needed
        os.makedirs(context.features_table, exist_ok=True)

        # Set Parquet reading and saving functions
        context.read = get_data_parquet
        context.write = save_to_parquet

In [10]:
def handler(context, event):
    """Produce one batch of aggregated features per invocation.

    Reads the latest metrics with ``context.read``, runs ``context.aggregate``
    on them, keeps only the columns listed in ``context.features`` and stores
    the result with ``context.write``.

    At most ``context.batches_to_generate`` batches are produced (to save
    cluster resources when the demo is left running); ``-1`` disables the
    limit. Once the limit is reached the call is a no-op.

    Args:
        context: function context prepared by ``init_context``.
        event: the triggering event; not used.

    Returns:
        None.
    """
    unlimited = context.batches_to_generate == -1
    # Strict comparison: the counter starts at 0, so '<=' would allow one
    # batch more than requested.
    if not unlimited and context.batches_generated >= context.batches_to_generate:
        return

    # Get latest metrics
    df = context.read(context)

    # Call aggregate
    res = context.aggregate(context=context.mlrun_ctx,
                            df_artifact=df,
                            save_to=context.features_table,
                            keys=context.keys,
                            metrics=context.metrics,
                            metric_aggs=context.metric_aggs,
                            suffix=context.suffix,
                            window=context.window,
                            center=context.center,
                            inplace=context.inplace,
                            drop_na=context.drop_na)

    context.logger.info(f'res.columns: {res.columns}')
    context.logger.info(f'context.columns: {context.features}')
    res = res[context.features]

    # Save
    context.write(context, res)

    # Update batches count
    context.batches_generated += 1

In [11]:
# nuclio: end-code

## Local test

In [12]:
# Run the initialisation defined above against the notebook's context object.
# NOTE(review): `context` is not assigned in any visible cell - presumably it is
# provided by the nuclio notebook integration; confirm on a fresh kernel.
init_context(context)

> 2021-10-03 13:54:07,937 [info] logging run results to: http://mlrun-api:8080


In [13]:
# Simulate a single invocation with an empty event body.
event = nuclio.Event(body='')
out = handler(context, event)
# handler() has no return statement, so `out` is None and nothing is displayed;
# the log lines below are the observable result.
out

Python> 2021-10-03 13:54:09,057 [info] ['/User/test/demos/network-operations/data/20211003T135320-20211003T145320.parquet']
Python> 2021-10-03 13:54:09,058 [info] Aggregating ['/User/test/demos/network-operations/data/20211003T135320-20211003T145320.parquet']
> 2021-10-03 13:54:09,079 [info] Aggregating from Buffer
> 2021-10-03 13:54:09,290 [info] Logging artifact
Python> 2021-10-03 13:54:09,292 [info] res.columns: Index(['cpu_utilization', 'cpu_utilization_is_error', 'latency',
       'latency_is_error', 'packet_loss', 'packet_loss_is_error', 'throughput',
       'throughput_is_error', 'is_error', 'cpu_utilization_mean_daily',
       'cpu_utilization_sum_daily', 'cpu_utilization_std_daily',
       'cpu_utilization_var_daily', 'cpu_utilization_min_daily',
       'cpu_utilization_max_daily', 'cpu_utilization_median_daily',
       'throughput_mean_daily', 'throughput_sum_daily', 'throughput_std_daily',
       'throughput_var_daily', 'throughput_min_daily', 'throughput_max_daily',
       

## Deploy and test

In [47]:
from mlrun import code_to_function, mount_v3io
import os 
import nuclio

# Same directory layout as the configuration cell at the top of the notebook,
# recomputed here so this section does not depend on that cell having run.
base_path = os.path.abspath('../')
data_path = os.path.join(base_path, 'data')
features_path = os.path.join(base_path, 'features')
artifacts_path = os.path.join(base_path, 'artifacts')

In [48]:
# Build the nuclio function object for the pre-processor.
fn = code_to_function('nuclio-preprocessor',
                      kind='nuclio',
                      project='network-operations', image='mlrun/ml-models')

# os.listdir() returns names in arbitrary order; sort so the sample dataset
# (last entry by name) is chosen deterministically.
base_dataset = data_path + '/' + sorted(os.listdir(data_path))[-1]

fn.apply(mount_v3io())
# Same settings as the local configuration cell; all values are strings.
fn.set_envs({'data_path' : data_path,
             'features_path' : features_path,
             'artifacts_path' : artifacts_path,
             'METRICS_TABLE' : data_path,
             'FEATURES_TABLE' : features_path,
             'base_dataset' : base_dataset,
             'BATCHES_TO_GENERATE' :'20',
             'keys' : 'timestamp,company,data_center,device',
             'metrics' : '["cpu_utilization", "throughput", "packet_loss", "latency"]',
             'metric_aggs' : '["mean", "sum", "std", "var", "min", "max", "median"]',
             'suffix' : 'daily',
             'window' : '3',
             'center' : '0',
             'inplace' : '0',
             'drop_na' : '1',
             'files_to_select' : '1',
             'label_col' : 'is_error',
             'is_save_to_tsdb' : '0'})
# Invoke the function once a minute via a cron trigger.
fn.add_trigger('cron', nuclio.triggers.CronTrigger(interval='1m'))

<mlrun.runtimes.function.RemoteRuntime at 0x7f39d5f68e50>

In [49]:
# Persist the function object, then write its spec to a YAML file under ../src.
fn.save()
fn.export('../src/preprocessor.yaml')

> 2021-10-03 12:36:35,548 [info] function spec saved to path: ../src/preprocessor.yaml


<mlrun.runtimes.function.RemoteRuntime at 0x7f39d5f68e50>

In [50]:
# Deploy the function into the 'network-operations' project; the cell output is
# the function's invocation URL.
fn.deploy(project='network-operations')

> 2021-10-03 12:36:35,554 [info] Starting remote function deploy
2021-10-03 12:36:35  (info) Deploying function
2021-10-03 12:36:35  (info) Building
2021-10-03 12:36:35  (info) Staging files and preparing base images
2021-10-03 12:36:35  (info) Building processor image
2021-10-03 12:36:37  (info) Build complete
> 2021-10-03 12:36:43,268 [info] successfully deployed function: {'internal_invocation_urls': ['nuclio-network-operations-nuclio-preprocessor.default-tenant.svc.cluster.local:8080'], 'external_invocation_urls': ['default-tenant.app.dev8.lab.iguazeng.com:31499']}


'http://default-tenant.app.dev8.lab.iguazeng.com:31499'

In [51]:
# Smoke-test the deployed function with a request to its root path.
fn.invoke('')

> 2021-10-03 12:36:46,507 [info] invoking function: {'method': 'GET', 'path': 'http://nuclio-network-operations-nuclio-preprocessor.default-tenant.svc.cluster.local:8080/'}


b''