# Network Operations
## Pre-Processing

In [1]:
# nuclio: ignore
import nuclio

Define the MLRun environment

In [2]:
# Function settings for the nuclio notebook extension: the function kind is
# set to "job" and its container image to "mlrun/ml-models".
%nuclio config kind = "job"
%nuclio config spec.image = "mlrun/ml-models"

%nuclio: setting kind to 'job'
%nuclio: setting spec.image to 'mlrun/ml-models'


## Function

In [4]:
import os
import pandas as pd
from mlrun.datastore import DataItem

from typing import Union

In [None]:
def _rolling_features(df, columns, aggs, window, center, suffix):
    """Roll {aggs} over {columns} of {df} and return flat, suffixed feature columns."""
    # A bare name (e.g. 'mean' or 'cpu') would make pandas return single-level
    # columns, which the flattening below would then join character by character.
    if isinstance(columns, str):
        columns = [columns]
    if isinstance(aggs, str):
        aggs = [aggs]

    features = df.loc[:, columns].rolling(window=window, center=center).aggregate(aggs)

    # Flatten the (column, aggregation) pairs into '<column>_<aggregation>'
    features.columns = ['_'.join(col).strip() for col in features.columns.values]

    # Add suffix
    if suffix:
        features.columns = [f'{name}_{suffix}' for name in features.columns]

    return features


def aggregate(context,
              df_artifact: Union[DataItem, pd.core.frame.DataFrame],
              save_to: str = 'aggregated-df.pq',
              keys: list = None,
              metrics: list = None,
              labels: list = None,
              metric_aggs: list = ['mean'],
              label_aggs: list = ['max'],
              suffix: str = '',
              window: int = 3,
              center: bool = False,
              inplace: bool = False,
              drop_na: bool = True,
              files_to_select: int = 1):
    """Time-series aggregation function

    Will perform a rolling aggregation on {df_artifact}, over {window} by the selected {keys}
    applying {metric_aggs} on {metrics} and {label_aggs} on {labels}. adding {suffix} to the
    feature names.

    if not {inplace}, will return the original {df_artifact}, joined by the aggregated result.

    :param context:     MLRun context, used for logging, for reading the selected files and
                        for logging the result dataset.
    :param df_artifact: MLRun input pointing to pandas dataframe (csv/parquet file path) or a
                        directory containing parquet files, or an in-memory pandas dataframe.
                        * When given a directory the latest {files_to_select} will be selected
    :param save_to:     Where to save the result dataframe.
                        * If relative will add to the {artifact_path}
    :param keys:        Subset of indexes from the source dataframe to aggregate by (default=all)
    :param metrics:     Array containing a list of metrics to run the aggregations on. (default=None)
    :param labels:      Array containing a list of labels to run the aggregations on. (default=None)
    :param metric_aggs: Array containing a list of aggregation function names to run on {metrics}.
                        (Ex: 'mean', 'std') (default=['mean'])
    :param label_aggs:  Array containing a list of aggregation function names to run on {labels}.
                        (Ex: 'max', 'min') (default=['max'])
    :param suffix:      Suffix to add to the feature name, E.g: <Feature_Name>_<Agg_Function>_<Suffix>
                        (Ex: 'last_60_minutes') (default='')
    :param window:      Window size to perform the rolling aggregate on. (default=3)
    :param center:      If True, Sets the value for the central sample in the window,
                        If False, will set the value to the last sample. (default=False)
    :param inplace:     If True, will return only the aggregated results.
                        If False, will join the aggregated results with the original dataframe
    :param drop_na:     Will drop na lines due to the Rolling.
    :param files_to_select: Specifies the number of *latest* files to select (and concat) for aggregation.

    :returns: The result dataframe when {df_artifact} is a dataframe. Otherwise the result is
              logged as the 'aggregate' dataset and nothing is returned.
    :raises ValueError: When neither {metrics} nor {labels} is given, or when a directory
                        input yields no parquet files to aggregate.
    """

    # Verify there is work to be done before any data is loaded
    if not (metrics or labels):
        raise ValueError('please specify metrics or labels param')

    # isinstance (rather than an exact type match) also accepts DataFrame subclasses
    from_model = isinstance(df_artifact, pd.DataFrame)
    if from_model:
        context.logger.info('Aggregating from Buffer')
        input_df = df_artifact
    elif df_artifact.url.endswith('/'):   # is a directory?
        mpath = [os.path.join(df_artifact.url, file)
                 for file in df_artifact.listdir()
                 if file.endswith(('parquet', 'pq'))]
        files_by_updated = sorted(mpath, key=os.path.getmtime, reverse=True)
        context.logger.info(files_by_updated)
        latest = files_by_updated[:files_to_select]
        if not latest:
            # pd.concat would fail on an empty list with a far less helpful message
            raise ValueError(f'no parquet files to aggregate in {df_artifact.url}')
        context.logger.info(f'Aggregating {latest}')
        input_df = pd.concat([context.get_dataitem(path).as_df() for path in latest])
    else:  # A regular artifact
        context.logger.info(f'Aggregating {df_artifact.url}')
        input_df = df_artifact.as_df()

    # Select the correct indexes: index levels not listed in {keys} become regular columns
    # NOTE(review): the rolling window below runs over the rows as stored and is not
    # grouped per key - confirm this matches the intended "aggregate by keys" behaviour.
    if keys:
        indexes_to_drop = [col for col in input_df.index.names if col not in keys]
        df = input_df.reset_index(level=indexes_to_drop)
    else:
        df = input_df

    # Start from the original dataframe unless only the aggregated features are wanted
    final_df = None if inplace else input_df

    # Metrics first, then labels, so the feature columns keep a stable order
    for columns, aggs in ((metrics, metric_aggs), (labels, label_aggs)):
        if not columns:
            continue
        features = _rolling_features(df, columns, aggs, window, center, suffix)
        if final_df is None:
            final_df = features
        else:
            final_df = pd.merge(final_df, features, suffixes=('', suffix),
                                left_index=True, right_index=True)

    if drop_na:
        final_df = final_df.dropna()

    # In-memory callers get the dataframe back, nothing is logged for them
    if from_model:
        return final_df

    # Save the result dataframe
    context.logger.info('Logging artifact')
    context.log_dataset(key='aggregate',
                        df=final_df,
                        format='parquet',
                        local_path=save_to)

In [15]:
# nuclio: end-code

## Test
> This test uses the metrics data created by the [Generator function](https://github.com/mlrun/demo-network-operations/blob/master/notebooks/generator.ipynb) from MLRun's [Network Operations Demo](https://github.com/mlrun/demo-network-operations).  
To test it yourself, please generate this dataset or use any of your available csv/parquet datasets.

In [1]:
from mlrun import code_to_function, mount_v3io, NewTask, mlconf, run_local
# Keep an already configured MLRun DB path, otherwise fall back to 'http://mlrun-api:8080'
mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'

In [83]:
# Path of the metrics file that the test task receives as its `df_artifact` input
metrics_path = '/User/demo-network-operations/data/metrics.pq'

### Local Test
Define the aggregate test task

In [87]:
# Keyword arguments the run hands over to aggregate()
aggregate_params = {
    'metrics': ['cpu_utilization'],
    'labels': ['is_error'],
    'metric_aggs': ['mean', 'sum'],
    'label_aggs': ['max'],
    'suffix': 'daily',
    'inplace': False,
    'window': 5,
    'center': True,
    'save_to': 'aggregate.pq',
    'files_to_select': 2,
}

# Test task: the metrics file is the input and aggregate() is the handler
aggregate_task = NewTask(
    name='aggregate',
    project='network-operations',
    params=aggregate_params,
    inputs={'df_artifact': metrics_path},
    handler=aggregate,
)

In [90]:
# Run the task with run_local and keep the returned run object for inspecting its artifacts
aggregate_run = run_local(aggregate_task)

[mlrun] 2020-07-01 12:39:48,002 artifact path is not defined or is local, artifacts will not be visible in the UI
[mlrun] 2020-07-01 12:39:48,010 starting run aggregate uid=569432f3f2314d6b8c746aa05b31fcc0  -> http://mlrun-api:8080
[mlrun] 2020-07-01 12:39:48,070 Aggregating /User/demo-network-operations/data/metrics.pq
[mlrun] 2020-07-01 12:39:48,104 Logging artifact



project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
network-operations,...5b31fcc0,0,Jul 01 12:39:48,completed,aggregate,v3io_user=adminkind=handlerowner=adminhost=jupyter-558bf7fbc8-kt6x9,df_artifact,"metrics=['cpu_utilization']labels=['is_error']metric_aggs=['mean', 'sum']label_aggs=['max']suffix=dailyinplace=Falsewindow=5center=Truesave_to=aggregate.pqfiles_to_select=2",,aggregate


to track results use .show() or .logs() or in CLI: 
!mlrun get run 569432f3f2314d6b8c746aa05b31fcc0 --project network-operations , !mlrun logs 569432f3f2314d6b8c746aa05b31fcc0 --project network-operations
[mlrun] 2020-07-01 12:39:48,244 run executed, status=completed


### Test on cluster

Convert the code to an MLRun function

In [92]:
# Convert the notebook code into an MLRun function whose handler is `aggregate`,
# writing the extracted code to function.py
fn = code_to_function('aggregate', handler='aggregate', code_output='function.py')
# Metadata stored with the exported function spec
fn.spec.description = "Rolling aggregation over Metrics and Labels according to specifications"
fn.metadata.categories = ["data-prep"]
fn.metadata.labels = {'author': 'orz'}
fn.export('function.yaml')

[mlrun] 2020-07-01 12:41:22,171 function spec saved to path: function.yaml


<mlrun.runtimes.kubejob.KubejobRuntime at 0x7f9082849fd0>

In [None]:
# Apply the mount_v3io() modifier to the function and run the same task,
# using the absolute path of the current directory as the artifact path
aggregate_run = fn.apply(mount_v3io()).run(aggregate_task, artifact_path=os.path.abspath('./'))

### Show results

In [91]:
# Read back the dataset the run logged under the 'aggregate' key, from its target path
pd.read_parquet(aggregate_run.artifact('aggregate')['target_path'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,cpu_utilization,latency,packet_loss,throughput,is_error,cpu_utilization_mean_daily,cpu_utilization_sum_daily,is_error_max_daily
timestamp,company,data_center,device,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2020-06-30 06:42:12.454,Wilson_LLC,Obrien_Mountain,0966571261270,80.900460,3.696670,0.000000,236.890025,False,78.730612,393.653058,0.0
2020-06-30 06:42:12.454,Wilson_LLC,Obrien_Mountain,8069812479542,82.345824,7.308960,2.567228,228.559851,False,75.966899,379.834497,0.0
2020-06-30 06:42:12.454,Bennett__Delacruz_and_Walls,Natasha_Harbors,5863502247054,85.551250,0.000000,0.000000,220.312402,False,79.098499,395.492493,0.0
2020-06-30 06:42:12.454,Bennett__Delacruz_and_Walls,Natasha_Harbors,4285071567351,68.295606,7.528826,0.000000,262.686165,False,75.348922,376.744612,0.0
2020-06-30 06:42:12.454,Bennett__Delacruz_and_Walls,Dominique_Branch,4579248894449,78.399353,0.000000,0.000000,231.813351,False,73.791123,368.955616,0.0
...,...,...,...,...,...,...,...,...,...,...,...
2020-06-30 07:42:12.454,Wilson_LLC,Zachary_Drives,6001003522699,100.000000,100.000000,50.000000,0.000000,True,76.789171,383.945855,1.0
2020-06-30 07:42:12.454,Wilson_LLC,Obrien_Mountain,0966571261270,62.157867,1.911734,1.253481,248.963892,False,84.346471,421.732354,1.0
2020-06-30 07:42:12.454,Wilson_LLC,Obrien_Mountain,8069812479542,59.574487,0.000000,0.000000,269.816306,False,84.346471,421.732354,1.0
2020-06-30 07:42:12.454,Bennett__Delacruz_and_Walls,Natasha_Harbors,5863502247054,100.000000,100.000000,50.000000,0.000000,True,78.157074,390.785368,1.0
