# Network Operations
## Pre-Processing

In [1]:
# nuclio: ignore
import nuclio

Define the MLRun environment

In [2]:
%nuclio config kind = "job"
%nuclio config spec.image = "mlrun/ml-models"

%nuclio: setting kind to 'job'
%nuclio: setting spec.image to 'mlrun/ml-models'


## Function

In [3]:
import os
import pandas as pd
from mlrun.datastore import DataItem

In [4]:
def _rolling_features(df, columns, aggs, window, center, suffix):
    """Roll {aggs} over {columns} of {df} and return the result with flat column names.

    Column names are built as <column>_<agg>, with _<suffix> appended when {suffix} is set.
    """
    rolled = df.loc[:, columns].rolling(window=window, center=center).aggregate(aggs)

    # aggregate() with a list of functions yields (column, agg) pairs - flatten them
    rolled.columns = ['_'.join(col).strip() for col in rolled.columns.values]

    if suffix:
        rolled.columns = [f'{name}_{suffix}' for name in rolled.columns]
    return rolled


def aggregate(context,
              df_artifact: DataItem,
              save_to: str = 'aggregated-df.pq',
              keys: list = None,
              metrics: list = None,
              labels: list = None,
              metric_aggs: list = ['mean'],
              label_aggs: list = ['max'],
              suffix: str = '',
              window: int = 3,
              center: bool = False,
              inplace: bool = False):
    """Time-series aggregation function

    Performs a rolling aggregation on {df_artifact} over {window} rows,
    applying {metric_aggs} on {metrics} and {label_aggs} on {labels}, and adding {suffix}
    to the generated feature names.

    The result is not returned; it is logged through the context as the 'aggregate' dataset
    (parquet). If not {inplace}, the logged dataframe is the original {df_artifact} joined
    with the aggregated features, otherwise it holds the aggregated features only.

    :param context:     MLRun context, used for logging and for saving the result dataset
    :param df_artifact: MLRun input pointing to pandas dataframe (csv/parquet file path)
    :param save_to:     Where to save the result dataframe.
                        * If relative will add to the {artifact_path}
    :param keys:        Index levels of the source dataframe to keep as the index; all other
                        index levels are moved to columns before aggregating (default=all)
    :param metrics:     Array containing a list of metrics to run the aggregations on. (default=None)
    :param labels:      Array containing a list of labels to run the aggregations on. (default=None)
    :param metric_aggs: Array containing a list of aggregation function names to run on {metrics}.
                        (Ex: 'mean', 'std') (default='mean')
    :param label_aggs:  Array containing a list of aggregation function names to run on {labels}.
                        (Ex: 'max', 'min') (default='max')
    :param suffix:      Suffix to add to the feature name, E.g: <Feature_Name>_<Agg_Function>_<Suffix>
                        (Ex: 'last_60_minutes') (default='')
    :param window:      Window size to perform the rolling aggregate on. (default=3)
    :param center:      If True, Sets the value for the central sample in the window,
                        If False, will set the value to the last sample. (default=False)
    :param inplace:     If True, will log only the aggregated results.
                        If False, will join the aggregated results with the original dataframe

    :raises ValueError: When neither {metrics} nor {labels} is given.
    """
    # The list defaults above are shared objects; they are only read here, never mutated.
    context.logger.info(f'Aggregating {df_artifact.url}')
    input_df = df_artifact.as_df()

    # Verify there is work to be done
    if not (metrics or labels):
        raise ValueError('please specify metrics or labels param')

    # Select the correct indexes: every index level outside {keys} becomes a regular column
    if keys:
        indexes_to_drop = [col for col in input_df.index.names if col not in keys]
        df = input_df.reset_index(level=indexes_to_drop)
    else:
        df = input_df

    # NOTE(review): the rolling window runs over consecutive rows of the whole frame and is
    # not grouped per key - confirm this is the intended meaning of {keys}.
    # NOTE(review): when {keys} narrows the index, the non-inplace merge below joins frames
    # with different index levels - verify the result for such inputs.

    # Start from the original frame unless only the aggregated features were requested
    final_df = None if inplace else input_df

    # Metrics first, then labels, so the output column order stays metrics -> labels
    for columns, aggs in ((metrics, metric_aggs), (labels, label_aggs)):
        if not columns:
            continue

        features_df = _rolling_features(df, columns, aggs, window, center, suffix)
        if final_df is None:
            final_df = features_df
        else:
            final_df = pd.merge(final_df, features_df, suffixes=('', suffix),
                                left_index=True, right_index=True)

    # Save the result dataframe
    context.log_dataset(key='aggregate',
                        df=final_df,
                        format='parquet',
                        local_path=save_to)

In [5]:
# nuclio: end-code

## Test
> This test uses the metrics data, created by the [Generator function](https://github.com/mlrun/demo-network-operations/blob/master/notebooks/generator.ipynb) from MLRun's [Network Operations Demo](https://github.com/mlrun/demo-network-operations)  
To test it yourself, please generate this dataset or use any of your available csv/parquet datasets.

In [6]:
from mlrun import code_to_function, mount_v3io, NewTask, mlconf, run_local
# Keep an already-configured MLRun DB path; otherwise fall back to 'http://mlrun-api:8080'.
mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'

In [7]:
metrics_path = '/User/v3io/bigdata/netops_metrics_parquet/20200329T133835-20200329T143835.parquet'
# Read through the same path variable that is passed to the task input, so the two cannot drift apart.
metrics = pd.read_parquet(metrics_path)

### Local Test
Define the aggregate test task

In [8]:
# Task spec for the `aggregate` handler: a centered 5-sample window with mean/sum over
# cpu_utilization and max over is_error, joined onto the input frame (inplace=False).
aggregate_task = NewTask(
    name='aggregate',
    project='network-operations',
    params=dict(
        metrics=['cpu_utilization'],
        labels=['is_error'],
        metric_aggs=['mean', 'sum'],
        label_aggs=['max'],
        suffix='daily',
        inplace=False,
        window=5,
        center=True,
        save_to='aggregate.pq',
    ),
    inputs=dict(df_artifact=metrics_path),
    handler=aggregate,
)

In [9]:
# Execute the task with mlrun's run_local; its handler is the `aggregate` function from this notebook.
aggregate_run = run_local(aggregate_task)

[mlrun] 2020-05-04 14:13:43,871 artifact path is not defined or is local, artifacts will not be visible in the UI
[mlrun] 2020-05-04 14:13:43,958 starting run aggregate uid=332bd4f750584bc8a5f08f96e8d048b5  -> http://10.194.95.255:8080
[mlrun] 2020-05-04 14:13:44,119 Aggregating /User/v3io/bigdata/netops_metrics_parquet/20200329T133835-20200329T143835.parquet
[mlrun] 2020-05-04 14:13:44,633 log artifact aggregate at aggregate.pq, size: 281983, db: Y



project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
network-operations,...e8d048b5,0,May 04 14:13:44,completed,aggregate,v3io_user=adminkind=handlerowner=adminhost=jupyter-76c7c6dbb5-nzx9m,df_artifact,"metrics=['cpu_utilization']labels=['is_error']metric_aggs=['mean', 'sum']label_aggs=['max']suffix=dailyinplace=Falsewindow=5center=Truesave_to=aggregate.pq",,aggregate


to track results use .show() or .logs() or in CLI: 
!mlrun get run 332bd4f750584bc8a5f08f96e8d048b5 --project network-operations , !mlrun logs 332bd4f750584bc8a5f08f96e8d048b5 --project network-operations
[mlrun] 2020-05-04 14:13:44,986 run executed, status=completed


### Test on cluster

Convert the code to an MLRun function

In [10]:
# Convert the notebook code into an MLRun function, fill in its metadata and export the spec.
fn = code_to_function('aggregate', handler='aggregate')
fn.spec.description = "Rolling aggregation over Metrics and Labels according to specifications"
fn.metadata.categories = ["data-prep"]
fn.metadata.labels = {'author': 'orz'}
fn.export('function.yaml')

[mlrun] 2020-05-04 14:14:01,389 function spec saved to path: function.yaml


<mlrun.runtimes.kubejob.KubejobRuntime at 0x7f4b2e0a13c8>

In [11]:
# Mount the v3io 'bigdata' remote at /User/v3io/bigdata (presumably so metrics_path resolves
# inside the job - confirm) and run the same task with the exported function.
aggregate_run = fn.apply(mount_v3io(remote='bigdata', mount_path='/User/v3io/bigdata')).run(aggregate_task)

[mlrun] 2020-05-04 14:14:01,425 artifact path is not defined or is local, artifacts will not be visible in the UI
[mlrun] 2020-05-04 14:14:01,459 starting run aggregate uid=85dbdb7a453845ad9530b2d3e076229f  -> http://10.194.95.255:8080
[mlrun] 2020-05-04 14:14:02,141 Job is running in the background, pod: aggregate-tnhbl
[mlrun] 2020-05-04 14:19:06,359 artifact path is not defined or is local, artifacts will not be visible in the UI
[mlrun] 2020-05-04 14:19:06,544 Aggregating /User/v3io/bigdata/netops_metrics_parquet/20200329T133835-20200329T143835.parquet
[mlrun] 2020-05-04 14:19:07,271 log artifact aggregate at aggregate.pq, size: 281973, db: Y

[mlrun] 2020-05-04 14:19:08,625 run executed, status=completed
final state: succeeded


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
network-operations,...e076229f,0,May 04 14:19:06,completed,aggregate,host=aggregate-tnhblkind=jobowner=adminv3io_user=admin,df_artifact,"center=Trueinplace=Falselabel_aggs=['max']labels=['is_error']metric_aggs=['mean', 'sum']metrics=['cpu_utilization']save_to=aggregate.pqsuffix=dailywindow=5",,aggregate


to track results use .show() or .logs() or in CLI: 
!mlrun get run 85dbdb7a453845ad9530b2d3e076229f --project network-operations , !mlrun logs 85dbdb7a453845ad9530b2d3e076229f --project network-operations
[mlrun] 2020-05-04 14:19:15,047 run executed, status=completed


### Show results

In [12]:
# Load the parquet file behind the run's 'aggregate' artifact and display it.
pd.read_parquet(aggregate_run.artifact('aggregate')['target_path'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,cpu_utilization,cpu_utilization_is_error,latency,latency_is_error,packet_loss,packet_loss_is_error,throughput,throughput_is_error,is_error,cpu_utilization_mean_daily,cpu_utilization_sum_daily,is_error_max_daily
timestamp,company,data_center,device,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2020-03-29 13:38:35.724,Barajas_Inc,Spencer_Passage,1455939272629,76.896775,False,5.601567,False,0.000000,False,255.524264,False,False,,,
2020-03-29 13:38:35.724,Barajas_Inc,Spencer_Passage,3191131965591,76.669283,False,5.210492,False,1.262219,False,251.695845,False,False,,,
2020-03-29 13:38:35.724,Barajas_Inc,Robert_Neck,2672113087827,69.210951,False,5.253616,False,0.000000,False,247.094953,False,False,74.511729,372.558646,0.0
2020-03-29 13:38:35.724,Barajas_Inc,Robert_Neck,3843914939669,81.610270,False,5.649512,False,0.000000,False,244.269917,False,False,74.539431,372.697157,0.0
2020-03-29 13:38:35.724,Flores_LLC,Rogers_Pine,3087282286440,68.171366,False,0.000000,False,1.335228,False,245.064308,False,False,71.923404,359.617019,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-03-29 14:38:35.724,Barajas_Inc,Robert_Neck,3843914939669,68.757036,False,0.000000,False,0.301766,False,266.073385,False,False,74.521988,372.609942,1.0
2020-03-29 14:38:35.724,Flores_LLC,Rogers_Pine,3087282286440,66.365714,False,0.000000,False,0.000000,False,256.536286,False,False,74.461327,372.306637,1.0
2020-03-29 14:38:35.724,Flores_LLC,Rogers_Pine,2298772615800,64.069030,False,0.000000,False,0.653190,False,265.026044,False,False,74.461327,372.306637,1.0
2020-03-29 14:38:35.724,Flores_LLC,Hammond_Island,3955652932752,73.114857,False,7.498315,False,0.000000,False,257.854554,False,False,,,
