# Nuclio - Generator function

## Environment

In [1]:
import nuclio

In [2]:
import os

# Resolve the project folders relative to the notebook's parent directory
base_path = os.path.abspath('../')
data_path, src_path = (os.path.join(base_path, folder) for folder in ('data', 'src'))

# Publish both locations through the process environment
os.environ.update({'data_path': data_path, 'src_path': src_path})

In [3]:
%nuclio config kind = "nuclio"

%nuclio: setting kind to 'nuclio'


### Configurations

### Setups
> Please make sure all the packages in the following `nuclio cmd` cell are installed. You may need to restart the kernel for the `pip install` to take effect.

In [None]:
# This cell contains our list of requirements.
# Each `%nuclio cmd` line is a command to run when building the image;
# here they are all pip installs.
# We can use the `-c` flag to run the command only upon
# deployment and not when running locally within this notebook
# (none of the lines below use it).
# NOTE(review): no versions are pinned, so builds may differ over time.

# Install Util packages
%nuclio cmd python -m pip install pyyaml
%nuclio cmd python -m pip install pyarrow
%nuclio cmd python -m pip install pandas
%nuclio cmd python -m pip install pytimeparse

# Install Igz DB packages
%nuclio cmd python -m pip install v3io_frames --upgrade

# Install Function Specific packages
# NOTE(review): v3io-generator is fetched from the TestPyPI index
# (test.pypi.org) - confirm this is still the intended source.
%nuclio cmd python -m pip install -i https://test.pypi.org/simple/ v3io-generator
%nuclio cmd python -m pip install faker

The following cell sets the `%nuclio env` configuration: the environment variables used locally by the notebook and by the deployed function.

In [5]:
# This cell contains our environment variables to be
# used locally by the notebook or by the deployed function.
# You can specify `-l` for local only configs and `-c` for
# cloud only configs.
# `{src_path}` and `{data_path}` below refer to the notebook variables of the
# same name defined in the Environment cell (presumably expanded by IPython
# before the magic runs - confirm).

# Deployment
# SAVE_DEPLOYMENT / DEPLOYMENT_TABLE are read by init_context and passed on
# to get_or_create_deployment.
%nuclio env SAVE_DEPLOYMENT=1
%nuclio env DEPLOYMENT_TABLE=devices

# Metrics
# YAML file opened by init_context to configure the metrics generator
%nuclio env METRICS_CONFIGURATION_FILEPATH={src_path}/metric_configurations.yaml

# Parquet
# SAVE_TO is used by init_context as the metrics table / output directory;
# SECS_TO_GENERATE is the length of the range generated by each handler call.
%nuclio env SAVE_TO={data_path}
%nuclio env SECS_TO_GENERATE=3600

# Save as
# 0 makes init_context take the Parquet branch instead of the TSDB branch
%nuclio env SAVE_TO_TSDB=0

%nuclio: setting 'SAVE_DEPLOYMENT' environment variable
%nuclio: setting 'DEPLOYMENT_TABLE' environment variable
%nuclio: setting 'METRICS_CONFIGURATION_FILEPATH' environment variable
%nuclio: setting 'SAVE_TO' environment variable
%nuclio: setting 'SECS_TO_GENERATE' environment variable
%nuclio: setting 'SAVE_TO_TSDB' environment variable


## Function

In [6]:
# nuclio: start-code

In [7]:
import os
import time
import yaml
import pandas as pd
import itertools
import datetime

# DB Connection
import v3io_frames as v3f

# Data generator
from v3io_generator import metrics_generator, deployment_generator

### Helper functions

In [8]:
def _create_deployment():
    """Build a new deployment table with the deployment generator.

    Registers three levels - company, data_center and device, each with
    ``number=2`` - and returns the result of ``generate_deployment()``.
    """
    print('creating deployment')

    # Meta-data factory and the faker object it exposes
    generator = deployment_generator.deployment_generator()
    fake = generator.get_faker()

    # Describe the hierarchy, one level per call
    generator.add_level(name='company', number=2, level_type=fake.company)
    generator.add_level('data_center', number=2, level_type=fake.street_name)
    generator.add_level('device', number=2, level_type=fake.msisdn)

    # Materialise the meta-data
    return generator.generate_deployment()

In [9]:
def _is_deployment_exist(path):
    # Checking shared path for the devices table
    return os.path.exists(f'/v3io/bigdata/{path}')

In [10]:
def _get_deployment_from_kv(client, path):
    print(f'Retrieving deployment from {path}')
    context.logger.debug(f'Retrieving deployment from {path}')
    # Read the devices table from our KV store
    deployment_df = client.read(backend='kv', table=path)
    
    # Reset index to column
    deployment_df.index.name = 'device'
    deployment_df = deployment_df.reset_index()
    return deployment_df

In [11]:
def _save_deployment_to_kv(path, df, client=None):
    # Save deployment to our KV store
    client.write(backend='kv', table='netops_devices',dfs=df, index_cols=['device'])

In [12]:
def get_or_create_deployment(path, save_to_cloud=False, client=None):
    """Return the deployment table, loading it from KV when possible.

    With a truthy ``client`` and an existing table at ``path`` the table is
    read from KV. Otherwise a new deployment is created and, when both
    ``client`` and ``save_to_cloud`` are truthy, saved to KV.
    """
    # Fast path: the deployment is already stored
    if client and _is_deployment_exist(path):
        return _get_deployment_from_kv(client, path)

    # Nothing stored (or no client): build a new deployment
    fresh_deployment = _create_deployment()
    if client and save_to_cloud:
        _save_deployment_to_kv(path, fresh_deployment, client)
    return fresh_deployment

In [13]:
def set_indexes(df):
    """Return ``df`` indexed by timestamp, company, data_center and device."""
    index_columns = ['timestamp', 'company', 'data_center', 'device']
    return df.set_index(index_columns)

In [14]:
def save_metrics_to_tsdb(context, metrics: pd.DataFrame):
    """Write ``metrics`` to the TSDB table named by ``context.metrics_table``.

    The write goes through the client stored on ``context.v3f``.
    """
    print('Saving metrics to TSDB')

    tsdb_write = context.v3f.write
    tsdb_write('tsdb', context.metrics_table, metrics)

In [15]:
def save_metrics_to_parquet(context, metrics):
    """Concatenate the generated metric frames and write one Parquet file.

    The file is created inside ``context.metrics_table`` and named
    ``<first>-<last>.parquet``, where the two stamps are the timestamps of
    the first and last index rows formatted as ``%Y%m%dT%H%M%S``.
    """
    print('Saving metrics to Parquet')
    combined = pd.concat(itertools.chain(metrics))

    # Timestamps are converted from ns to ms before writing to parquet
    combined = combined.reset_index()
    combined['timestamp'] = combined['timestamp'].astype('datetime64[ms]')

    # Restore the (timestamp, company, data_center, device) index
    combined = set_indexes(combined)

    # Name the file after the first and last timestamps in the index
    stamp_format = '%Y%m%dT%H%M%S'
    first_row, last_row = combined.index[0], combined.index[-1]
    filename = f'{first_row[0].strftime(stamp_format)}-{last_row[0].strftime(stamp_format)}.parquet'
    print(filename)
    filepath = os.path.join(context.metrics_table, filename)
    print(filepath)
    with open(filepath, 'wb+') as parquet_file:
        combined.to_parquet(parquet_file)

In [16]:
def is_deployment_initialized(context):
    """Tell whether ``context`` already carries a ``metric_generator`` attribute."""
    not_set = object()
    return getattr(context, 'metric_generator', not_set) is not not_set

### Init context

In [17]:
def init_context(context):
    """Prepare the nuclio context: output sink, deployment and metrics generator.

    Configuration is read from environment variables:
        SAVE_TO_TSDB: integer flag; ``1`` (default) writes to TSDB through a
            frames client, any other integer writes Parquet files.
        SAVE_TO: TSDB table name or Parquet output directory
            (default ``netops_metrics``).
        SECS_TO_GENERATE: length in seconds of the range generated per
            handler call (default 10).
        DEPLOYMENT_TABLE: deployment table path (required).
        SAVE_DEPLOYMENT: ``1``/``true``/``yes`` to save a newly created
            deployment (default off).
        METRICS_CONFIGURATION_FILEPATH: YAML metrics configuration file
            (default ``/configurations/metrics_configuration.yaml``).
        initial_timestamp: initial timestamp handed to the generator
            (default: epoch seconds of 24 hours before now).

    Attributes set on ``context``: ``metrics_table``, ``write``,
    ``secs_to_generate``, ``metric_generator`` and ``v3f``.
    """
    # Get saving configuration
    save_to_tsdb = (int(os.getenv('SAVE_TO_TSDB', 1)) == 1)

    # Set metrics table
    metrics_table = os.getenv('SAVE_TO', 'netops_metrics')
    context.metrics_table = metrics_table

    # TSDB Based demo
    if save_to_tsdb:
        context.logger.debug('Saving to TSDB')
        # Create our DB client
        client = v3f.Client(address='framesd:8081', container='bigdata')

        # Create TSDB table if needed
        client.create('tsdb', metrics_table, rate='1/s', if_exists=1)

        # Set saving function
        context.write = save_metrics_to_tsdb

    # Parquet based demo
    else:
        context.logger.debug('Saving to Parquet')
        # No client in this mode; get_or_create_deployment treats a falsy
        # client as "always create, never load or save"
        client = None

        # Create saving directory (exist_ok avoids the check-then-create race)
        os.makedirs(metrics_table, exist_ok=True)

        # Set saving function
        context.write = save_metrics_to_parquet

    # Set batch length; stored as int so consumers need not re-parse the env string
    context.secs_to_generate = int(os.getenv('SECS_TO_GENERATE', 10))

    # Environment values are strings, so '0' would be truthy if passed as-is;
    # parse the flag explicitly.
    save_deployment = os.getenv('SAVE_DEPLOYMENT', '0').strip().lower() in ('1', 'true', 'yes')

    # Generate or create deployment
    deployment_df = get_or_create_deployment(os.environ['DEPLOYMENT_TABLE'], save_deployment, client)

    # Add one column per metric with a fixed starting value
    deployment_df['cpu_utilization'] = 70
    deployment_df['latency'] = 0
    deployment_df['packet_loss'] = 0
    deployment_df['throughput'] = 290

    # Get metrics configuration
    # Move to get-object from store:///
    config_path = os.getenv('METRICS_CONFIGURATION_FILEPATH', '/configurations/metrics_configuration.yaml')
    with open(config_path, 'r') as f:
        # safe_load: calling yaml.load without a Loader is deprecated and unsafe
        metrics_configuration = yaml.safe_load(f)

    # Create metrics generator
    default_start = (datetime.datetime.now() - datetime.timedelta(days=1)).timestamp()
    initial_timestamp = int(os.getenv('initial_timestamp', default_start))
    met_gen = metrics_generator.Generator_df(metrics_configuration,
                                             user_hierarchy=deployment_df,
                                             initial_timestamp=initial_timestamp)
    context.metric_generator = met_gen

    # Set client
    context.v3f = client


### Handler

In [18]:
def handler(context, event):
    """Generate one range of metrics and pass it to the configured writer.

    Args:
        context: nuclio context prepared by ``init_context``; must provide
            ``metric_generator``, ``secs_to_generate`` and ``write``.
        event: triggering event (not used).

    Returns:
        None. The generated metrics are handed to ``context.write``.
    """
    # Read the clock once so the generated window is exactly
    # secs_to_generate long (two separate now() calls drift apart).
    window_start = datetime.datetime.now()
    window_end = window_start + datetime.timedelta(seconds=int(context.secs_to_generate))

    # Create metrics generator based on YAML configuration and deployment
    metrics = context.metric_generator.generate_range(start_time=window_start,
                                                      end_time=window_end,
                                                      as_df=True,
                                                      as_iterator=True)

    # Save Generated metrics
    context.write(context, metrics)

In [19]:
# nuclio: end-code

### Generate data configuration file

In [20]:
%%writefile {os.environ['METRICS_CONFIGURATION_FILEPATH']}
errors: {length_in_ticks: 50, rate_in_ticks: 150}
timestamps: {interval: 5s, stochastic_interval: true}
metrics:
  cpu_utilization:
    accuracy: 2
    distribution: normal
    distribution_params: {mu: 70, noise: 0, sigma: 10}
    is_threshold_below: true
    past_based_value: false
    produce_max: false
    produce_min: false
    validation:
      distribution: {max: 1, min: -1, validate: false}
      metric: {max: 100, min: 0, validate: true}
  latency:
    accuracy: 2
    distribution: normal
    distribution_params: {mu: 0, noise: 0, sigma: 5}
    is_threshold_below: true
    past_based_value: false
    produce_max: false
    produce_min: false
    validation:
      distribution: {max: 1, min: -1, validate: false}
      metric: {max: 100, min: 0, validate: true}
  packet_loss:
    accuracy: 0
    distribution: normal
    distribution_params: {mu: 0, noise: 0, sigma: 2}
    is_threshold_below: true
    past_based_value: false
    produce_max: false
    produce_min: false
    validation:
      distribution: {max: 1, min: -1, validate: false}
      metric: {max: 50, min: 0, validate: true}
  throughput:
    accuracy: 2
    distribution: normal
    distribution_params: {mu: 250, noise: 0, sigma: 20}
    is_threshold_below: false
    past_based_value: false
    produce_max: false
    produce_min: false
    validation:
      distribution: {max: 1, min: -1, validate: false}
      metric: {max: 300, min: 0, validate: true}

Overwriting /User/mlrun-demos/demos/network-operations/src/metric_configurations.yaml


## Generate dataset locally
Running this step will generate the base data file for the project.  
This will be used as our training dataset later on in the [project notebook](../project.ipynb).

In [21]:
# nuclio: ignore
# Local dry run of the function: initialise the context, then invoke the
# handler once with an empty event.
# NOTE(review): `context` is not defined in any cell of this notebook -
# presumably it is provided by the nuclio notebook extension; confirm.
init_context(context)
event = nuclio.Event(body='')
# handler() has no return statement, so `output` is None and the last line
# displays nothing; the result of the run is what context.write saved.
output = handler(context, event)
output

creating deployment
Saving metrics to Parquet


calling yaml.load() without Loader=... is deprecated, as the default Loader is unsafe. Please read https://msg.pyyaml.org/load for full details.


20210126T124327-20210126T134327.parquet
/User/mlrun-demos/demos/network-operations/data/20210126T124327-20210126T134327.parquet


# Deploy to cluster
(For streaming demo)

In [24]:
from mlrun import code_to_function, mount_v3io

# Build an mlrun function object of kind 'nuclio' named 'nuclio-generator'.
# NOTE(review): no source file is passed, so the code presumably comes from
# this notebook's `# nuclio: start-code` / `# nuclio: end-code` section - confirm.
fn = code_to_function(name='nuclio-generator',
                      kind='nuclio', with_doc=False)
# Base image for the function build
fn.spec.base_spec['spec']['build']['baseImage'] = 'mlrun/ml-models'
# Attach a cron trigger with a one-minute interval
fn.add_trigger('cron', nuclio.triggers.CronTrigger(interval='1m'))

<mlrun.runtimes.function.RemoteRuntime at 0x7f64aaddff90>

In [25]:
# Save the function object, then export its spec to a YAML file next to the
# sources (the logged output reports "function spec saved to path").
fn.save()
fn.export('../src/generator.yaml')

> 2020-12-22 08:40:56,147 [info] function spec saved to path: ../src/generator.yaml


<mlrun.runtimes.function.RemoteRuntime at 0x7f64aaddff90>

In [26]:
# Apply the v3io mount to the function, then deploy it under the
# 'network-operations' project.
# NOTE(review): _is_deployment_exist checks /v3io/bigdata/..., which presumably
# relies on this mount - confirm.
fn.apply(mount_v3io())
fn.deploy(project='network-operations')

> 2020-12-22 08:40:56,153 [info] Starting remote function deploy
2020-12-22 08:40:56  (info) Deploying function
2020-12-22 08:40:56  (info) Building
2020-12-22 08:40:56  (info) Staging files and preparing base images
2020-12-22 08:40:56  (info) Building processor image
2020-12-22 08:41:24  (info) Build complete
2020-12-22 08:41:32  (info) Function deploy complete
> 2020-12-22 08:41:33,659 [info] function deployed, address=default-tenant.app.lewpwntlsyrb.iguazio-cd1.com:30911


'http://default-tenant.app.lewpwntlsyrb.iguazio-cd1.com:30911'