# Nuclio - Generator function

## Environment

In [1]:
# nuclio: ignore
import nuclio

## NOTE: Restart the kernel if any modules were installed

In [30]:
%run set_env.ipynb

mlrun                    0.4.7      
v3io-generator           0.0.27.dev0
pyarrow                  0.16.0     
pandas                   1.0.1      
pytimeparse              1.1.8      
Package                  Version    
------------------------ -----------
absl-py                  0.9.0      
adal                     1.2.2      
aiohttp                  3.6.2      
argo-models              2.2.1a0    
asn1crypto               1.3.0      
astor                    0.8.0      
async-timeout            3.0.1      
attrs                    19.3.0     
backcall                 0.1.0      
beautifulsoup4           4.8.2      
bleach                   3.1.4      
blinker                  1.4        
blosc                    1.7.0      
bokeh                    1.4.0      
boto3                    1.12.44    
botocore                 1.15.44    
cachetools               3.1.1      
certifi                  2020.4.5.1 
cffi                     1.14.0     
chardet                  3.0.4      
C

### Configurations

In [31]:
import os
import datetime

In [32]:
# nuclio: ignore
# NOTE(review): NETAPP_MOUNT_PATH, APP_DIR, metrics_table, IGZ_MOUNT_PATH,
# IGZ_CONTAINER_PATH, NETAPP_SIM and NETAPP_PVC_CLAIM are not defined in this
# notebook - presumably they come from set_env.ipynb; confirm.

# Function-level settings: an "inference" cron trigger with a 10m interval
generator_config = {
    "spec.triggers.inference.kind": "cron",
    "spec.triggers.inference.attributes.interval": "10m",
    "spec.readinessTimeoutSeconds": 60,
    "spec.minReplicas": 1,
}

# Environment variables handed to the deployed function
generator_env = {
    'SAVE_TO': NETAPP_MOUNT_PATH,
    'METRICS_CONFIGURATION_FILEPATH': os.path.join(APP_DIR, "configurations/metricsconfiguration.yaml"),
    'SECS_TO_GENERATE': 30000,
    'metrics_table': metrics_table,
    'NETAPP_MOUNT_PATH': NETAPP_MOUNT_PATH,
    'SAVE_DEPLOYMENT': 1,
    'DEPLOYMENT_TABLE': "netops_devices",
    'SAVE_TO_TSDB': 0,
}

# Build-time commands that install the function's dependencies
build_commands = [
    'pip install pyarrow pandas pytimeparse faker',
    'pip install v3io_frames pyyaml --upgrade',
    'pip install -i https://test.pypi.org/simple/ v3io-generator',
]

spec = (
    nuclio.ConfigSpec(config=generator_config, env=generator_env, cmd=build_commands)
    .with_v3io()
    .add_volume(IGZ_MOUNT_PATH, IGZ_CONTAINER_PATH, kind='v3io', name='v3io')
)

# Mount the NetApp PVC only when NETAPP_SIM is falsy
if not NETAPP_SIM:
    spec.add_volume(NETAPP_MOUNT_PATH, NETAPP_PVC_CLAIM, kind='pvc', name='netapp')

## Function

In [33]:
# import os # Already imported earlier
import time
import yaml
import pandas as pd
import itertools

# DB Connection
import v3io_frames as v3f

# Data generator
from v3io_generator import metrics_generator, deployment_generator

### Helper functions

In [34]:
def _create_deployment():
    """Generate a brand-new deployment hierarchy.

    Registers three levels on a ``deployment_generator`` (company,
    data_center, device), each requested with ``number=2``, and returns the
    result of ``generate_deployment()`` (presumably a pandas DataFrame, since
    callers assign columns on it - confirm against v3io_generator).
    """
    print('creating deployment')

    # Meta-data factory and the faker instance it exposes
    generator = deployment_generator.deployment_generator()
    fake = generator.get_faker()

    # Describe the hierarchy, outermost level first
    generator.add_level(name='company', number=2, level_type=fake.company)
    generator.add_level('data_center', number=2, level_type=fake.street_name)
    generator.add_level('device', number=2, level_type=fake.msisdn)

    # Materialize the meta-data
    return generator.generate_deployment()

In [35]:
def _is_deployment_exist(path):
    # Checking shared path for the devices table
    return os.path.exists(f'/v3io/bigdata/{path}')

In [36]:
def _get_deployment_from_kv(client, path):
    print(f'Retrieving deployment from {path}')
    context.logger.debug(f'Retrieving deployment from {path}')
    # Read the devices table from our KV store
    deployment_df = client.read(backend='kv', table=path)
    
    # Reset index to column
    deployment_df.index.name = 'device'
    deployment_df = deployment_df.reset_index()
    return deployment_df

In [37]:
def _save_deployment_to_kv(path, df, client=v3f.Client('framesd:8081')):
    # Save deployment to our KV store
    client.write(backend='kv', table='netops_devices',dfs=df, index_cols=['device'])

In [38]:
def get_or_create_deployment(path, save_to_cloud=False, client=None):
    """Return the deployment table, loading it when possible.

    When a client is given and the table at `path` already exists, the
    deployment is read back from KV. Otherwise a new deployment is generated
    and, if both `client` and `save_to_cloud` are truthy, stored to KV.
    """
    # Existing deployment: reuse it
    if client and _is_deployment_exist(path):
        return _get_deployment_from_kv(client, path)

    # No usable stored deployment: build one
    fresh_deployment = _create_deployment()
    if client and save_to_cloud:
        _save_deployment_to_kv(path, fresh_deployment, client)

    return fresh_deployment

In [39]:
def set_indexes(df):
    """Return a copy of `df` indexed by timestamp, company, data_center, device."""
    index_columns = ['timestamp', 'company', 'data_center', 'device']
    return df.set_index(index_columns)

In [40]:
def save_metrics_to_tsdb(context, metrics: pd.DataFrame):
    """Write `metrics` to the TSDB table named by ``context.metrics_table``.

    Uses the client stored on ``context.v3f``; returns nothing.
    """
    print('Saving metrics to TSDB',time.time())

    frames_client = context.v3f
    frames_client.write('tsdb', context.metrics_table, metrics)

In [41]:
def save_metrics_to_parquet(context, metrics):
    """Concatenate the generated metric frames and write them as one parquet file.

    The file is created inside ``context.metrics_table`` and is named
    ``<first timestamp>-<last timestamp>.parquet`` using the first and last
    rows' timestamps formatted as ``%Y%m%dT%H%M%S``.
    """
    print('Saving metrics to Parquet',time.time())

    # Merge every generated frame and expose the index levels as columns
    combined = pd.concat(itertools.chain(metrics)).reset_index()

    # Parquet needs millisecond timestamps rather than nanoseconds
    combined['timestamp'] = combined.loc[:, 'timestamp'].astype('datetime64[ms]')

    # Restore the multi-index
    combined = set_indexes(combined)

    # Name the file after the first and last timestamps of the batch
    stamp_format = '%Y%m%dT%H%M%S'
    first_timestamp = combined.index[0][0].strftime(stamp_format)
    last_timestamp = combined.index[-1][0].strftime(stamp_format)
    filename = f'{first_timestamp}-{last_timestamp}.parquet'
    print(filename,time.time())

    filepath = os.path.join(context.metrics_table, filename)
    print(filepath,time.time())

    # Save parquet
    with open(filepath, 'wb+') as parquet_file:
        combined.to_parquet(parquet_file)

In [42]:
def is_deployment_initialized(context):
    """Return True when `context` already carries a ``metric_generator`` attribute."""
    generator_attribute = 'metric_generator'
    return hasattr(context, generator_attribute)

### Init context

In [43]:
def init_context(context):
    """Prepare the function context before the first invocation.

    Reads its configuration from environment variables and stores on
    `context`:

    * ``metrics_table``    - NETAPP_MOUNT_PATH joined with metrics_table
    * ``write``            - save_metrics_to_tsdb or save_metrics_to_parquet,
                             selected by SAVE_TO_TSDB (1 means TSDB)
    * ``secs_to_generate`` - SECS_TO_GENERATE as an int (default 10)
    * ``metric_generator`` - a ``Generator_df`` built from the YAML metrics
                             configuration and the deployment table
    * ``v3f``              - the frames client, or None in parquet mode

    Raises:
        KeyError: if DEPLOYMENT_TABLE or SAVE_DEPLOYMENT is not set.
        ValueError: if a numeric environment variable is not an integer.
    """
    # Get saving configuration
    save_to_tsdb = int(os.getenv('SAVE_TO_TSDB', 1)) == 1

    # Set metrics table
    metrics_table = os.path.join(os.getenv('NETAPP_MOUNT_PATH', 'netops_metrics'),
                                 os.getenv('metrics_table', 'metrics_table'))
    context.metrics_table = metrics_table

    # TSDB based demo
    if save_to_tsdb:
        context.logger.debug('Saving to TSDB')
        # Create our DB client
        client = v3f.Client(address='framesd:8081', container='bigdata')

        # Create TSDB table if needed
        client.create('tsdb', metrics_table, rate='1/s', if_exists=1)

        # Set saving function
        context.write = save_metrics_to_tsdb

    # Parquet based demo
    else:
        context.logger.debug('Saving to Parquet')
        # No client in this mode; downstream helpers treat None as "no KV"
        client = None

        # Create saving directory (no separate exists() check, so there is no
        # window between checking and creating)
        os.makedirs(metrics_table, exist_ok=True)

        # Set saving function
        context.write = save_metrics_to_parquet

    # Length of each generated batch, in seconds. Environment values are
    # strings, so convert once here.
    context.secs_to_generate = int(os.getenv('SECS_TO_GENERATE', 10))

    # SAVE_DEPLOYMENT arrives as a string; '0' is truthy, so parse it the same
    # way as SAVE_TO_TSDB instead of passing the raw value through
    save_deployment = int(os.environ['SAVE_DEPLOYMENT']) == 1

    # Generate or create deployment
    deployment_df = get_or_create_deployment(os.environ['DEPLOYMENT_TABLE'], save_deployment, client)

    # Initial values for each metric column
    deployment_df['cpu_utilization'] = 70
    deployment_df['latency'] = 0
    deployment_df['packet_loss'] = 0
    deployment_df['throughput'] = 290

    # Get metrics configuration. safe_load is used because yaml.load() without
    # an explicit Loader is rejected by current PyYAML and can construct
    # arbitrary objects on older versions.
    config_path = os.getenv('METRICS_CONFIGURATION_FILEPATH', '/configurations/metrics-configuration.yaml')
    with open(config_path, 'r') as f:
        metrics_configuration = yaml.safe_load(f)

    # Create metrics generator; it starts one day in the past unless
    # initial_timestamp is provided
    one_day_ago = (datetime.datetime.now() - datetime.timedelta(days=1)).timestamp()
    initial_timestamp = int(os.getenv('initial_timestamp', one_day_ago))
    met_gen = metrics_generator.Generator_df(metrics_configuration,
                                             user_hierarchy=deployment_df,
                                             initial_timestamp=initial_timestamp)
    context.metric_generator = met_gen

    # Set client
    context.v3f = client


### Handler

In [44]:
def handler(context, event):
    """Generate one batch of metrics and persist it.

    The batch spans from the current time to ``context.secs_to_generate``
    seconds later and is written with ``context.write``, which
    ``init_context`` sets to either the TSDB or the parquet writer.

    Args:
        context: function context prepared by ``init_context``.
        event: triggering event; not used.
    """
    # Take a single clock reading so the window is exactly secs_to_generate
    # long (two separate now() calls drift apart by the time between them)
    batch_start = datetime.datetime.now()
    batch_end = batch_start + datetime.timedelta(seconds=int(context.secs_to_generate))

    # Create metrics based on the YAML configuration and deployment
    metrics = context.metric_generator.generate_range(start_time=batch_start,
                                                      end_time=batch_end,
                                                      as_df=True,
                                                      as_iterator=True)

    # Save generated metrics
    context.write(context, metrics)

## Test

In [45]:
%%time
# nuclio: ignore
## 
#init_context(context)
#event = nuclio.Event(body='')
#output = handler(context, event)
#output

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 5.72 µs


# Deploy

In [46]:
# nuclio: ignore
# Disabled alternative using the magic: %nuclio deploy -p netops -n generator -c
# Deploy this notebook as the 'generator' function in the 'netops' project;
# the returned function address is used by the curl cell below.
addr = nuclio.deploy_file(
    name='generator',
    project='netops',
    spec=spec,
    tag='v1.1',
)


[nuclio] 2020-05-07 16:18:25,216 (info) Build complete
[nuclio] 2020-05-07 16:18:31,284 (info) Function deploy complete
[nuclio] 2020-05-07 16:18:31,289 done updating generator, function address: 3.136.152.229:32028


## Run the next cell 2 or 3 times to generate data. It takes about 3-4 minutes to complete.

In [48]:
!curl -X POST {addr}