# Nuclio
## Unified Data batching & Agg function

In [2]:
# nuclio: ignore
import nuclio

## Environment

### Base config

In [13]:
%%nuclio config

# Kafka Trigger
# spec.triggers.hakafka.kind = "kafka"
# spec.triggers.hakafka.url = "1.1.1.1"
# spec.triggers.hakafka.attributes.topic = "haproxy"
# spec.triggers.hakafka.attributes.partitions = [0, 1, 2]
# spec.triggers.hakafka.attributes.sasl.enable = true
# spec.triggers.hakafka.attributes.sasl.user = ""
# spec.triggers.hakafka.attributes.sasl.password = ""

# HTTP Trigger      
spec.triggers.hahttp.kind="http"
spec.triggers.hahttp.maxWorkers=1
spec.triggers.hahttp.attributes.port=31001

# Ingestion verifier (cron trigger, fires every minute)
spec.triggers.verifybatch.kind = "cron"
spec.triggers.verifybatch.attributes.interval = "1m"

# Base image
spec.build.baseImage = "rapidsai/rapidsai:cuda10.0-runtime-centos7"

%nuclio: setting spec.triggers.hahttp.kind to 'http'
%nuclio: setting spec.triggers.hahttp.maxWorkers to 1
%nuclio: setting spec.triggers.hahttp.attributes.port to 31001
%nuclio: setting spec.triggers.verifybatch.kind to 'cron'
%nuclio: setting spec.triggers.verifybatch.attributes.interval to '1m'
%nuclio: setting spec.build.baseImage to 'rapidsai/rapidsai:cuda10.0-runtime-centos7'


### Build commands

In [6]:
%%nuclio cmd
# None Needed at the moment

### Env variables

In [9]:
%nuclio env SINK_PATH=./sink
%nuclio env BATCHING_TIME_IN_SECONDS=60

%nuclio: setting 'SINK_PATH' environment variable
%nuclio: setting 'BATCHING_TIME_IN_SECONDS' environment variable


## Function

In [8]:
import os
import glob
from datetime import datetime, timedelta
import time
import cudf
import itertools
import json

## Helper functions

In [10]:
def add_log_to_batch(context, log):
    """Queue one raw log record on the in-memory batch held by ``context``.

    The record is stored exactly as received. It is not decoded here because
    the whole batch is parsed in one go when it is turned into a dataframe.

    Args:
        context: object exposing a list-like ``batch`` attribute.
        log: the raw record (the event body) to enqueue.
    """
    pending = context.batch
    pending.append(log)

In [14]:
def reset_batch(context):
    """Start a new, empty batch and schedule the next batch deadline.

    The deadline always moves forward by at least one ``batch_interval``.
    If that is still not in the future (the function was idle for more than
    one interval), the missed windows are skipped in one step. Without this,
    every following event would find the deadline already expired and flush a
    one-record batch until the deadline slowly caught up with the clock.

    Args:
        context: object carrying ``batch`` (list), ``batch_end_time``
            (datetime) and ``batch_interval`` (timedelta).
    """
    # Reset log list
    context.batch = list()

    # Advance the deadline by one interval, exactly as a regular rollover does.
    next_end = context.batch_end_time + context.batch_interval

    # Catch up after an idle period while staying on the original schedule
    # grid (deadline = start + k * interval). The interval guard avoids a
    # division by zero for a degenerate zero-length interval.
    now = datetime.now()
    if next_end <= now and context.batch_interval > timedelta(0):
        missed_windows = (now - next_end) // context.batch_interval + 1
        next_end += missed_windows * context.batch_interval

    context.batch_end_time = next_end

In [15]:
def _batch_to_df(context):
    '''
        Parses the batched JSON strings into a single cuDF dataframe.

        The records in ``context.batch`` are joined with newlines and read
        as JSON Lines; the result is returned with a fresh 0..n-1 index.
    '''
    json_lines = '\n'.join(context.batch)
    frame = cudf.read_json(json_lines, lines=True)
    return frame.reset_index(drop=True)

In [21]:
def df_to_parquet(context, df):
    """Write ``df`` as a parquet file inside the sink directory.

    The file is named after the current Unix timestamp
    (``<time.time()>.parquet``) and placed under ``context.sink``.

    Args:
        context: object exposing the ``sink`` directory path.
        df: dataframe-like object providing ``to_parquet(path)``.
    """
    target = os.path.join(context.sink, '{}.parquet'.format(time.time()))
    df.to_parquet(target)

### Main function code

In [17]:
def init_context(context):
    """Initialise the batching state stored on the function ``context``.

    Sets the following attributes on ``context``:
        sink: output directory, from ``SINK_PATH`` (default ``./sink``);
            created if it does not exist.
        batch_interval: ``timedelta`` length of one batch window.
        batch_end_time: ``datetime`` at which the first batch is due.
        batch: empty list that accumulates incoming records.

    The interval is read from ``BATCH_INTERVAL_IN_SECONDS`` and, if that is
    not set, from ``BATCHING_TIME_IN_SECONDS``, defaulting to 60 seconds.
    The second name is the one the notebook's ``%nuclio env`` cell actually
    defines; previously only the first was read, so the configured value was
    silently ignored.

    Raises:
        ValueError: if the configured interval is not an integer.
    """
    sink = os.getenv('SINK_PATH', './sink')
    setattr(context, 'sink', sink)

    # Verify sink is available
    os.makedirs(context.sink, exist_ok=True)

    # Original variable name wins when both are defined, so existing
    # deployments keep their behaviour.
    interval_seconds = os.getenv('BATCH_INTERVAL_IN_SECONDS')
    if interval_seconds is None:
        interval_seconds = os.getenv('BATCHING_TIME_IN_SECONDS', '60')
    batch_interval = timedelta(seconds=int(interval_seconds))
    setattr(context, 'batch_interval', batch_interval)

    batch_end_time = datetime.now() + batch_interval
    setattr(context, 'batch_end_time', batch_end_time)

    batch = list()
    setattr(context, 'batch', batch)

In [22]:
def _aggregate_by_log_ip(df):
    """Aggregate one batch of records into one row per ``log_ip``."""
    # A dict literal keeps only the last value for a repeated key, so listing
    # 'time_backend_response' twice ('max', then 'mean') silently dropped the
    # max. Expose the column under a second name so both statistics are
    # produced; the existing output columns keep their names and order.
    df['time_backend_response_max'] = df['time_backend_response']
    return df.groupby(['log_ip']).agg({'feconn': 'mean',
                                       'beconn': 'mean',
                                       'time_backend_response': 'mean',
                                       'time_queue': 'mean',
                                       'time_duration': 'mean',
                                       'time_request': 'mean',
                                       'time_backend_connect': 'mean',
                                       'time_backend_response_max': 'max'
                                      })


def handler(context, event):
    """Function entry point: batch the incoming record and flush when due.

    The event body is appended to the current batch. Once the batch deadline
    (``context.batch_end_time``) has passed, the batch is parsed into a
    dataframe, aggregated per ``log_ip`` (skipped when the dataframe is
    empty), written to the sink as parquet, and a new batch is started.

    Args:
        context: function context prepared by ``init_context``.
        event: incoming event; its ``body`` is the raw record.

    Returns:
        None.
    """
    add_log_to_batch(context, event.body)

    # Keep accumulating until the current batch window has elapsed.
    if datetime.now() < context.batch_end_time:
        return

    df = _batch_to_df(context)
    if not df.empty:
        df = _aggregate_by_log_ip(df)
    df_to_parquet(context, df)
    reset_batch(context)

## Test

In [23]:
# nuclio: ignore
# Local smoke test: populate the batching state (sink, interval, deadline,
# empty batch) on `context` before invoking the handler below.
init_context(context)

In [62]:
# nuclio: ignore
# Local smoke test: feed one sample haproxy log record (a JSON string) to the handler.
event = nuclio.Event(body='{"log_ip": "15.2.6.9", "syslog_timestamp": "May 28 00:00:09", "program": "haproxy", "pid": 164541, "client_ip": "126.52.74.15", "client_port": 52465, "accept_date": "28/May/2019:00:20:10.891", "frontend_name": "px-http", "backend_name": "px-http", "server_name": "srv2", "time_request": 1, "time_queue": 2, "time_backend_connect": 1, "time_backend_response": 5, "time_duration": 13, "http_status_code": 200, "bytes_read": 4, "captured_request": "-", "captured_response": "-", "termination_state": "----", "actconn": 2, "feconn": 5, "beconn": 1, "srvconn": 1, "retries": 1, "srv_queue": 1, "backend_queue": 3}')
# handler has no return statement, so `out` is None; the effect is the queued
# record and, once the batch deadline has passed, a parquet file in the sink.
out = handler(context, event)
out

## Deploy (If a nuclio cluster is available)

In [None]:
%nuclio deploy -p nvidia -n batch_and_agg -c