# Nuclio - HA Logs to Parquet

**Objective:** Given HAProxy logs arriving from a Kafka topic or over HTTP, batch them and rewrite them as one Parquet file per minute using Nuclio.

In [2]:
# nuclio : ignore
import nuclio

## Environment

### Base config

In [300]:
%%nuclio config

# Kafka Trigger
# spec.triggers.hakafka.kind = "kafka"
# spec.triggers.hakafka.url = "1.1.1.1"
# spec.triggers.hakafka.attributes.topic = "haproxy"
# spec.triggers.hakafka.attributes.partitions = [0, 1, 2]
# spec.triggers.hakafka.attributes.sasl.enable = true
# spec.triggers.hakafka.attributes.sasl.user = ""
# spec.triggers.hakafka.attributes.sasl.password = ""

# HTTP Trigger      
spec.triggers.hahttp.kind="http"
spec.triggers.hahttp.maxWorkers=1
spec.triggers.hahttp.attributes.port=31001

# Ingestion verifier
spec.triggers.verifybatch.kind = "cron"
spec.triggers.verifybatch.attributes.interval = "1m"

# Base image
spec.build.baseImage = "rapidsai/rapidsai:cuda10.0-runtime-centos7"

%nuclio: setting spec.triggers.hahttp.kind to 'http'
%nuclio: setting spec.triggers.hahttp.maxWorkers to 1
%nuclio: setting spec.triggers.hahttp.attributes.port to 31001
%nuclio: setting spec.triggers.verifybatch.kind to 'cron'
%nuclio: setting spec.triggers.verifybatch.attributes.interval to '1m'
%nuclio: setting spec.build.baseImage to 'rapidsai/rapidsai:cuda10.0-runtime-centos7'


### Installations

In [304]:
%%nuclio cmd
# None Needed at the moment

### Env variables

In [169]:
%nuclio env SINK_PATH=./source
%nuclio env BATCHING_TIME_IN_SECONDS=60

%nuclio: setting 'SINK_PATH' environment variable
%nuclio: setting 'BATCHING_TIME_IN_SECONDS' environment variable


## Function

In [307]:
import os
import glob
from datetime import datetime, timedelta
import time
import cudf
import itertools
import json

### Helper functions

In [127]:
def add_log_to_batch(context, log):
    """Queue one raw log record on the context's in-memory batch.

    The record is stored exactly as received; JSON decoding is left to the
    step that saves the batch.
    """
    pending_logs = context.batch
    pending_logs.append(log)

In [128]:
def reset_batch(context):
    """Start a new, empty batch and move the batch deadline forward.

    ``context.batch`` is replaced by a fresh list and
    ``context.batch_end_time`` is advanced by at least one
    ``context.batch_interval``. If the deadline would still lie in the past
    after a single step (the function was idle, or a flush ran late), the
    missed intervals are skipped as well, so the next deadline is always in
    the future while staying on the same interval grid. Without this, every
    following event would find the deadline expired and flush a tiny batch
    of its own until the deadline caught up.
    """
    # Reset log list
    context.batch = list()

    # Reset batch end time: always advance by one interval ...
    interval = context.batch_interval
    next_end_time = context.batch_end_time + interval

    # ... and skip any whole intervals that have already elapsed.
    # A non-positive interval can never catch up, so it is left alone.
    now = datetime.now()
    if interval > timedelta(0) and next_end_time <= now:
        missed_intervals = (now - next_end_time) // interval + 1
        next_end_time += missed_intervals * interval

    context.batch_end_time = next_end_time

In [266]:
def _batch_to_df(context):
    '''
        Builds one cudf dataframe out of the JSON strings held in context.batch.

        Each log is decoded with json.loads and wrapped in its own
        cudf.DataFrame; the frames are then concatenated and the index is
        renumbered from zero.
    '''
    frames = []
    for raw_log in context.batch:
        parsed_log = json.loads(raw_log)
        frames.append(cudf.DataFrame(parsed_log))

    combined = cudf.concat(frames)
    return combined.reset_index(drop=True)

In [291]:
def save_batch_to_parquet(context):
    """Write the logs currently held in ``context.batch`` to a new parquet file.

    The file is created under ``context.sink`` and is named after the
    current Unix timestamp returned by ``time.time()``.

    An empty batch is skipped: there is nothing to convert, and no file is
    written. This keeps a flush that happens with no collected logs (for
    example on a cron tick) from failing.
    """
    # Nothing collected since the last flush - nothing to write
    if not context.batch:
        return

    # Handle file properties
    filename = f'{time.time()}.parquet'
    filepath = os.path.join(context.sink, filename)

    # Create DF from logs batch
    df = _batch_to_df(context)

    # Save to parquet
    df.to_parquet(filepath)

### Main

In [1]:
def init_context(context):
    """Prepare the function context before any event is handled.

    Sets the following attributes on ``context``:

    * ``sink`` - output directory, taken from ``SINK_PATH``
      (default ``./sink``); it is created if it does not exist yet.
    * ``batch_interval`` - length of one batch window as a ``timedelta``.
      Read from ``BATCH_INTERVAL_IN_SECONDS``; when that is unset or empty,
      ``BATCHING_TIME_IN_SECONDS`` is used instead, and 60 seconds is the
      final default.
    * ``batch_end_time`` - ``datetime.now()`` plus one batch interval.
    * ``batch`` - empty list that collects the incoming logs.

    Raises:
        ValueError: if the configured interval is not an integer.
    """
    sink = os.getenv('SINK_PATH', './sink')
    setattr(context, 'sink', sink)

    # Verify sink is available; an already existing directory is fine
    # (re-initialisation, restarts, or a pre-mounted volume).
    os.makedirs(context.sink, exist_ok=True)

    # Accept both spellings of the interval variable so a value configured
    # under either name is honoured.
    raw_interval = (
        os.getenv('BATCH_INTERVAL_IN_SECONDS')
        or os.getenv('BATCHING_TIME_IN_SECONDS')
        or '60'
    )
    batch_interval = timedelta(seconds=int(raw_interval))
    setattr(context, 'batch_interval', batch_interval)

    batch_end_time = datetime.now() + batch_interval
    setattr(context, 'batch_end_time', batch_end_time)

    batch = list()
    setattr(context, 'batch', batch)

In [271]:
def handler(context, event):
    """Collect the incoming log and flush the batch once its window has closed.

    Events from a cron trigger carry no log to store; they only give the
    function a chance to flush, which keeps the batch small when no other
    events arrive.
    """
    is_cron_tick = event.trigger.kind == "cron"
    if not is_cron_tick:
        add_log_to_batch(context, event.body)

    # Batch window still open - keep collecting
    if datetime.now() < context.batch_end_time:
        return

    save_batch_to_parquet(context)
    reset_batch(context)

# Test

In [295]:
# nuclio : ignore
init_context(context)

In [296]:
# nuclio : ignore
event = nuclio.Event(body='{"log_ip": "15.2.6.9", "syslog_timestamp": "May 28 00:00:09", "program": "haproxy", "pid": 164541, "client_ip": "126.52.74.15", "client_port": 52465, "accept_date": "28/May/2019:00:20:10.891", "frontend_name": "px-http", "backend_name": "px-http", "server_name": "srv2", "time_request": 1, "time_queue": 2, "time_backend_connect": 1, "time_backend_response": 5, "time_duration": 13, "http_status_code": 200, "bytes_read": 4, "captured_request": "-", "captured_response": "-", "termination_state": "----", "actconn": 2, "feconn": 5, "beconn": 1, "srvconn": 1, "retries": 1, "srv_queue": 1, "backend_queue": 3}')
out = handler(context, event)
out

# Deploy (if a Nuclio cluster is available)

In [305]:
%nuclio show

%nuclio: notebook nuclio-halogs-kafka-to-parquet-cudf exported
Config:
apiVersion: nuclio.io/v1
kind: Function
metadata:
  annotations:
    nuclio.io/generated_by: function generated at 05-06-2019 from /rapids/notebooks/nvidia-igz/nuclio-halogs-kafka-to-parquet-cudf.ipynb
  labels: {}
  name: nuclio-halogs-kafka-to-parquet-cudf
spec:
  build:
    baseImage: rapidsai/rapidsai:cuda10.0-runtime-centos7
    commands: []
    functionSourceCode: IyBHZW5lcmF0ZWQgYnkgbnVjbGlvLmV4cG9ydC5OdWNsaW9FeHBvcnRlciBvbiAyMDE5LTA2LTA1IDA3OjQ1CgppbXBvcnQgbnVjbGlvCgppbXBvcnQgb3MKaW1wb3J0IGdsb2IKZnJvbSBkYXRldGltZSBpbXBvcnQgZGF0ZXRpbWUsIHRpbWVkZWx0YQppbXBvcnQgdGltZQppbXBvcnQgcGFuZGFzIGFzIHBkCmltcG9ydCBpdGVydG9vbHMKaW1wb3J0IGpzb24KCmRlZiBhZGRfbG9nX3RvX2JhdGNoKGNvbnRleHQsIGxvZyk6CiAgICBjb250ZXh0LmJhdGNoLmFwcGVuZChsb2cpCgpkZWYgcmVzZXRfYmF0Y2goY29udGV4dCk6CiAgICAKICAgIGNvbnRleHQuYmF0Y2ggPSBsaXN0KCkKICAgIAogICAgY29udGV4dC5iYXRjaF9lbmRfdGltZSArPSBjb250ZXh0LmJhdGNoX2ludGVydmFsCgpkZWYgX2JhdGNoX3RvX2RmKGNvbnRleHQpOgog

In [306]:
%nuclio build

%nuclio: notebook nuclio-halogs-kafka-to-parquet-cudf exported


In [297]:
%nuclio deploy -p nvidia -n kafka_to_parquet -c

ConnectionError: HTTPConnectionPool(host='localhost', port=8070): Max retries exceeded with url: /api/projects (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7feaf1c63588>: Failed to establish a new connection: [Errno 111] Connection refused',))