# Nuclio - HA Logs to Parquet

**Objective:** Receive HAProxy logs from a Kafka topic or an HTTP endpoint and rewrite them as per-minute Parquet files using a nuclio function.

In [1]:
# nuclio: ignore
import nuclio

## Environment

### Base config

In [2]:
%%nuclio config

# Kafka trigger (currently disabled; uncomment and fill in to consume from Kafka)
# NOTE(review): the url key below is written as spec.trigger.url, unlike the
# spec.triggers.hakafka.* keys around it, and the sasl lines use ':' where the
# active settings use '=' - verify against the nuclio Kafka trigger reference
# before enabling.
# spec.triggers.hakafka.kind = "kafka"
# spec.trigger.url = "1.1.1.1"
# spec.triggers.hakafka.attributes.topic = "haproxy"
# spec.triggers.hakafka.attributes.partitions = [0, 1, 2]
# spec.triggers.hakafka.attributes.sasl.enable: true
# spec.triggers.hakafka.attributes.sasl.user: ""
# spec.triggers.hakafka.attributes.sasl.password: ""

# HTTP trigger: a single worker listening on port 31001
spec.triggers.hahttp.kind="http"
spec.triggers.hahttp.maxWorkers=1
spec.triggers.hahttp.attributes.port=31001

# Ingestion verifier: cron trigger that fires every minute
# (presumably so a pending batch is still flushed when no logs arrive - TODO confirm)
spec.triggers.verifybatch.kind = "cron"
spec.triggers.verifybatch.attributes.interval = "1m"

# Base image (RAPIDS runtime; the function code imports cudf)
spec.build.baseImage = "rapidsai/rapidsai:cuda10.0-runtime-centos7"

%nuclio: setting spec.triggers.hahttp.kind to 'http'
%nuclio: setting spec.triggers.hahttp.maxWorkers to 1
%nuclio: setting spec.triggers.hahttp.attributes.port to 31001
%nuclio: setting spec.triggers.verifybatch.kind to 'cron'
%nuclio: setting spec.triggers.verifybatch.attributes.interval to '1m'
%nuclio: setting spec.build.baseImage to 'rapidsai/rapidsai:cuda10.0-runtime-centos7'


### Installations

In [3]:
%%nuclio cmd
# No extra build commands are needed at the moment

### Env variables

In [4]:
%nuclio env SINK_PATH=./source
%nuclio env BATCHING_TIME_IN_SECONDS=60

%nuclio: setting 'SINK_PATH' environment variable
%nuclio: setting 'BATCHING_TIME_IN_SECONDS' environment variable


## Function

In [5]:
import os
import glob
from datetime import datetime, timedelta
import time
import cudf
import itertools
import json

### Helper functions

In [6]:
def reset_batch(context):
    """Start a new, empty batch and schedule its flush deadline.

    The deadline is advanced by ``context.batch_interval``. If that would
    still leave it in the past (the function was idle for longer than one
    interval), whole intervals are skipped until the deadline is in the
    future, so the following events are not each flushed as a separate file
    while the deadline catches up. The deadline stays aligned to the original
    interval grid.

    Args:
        context: object carrying ``batch`` (list), ``batch_interval``
            (timedelta) and ``batch_end_time`` (naive datetime), as set up
            by ``init_context``.
    """
    # Reset log list
    context.batch = list()

    # Advance the deadline by one interval
    interval = context.batch_interval
    next_end = context.batch_end_time + interval

    # Skip any windows that were missed entirely; the interval guard avoids
    # dividing by a zero-length interval.
    now = datetime.now()
    if next_end <= now and interval > timedelta(0):
        missed = (now - next_end) // interval + 1
        next_end += missed * interval

    context.batch_end_time = next_end

In [7]:
def save_batch_to_parquet(context):
    """Write the current batch of JSON log lines to a new parquet file.

    The file is created inside ``context.sink`` and named after the current
    Unix timestamp. Blank entries are dropped first, and nothing is written
    when no records remain, so an empty window does not reach the JSON parser.

    Args:
        context: object carrying ``batch`` (list of JSON strings, one record
            each) and ``sink`` (output directory path).
    """
    # Ignore empty entries (e.g. invocations that carried no body)
    records = [record for record in context.batch if record]
    if not records:
        return

    # Handle file properties
    filename = f'{time.time()}.parquet'
    filepath = os.path.join(context.sink, filename)

    # Create DF from logs batch (JSON Lines: one record per line)
    df = cudf.read_json('\n'.join(records), lines=True)
    df = df.reset_index(drop=True)

    # Save to parquet
    df.to_parquet(filepath)

### Main

In [8]:
def init_context(context):
    """Attach the batching state to the nuclio context.

    Sets the following attributes on ``context``:

    * ``sink`` - output directory, from ``SINK_PATH`` (default ``./sink``);
      created if it does not exist.
    * ``batch_interval`` - batch window as a ``timedelta``, read from
      ``BATCH_INTERVAL_IN_SECONDS``, falling back to
      ``BATCHING_TIME_IN_SECONDS`` (the name set in this notebook's env
      cell), then to 60 seconds.
    * ``batch_end_time`` - when the first batch should be flushed.
    * ``batch`` - empty list that accumulates log records.

    Raises:
        ValueError: if the configured interval is not an integer.
    """
    sink = os.getenv('SINK_PATH', './sink')
    setattr(context, 'sink', sink)

    # Verify sink is available
    os.makedirs(context.sink, exist_ok=True)

    # Accept both variable names so the interval configured for the function
    # is honoured instead of silently falling back to the default.
    interval_seconds = (
        os.getenv('BATCH_INTERVAL_IN_SECONDS')
        or os.getenv('BATCHING_TIME_IN_SECONDS')
        or '60'
    )
    batch_interval = timedelta(seconds=int(interval_seconds))
    setattr(context, 'batch_interval', batch_interval)

    batch_end_time = datetime.now() + batch_interval
    setattr(context, 'batch_end_time', batch_end_time)

    batch = list()
    setattr(context, 'batch', batch)

In [9]:
def handler(context, event):
    """Buffer one log event and flush the batch once its window has elapsed.

    The event body is normalised to a JSON string before it is stored, because
    the batch is later joined with newlines and parsed as JSON Lines: bytes
    are decoded as UTF-8 and any other non-string body is serialised with
    ``json.dumps``. Events with an empty body are not added to the batch but
    still trigger the deadline check, so a bodyless invocation can flush a
    pending batch.

    Args:
        context: nuclio context prepared by ``init_context``.
        event: nuclio event; only ``event.body`` is read.
    """
    body = event.body
    if isinstance(body, (bytes, bytearray)):
        body = body.decode('utf-8')
    elif body is not None and not isinstance(body, str):
        body = json.dumps(body)

    # Add event to batch (skip empty / whitespace-only bodies)
    if body and body.strip():
        context.batch.append(body)

    if datetime.now() >= context.batch_end_time:
        # Nothing to write if the window saw no records
        if context.batch:
            save_batch_to_parquet(context)
        reset_batch(context)

# Test

In [10]:
# nuclio: ignore
# Local smoke test: build the batching state on the notebook's context object.
# NOTE(review): `context` is not defined in any cell shown here; it is presumably
# provided by the nuclio Jupyter integration after `import nuclio` - TODO confirm.
init_context(context)

In [13]:
# nuclio: ignore
# Local smoke test: push one sample JSON log record through the handler.
event = nuclio.Event(body='{"company":"Rios__Pope_and_Baird","cpu_utilization":70.6942165035,"cpu_utilization_is_error":false,"latency":3.1373003261,"latency_is_error":false,"packet_loss":0.0,"packet_loss_is_error":false,"throughput":249.7207880994,"throughput_is_error":false,"timestamp":1563795193534}')
out = handler(context, event)
out

# Deploy (requires a nuclio cluster)

In [None]:
# Build and deploy this notebook as a nuclio function.
# NOTE(review): -n and -p appear to set the function name and the project, and
# -c presumably creates the project if it is missing - confirm against the
# nuclio-jupyter deploy documentation.
%nuclio deploy -p nvidia -n kafka_to_parquet -c