# Nuclio - HA Logs (parquet) Aggregation computation

**Objective:** Compute aggregated log metrics from timestamped parquet files, one batch interval at a time.

In [4]:
# nuclio : ignore
import nuclio

## Environment

### Base config

In [233]:
%%nuclio config

## TODO: Change to kafka / HTTP stream trigger

# Trigger
spec.triggers.json2parquet.kind = "cron"
spec.triggers.json2parquet.attributes.interval = "1m"

# Base image
spec.build.baseImage = "rapidsai/rapidsai:cuda10.0-runtime-centos7"

%nuclio: setting spec.triggers.json2parquet.kind to 'cron'
%nuclio: setting spec.triggers.json2parquet.attributes.interval to '1m'
%nuclio: setting spec.build.baseImage to 'rapidsai/rapidsai:cuda10.0-runtime-centos7'


### Installations

In [6]:
%%nuclio cmd
pip install pandas
pip install pyarrow
pip install cudf



### Env variables

In [118]:
%nuclio env SOURCE_PATH=./source
%nuclio env SINK_PATH=./sink
%nuclio env BATCH_TIME_IN_SECS=60

%nuclio: setting 'SOURCE_PATH' environment variable
%nuclio: setting 'SINK_PATH' environment variable
%nuclio: setting 'BATCH_TIME_IN_SECS' environment variable


## Function

In [63]:
import os
import glob
from datetime import datetime, timedelta
import time
import pandas as pd
import cudf
import itertools

### Helper functions

In [217]:
def _parquet_timestamp(path):
    """Return the datetime encoded in a parquet file name, or None if there is none.

    The part of the base name before the first '.' is parsed as integer
    epoch seconds.
    """
    stem = os.path.basename(path).split('.')[0]
    try:
        return datetime.fromtimestamp(int(stem))
    except (ValueError, OverflowError, OSError):
        # Not a (valid) epoch timestamp: let the caller decide what to do
        return None


def read_files(context):
    """Load the parquet files of the current batch window into one cuDF dataframe.

    Parameters:
        context: object exposing ``source`` (folder holding the parquet files),
            ``batch_time`` (datetime) and ``batch_interval`` (timedelta).

    Returns:
        The single dataframe when exactly one file matches, the concatenation
        of all dataframes when several match, or an empty ``cudf.DataFrame``
        when none match.
    """
    parquet_path = os.path.join(context.source, '*.parquet')
    window_start = context.batch_time - context.batch_interval

    # Keep the files stamped at or after the start of the window. Files whose
    # name carries no timestamp are skipped rather than aborting the batch.
    paths = []
    for path in glob.glob(parquet_path):
        stamp = _parquet_timestamp(path)
        if stamp is not None and stamp >= window_start:
            paths.append(path)

    # Create cuDF dataframes from files
    frames = [cudf.read_parquet(path) for path in paths]

    # Return joined DF
    if not frames:
        print('empty')
        return cudf.DataFrame()
    if len(frames) == 1:
        print('one file')
        return frames[0]
    return cudf.concat(frames)

In [204]:
def update_batch_time(context):
    """Move ``context.batch_time`` forward by one ``context.batch_interval``."""
    next_batch_time = context.batch_time + context.batch_interval
    context.batch_time = next_batch_time

In [227]:
def df_to_parquet(context, df):
    """Write ``df`` as a parquet file into the sink folder.

    The file is named after the current epoch time (``time.time()``) with a
    ``.parquet`` extension and placed under ``context.sink``.
    """
    target = os.path.join(context.sink, f'{time.time()}.parquet')
    df.to_parquet(target)

### Main

In [1]:
def init_context(context):
    """Populate the context with the paths and batch window used by the handler.

    Attributes set on ``context``:
        source: folder to read from (env ``SOURCE_PATH``, default ``'source'``).
        sink: folder to write to (env ``SINK_PATH``, default ``'sink'``).
        batch_time: ``datetime.now()`` at initialisation.
        batch_interval: ``timedelta`` built from env ``BATCH_INTERVAL_IN_SECONDS``,
            falling back to ``BATCH_TIME_IN_SECS`` and then to 60 seconds.

    Raises:
        ValueError: if the configured interval is not an integer.
    """
    context.source = os.getenv('SOURCE_PATH', 'source')
    context.sink = os.getenv('SINK_PATH', 'sink')

    # Make sure source and sink exist. exist_ok keeps a second initialisation
    # (or pre-existing folders) from raising FileExistsError.
    os.makedirs(context.source, exist_ok=True)
    os.makedirs(context.sink, exist_ok=True)

    context.batch_time = datetime.now()

    # Both variable names are accepted so that either spelling of the
    # setting takes effect; an empty value falls through to the next one.
    interval_secs = (
        os.getenv('BATCH_INTERVAL_IN_SECONDS')
        or os.getenv('BATCH_TIME_IN_SECS')
        or '60'
    )
    context.batch_interval = timedelta(seconds=int(interval_secs))

In [235]:
def handler(context, event):
    """Aggregate the current batch of logs per ``log_ip`` and write it to the sink.

    Reads the batch with ``read_files``, aggregates it when it is non-empty,
    writes the resulting dataframe with ``df_to_parquet`` (also when it is
    empty) and finally advances the batch window.

    Parameters:
        context: context prepared by ``init_context``.
        event: triggering event; not used by this function.

    Returns:
        None.
    """
    df = read_files(context)

    # Make sure we aggregate on a non-empty df
    if not df.empty:
        # One aggregation per column. 'time_backend_response' was listed twice
        # ('max', then 'mean'); in a dict literal the last entry wins, so only
        # the mean was ever computed. The dead 'max' entry is dropped here.
        # Reporting both would need a list of aggregations for that column,
        # which changes the output columns.
        aggregations = {
            'feconn': 'mean',
            'beconn': 'mean',
            'time_backend_response': 'mean',
            'time_queue': 'mean',
            'time_duration': 'mean',
            'time_request': 'mean',
            'time_backend_connect': 'mean',
        }
        df = df.groupby(['log_ip']).agg(aggregations)
    df_to_parquet(context, df)
    update_batch_time(context)

# Test

In [236]:
# nuclio : ignore

# Local smoke test: initialise the context, then invoke the handler once with
# an empty event body.
# NOTE(review): `context` is not defined anywhere in this notebook; presumably
# it is injected by the nuclio notebook extension. Confirm.
init_context(context)
event = nuclio.Event(body='')
# handler() has no return statement, so `out` is None and displays nothing
out = handler(context, event)
out

empty




# Deploy (requires a nuclio cluster)

In [None]:
%nuclio deploy -p nvidia -n json2parquet -c