In [3]:
# NOTE You must have Kafka and Elasticsearch started for this example to work
!pip install -q confluent-kafka

In [4]:
from datetime import datetime 
from typing import List
import pandas as pd
import re, string, time, json 
from confluent_kafka import Producer
import requests
import hashlib

# Pause between replayed log lines, in milliseconds (divided by 1000 before time.sleep).
DELAY_MS = 1000

def extractTimeStamp(tokens :List[str]):
    '''
    Build a datetime from the second and third tokens of a split log line.

    Example line: "[I 2021-11-20 18:05:06.587 JupyterHub log:189] "
    -> tokens[1] is the date part and tokens[2] is the time part.

    Three layouts are attempted in order:
      1. whatever datetime.fromisoformat accepts
      2. "%Y%m%d %H:%M:%S.%f"
      3. "%Y%m%d %H:%M:%S" after prefixing "20" (two-digit year)

    Returns None when fewer than three tokens exist or no layout parses.
    '''
    try:
        datePart, timePart = tokens[1], tokens[2]
    except IndexError:
        return None
    stamp = datePart + " " + timePart

    # Ordered parse attempts; the first one that does not raise ValueError wins.
    attempts = (
        lambda text: datetime.fromisoformat(text),
        lambda text: datetime.strptime(text, "%Y%m%d %H:%M:%S.%f"),
        lambda text: datetime.strptime("20" + text, "%Y%m%d %H:%M:%S"),
    )
    for attempt in attempts:
        try:
            return attempt(stamp)
        except ValueError:
            pass
    return None

def extractLevel(tokens : List[str]):
    '''
    Map the leading token of a log line to a level name.

    "[I..." -> "INFO", "[E..." -> "ERROR", "[W..." -> "WARN".
    Returns None for an empty token list or any other leading token.
    '''
    if len(tokens) < 1:
        return None

    head = tokens[0]
    # Checked in the same order as listed; first matching prefix decides.
    prefixToLevel = (("[I", "INFO"), ("[E", "ERROR"), ("[W", "WARN"))
    for prefix, name in prefixToLevel:
        if head.startswith(prefix):
            return name
    return None

def extractNetid(line :str):
    '''
    Pull a netid out of a log line, or return None when no pattern applies.

    Each search entry holds:
      pattern - regex located anywhere in the line (re.search)
      split   - separator used to cut the matched text into pieces
      index   - which piece holds the netid

    Entries are tried top to bottom and the first match wins, so the
    message-specific patterns come first and the generic ones (claim names,
    pod names, URL fragments, user@ip) come last. Any character that is not
    a word character or whitespace is stripped from the extracted piece
    (e.g. the trailing ":" in "User abc: server is slow to stop").

    Literal dots in the seconds and IPv4 patterns are escaped so that "."
    only matches a real dot; unescaped, text such as "ref@1234567" would be
    accepted as user@ip.
    '''
    searches = [
        { "pattern": r"User \w+ took \d+\.\d+ seconds to start", "split": " ", "index": 1 },
        { "pattern": r"User \w+ server took \d+\.\d+ seconds to stop", "split": " ", "index": 1 },
        { "pattern": r"User \w+\: server is slow to stop", "split": " ", "index": 1 },
        { "pattern" : r"\w+ still running$", "split": " ","index": 0 },
        { "pattern": r"Removing user \w+ from proxy", "split": " ", "index": 2 },
        { "pattern": r"/hub/api/users/\w+", "split": "/", "index": -1 },
        { "pattern": r"Adding user \w+ to proxy", "split": " ", "index": 2 },
        { "pattern": r"Server \w+ is ready", "split": " ", "index": 1 },
        { "pattern": r"\w+ is pending spawn", "split": " ", "index": 0 },
        { "pattern": r"User logged \w+: \w+", "split": " ", "index": -1 },
        { "pattern": r"Culling server \w+", "split": " ", "index": -1 },
        { "pattern": r"Failed to cleanup \w+", "split": " ", "index": -1 },
        { "pattern": r"\w+ does not appear to be running", "split": " ", "index": 0 },

        # generic fallbacks
        { "pattern": r"claim-\w+", "split": "-", "index": -1 },
        { "pattern": r"jupyter\-\w+", "split": "-", "index": -1 },
        { "pattern": r"jupyterhub-user-\w+", "split": "-", "index": -1 },
        { "pattern": r"/user/\w+", "split": "/", "index": -1 },
        { "pattern": r"%2Fuser%2F\w+", "split": "%2F", "index": -1 },
        { "pattern": r"\w+@\d+\.\d+\.\d+\.\d+", "split": "@", "index": 0 }
    ]
    for search in searches:
        match = re.search(search["pattern"], line)
        if match:
            extract = match.group(0).split(search["split"])[search["index"]]
            # Drop punctuation that rode along with the token.
            return re.sub(r'[^\w\s]', '', extract)

    return None

def extractEvent(line:str,netid:str):
    '''
    Classify a log line as one of the tracked events.

    Returns the name of the first event whose marker text occurs in line,
    checked in the order ACTIVITY, LOGIN, START, STOP, CULL, or None when no
    marker is present. The ACTIVITY marker embeds netid in the URL path.
    '''
    markers = (
        ("ACTIVITY", f"/hub/api/users/{netid}/activity"),
        ("LOGIN", "User logged in:"),
        ("START", "seconds to start"),
        ("STOP", "seconds to stop"),
        ("CULL", "Culling server"),
    )
    return next((name for name, needle in markers if needle in line), None)

def toKafka(producer:Producer, topic:str, data):
    '''
    Publish data to a Kafka topic as UTF-8 encoded JSON.

    The JSON text is also printed, and the producer is flushed after every
    message.
    '''
    payload = json.dumps(data)
    print(payload)
    producer.produce(topic, payload.encode('utf-8'))
    producer.flush()
    
def toElastic(index:str, data, timeout:float = 10):
    '''
    POST data as a JSON document into an Elasticsearch index.

    index   - name of the target index
    data    - JSON-serializable record
    timeout - seconds to wait for the server before requests gives up;
              without it an unreachable host would block the caller forever

    The document id is the SHA-1 hex digest of the serialized JSON, so
    sending an identical record again addresses the same document id.

    Raises requests.HTTPError on a 4xx/5xx response (raise_for_status) and
    the usual requests exceptions on connection failure or timeout.
    '''
    elastic_host = "elasticsearch"
    elastic_port = "9200"
    url = f"http://{elastic_host}:{elastic_port}/{index}"
    headers = { "Content-Type" : "application/json" }
    data_str = json.dumps(data)
    # Content-derived id: same serialized record -> same id.
    data_id = hashlib.sha1(data_str.encode()).hexdigest()
    endpoint = f"{url}/_doc/{data_id}"
    response = requests.post(endpoint, headers = headers, data = data_str, timeout = timeout)
    response.raise_for_status()
    
def resetElasticIndex(index, timeout:float = 10):
    '''
    Delete an Elasticsearch index by issuing an HTTP DELETE on it.

    index   - name of the index to delete
    timeout - seconds to wait for the server before requests gives up;
              without it an unreachable host would block the caller forever

    Returns response.ok: True for a status code below 400, False otherwise.
    Connection failures and timeouts propagate as requests exceptions.
    '''
    elastic_host = "elasticsearch"
    elastic_port = "9200"
    url = f"http://{elastic_host}:{elastic_port}/{index}"
    headers = { "Content-Type" : "application/json" }
    response = requests.delete(url, headers = headers, timeout = timeout)
    return response.ok
    

In [5]:
# Read the whole hub log up front; the loop below replays it one line at a time.
with open("/home/jovyan/datasets/jupyterhub/hub.log","r") as f:
    lines = f.readlines()

producer = Producer({'bootstrap.servers' : 'broker:29092'})
#resetElasticIndex("jupyterhub")

# netid -> timestamp of that user's most recent START event
session = {}
for line in lines:
    # Throttle the replay so records are emitted one every DELAY_MS milliseconds.
    time.sleep(DELAY_MS/1000)
    elapsedTime = None
    tokens = line.strip().split()
    level = extractLevel(tokens)
    timestamp = extractTimeStamp(tokens)
    netid = extractNetid(line)
    event = extractEvent(line,netid)
    if event == "START":
        session[netid] = timestamp
    # A STOP line whose timestamp could not be parsed cannot be subtracted
    # from the recorded start, so it is skipped instead of raising TypeError.
    elif event == "STOP" and timestamp is not None and session.get(netid) is not None:
        elapsedTime = timestamp - session.pop(netid)

    # require at LEAST a level and timestamp before writing a log
    if level is not None and timestamp is not None:
        log = { "level": level,
               "timestamp_str" : timestamp.isoformat(),
               "timestamp_num" : int(timestamp.timestamp()*1000),
               "netid" : netid,
               "event" : event,
               # total_seconds() covers the full duration; timedelta.seconds
               # alone drops whole days, under-reporting sessions over 24 hours.
               "sessionminutes" : elapsedTime.total_seconds()/60 if elapsedTime is not None else None,
               "data" : line.strip()
              }
        print(log)
        # Ship the record to Kafka (comment this line out to disable).
        toKafka(producer, "jupyterhub",log)
        # UNCOMMENT THE NEXT LINE TO ALSO SHIP LOGS TO ELASTIC
        # toElastic("jupyterhub",log)

{'level': 'INFO', 'timestamp_str': '2021-11-20T17:37:32.613000', 'timestamp_num': 1637429852613, 'netid': None, 'event': None, 'sessionminutes': None, 'data': '[I 2021-11-20 17:37:32.613 JupyterHub app:2459] Running JupyterHub version 1.4.2'}
{"level": "INFO", "timestamp_str": "2021-11-20T17:37:32.613000", "timestamp_num": 1637429852613, "netid": null, "event": null, "sessionminutes": null, "data": "[I 2021-11-20 17:37:32.613 JupyterHub app:2459] Running JupyterHub version 1.4.2"}
{'level': 'INFO', 'timestamp_str': '2021-11-20T17:37:32.613000', 'timestamp_num': 1637429852613, 'netid': None, 'event': None, 'sessionminutes': None, 'data': '[I 2021-11-20 17:37:32.613 JupyterHub app:2489] Using Authenticator: oauthenticator.generic.GenericOAuthenticator-14.1.0'}
{"level": "INFO", "timestamp_str": "2021-11-20T17:37:32.613000", "timestamp_num": 1637429852613, "netid": null, "event": null, "sessionminutes": null, "data": "[I 2021-11-20 17:37:32.613 JupyterHub app:2489] Using Authenticator: oa

KeyboardInterrupt: 

%6|1650898258.884|FAIL|rdkafka#producer-1| [thrd:broker:29092/bootstrap]: broker:29092/1: Disconnected (after 2122451ms in state UP)
