In [None]:
with open("/home/jovyan/datasets/jupyterhub/hub.log","r") as f:
    lines = f.readlines()

linecount = len(lines)
errorcount = 0
for line in lines:
    if line.startswith("[E"):
        print(line.strip())
        errorcount +=1
        
print(linecount,errorcount,errorcount/linecount)

In [None]:
from datetime import datetime 
from typing import List
def extractTimeStamp(tokens :List[str]):
    '''
    Assuming line has valid timestamp in the first place...
    line = "[I 2021-11-20 18:05:06.587 JupyterHub log:189] "
    '''
    try:
        extractedDateTimeString  = tokens[1] + " " + tokens[2]
        extractedDateTime = datetime.fromisoformat(extractedDateTimeString)
        return extractedDateTime
    except (IndexError, ValueError) as e:
        return None
    
assert extractTimeStamp("[I 2021-11-20 18:05:06.587 JupyterHub log:189] ".split())==datetime(2021, 11, 20, 18, 5, 6, 587000)
assert extractTimeStamp("[I 20211120 18:05:06.587 JupyterHub log:189] ".split())==None


In [None]:
def extractLevel(tokens : List[str]):
    if len(tokens) == 0: return None
    if tokens[0].startswith("[I"):
        level = "INFO"
    elif tokens[0].startswith("[E"):
        level = "ERROR"
    elif tokens[0].startswith("[W"):
        level = "WARN"
    else:
        level = None
    return level 

assert extractLevel("[I 2021-11-20 18:05:06.587 JupyterHub log:189] ".split()) == "INFO"
assert extractLevel("[E 2021-11-20 18:05:06.587 JupyterHub log:189] ".split()) == "ERROR"
assert extractLevel("[W 2021-11-20 18:05:06.587 JupyterHub log:189] ".split()) == "WARN"
assert extractLevel("2021-11-20 18:05:06.587 JupyterHub log:189] ".split()) == None

In [None]:
import re
import string 

def extractNetid(line :str):
    '''
    Several cases to yank out a netid from a line
    
    generic searches happen last
    '''
    searches = [
        { "pattern": r"User \w+ took \d+.\d+ seconds to start", "split": " ", "index": 1 },
        { "pattern": r"User \w+ server took \d+.\d+ seconds to stop", "split": " ", "index": 1 },   
        { "pattern": r"User \w+\: server is slow to stop", "split": " ", "index": 1 },           
        { "pattern" : r"\w+ still running$", "split": " ","index": 0 },
        { "pattern": r"Removing user \w+ from proxy", "split": " ", "index": 2 },
        { "pattern": r"/hub/api/users/\w+", "split": "/", "index": -1 },
        { "pattern": r"Adding user \w+ to proxy", "split": " ", "index": 2 },
        { "pattern": r"Server \w+ is ready", "split": " ", "index": 1 },
        { "pattern": r"\w+ is pending spawn", "split": " ", "index": 0 },
        { "pattern": r"User logged \w+: \w+", "split": " ", "index": -1 },
        { "pattern": r"Culling server \w+", "split": " ", "index": -1 },
        { "pattern": r"Failed to cleanup \w+", "split": " ", "index": -1 },
        { "pattern": r"\w+ does not appear to be running", "split": " ", "index": 0 },

        { "pattern": r"claim-\w+", "split": "-", "index": -1 },
        { "pattern": r"jupyter\-\w+", "split": "-", "index": -1 },
        { "pattern": r"jupyterhub-user-\w+", "split": "-", "index": -1 },
        { "pattern": r"/user/\w+", "split": "/", "index": -1 },
        { "pattern": r"%2Fuser%2F\w+", "split": "%2F", "index": -1 },
        { "pattern": r"\w+@\d+.\d+.\d+.\d+", "split": "@", "index": 0 }        
    ]
    netid = None
    for search in searches:
        match = re.search(search["pattern"],line)
        if match:
            extract = line[match.start():match.end()].split(search["split"])[search["index"]]
            netid = re.sub(r'[^\w\s]', '', extract)
            break
    
    return netid
    
                                                                
assert extractNetid("[I 2021-11-20 17:37:33.081 JupyterHub app:2186] yhuan161 still running")=='yhuan161'
assert extractNetid("[I 2021-11-20 17:37:45.659 JupyterHub proxy:309] Removing user amccaffe from proxy (/user/amccaffe/)")=="amccaffe"
assert extractNetid("[I 2021-11-20 17:38:13.279 JupyterHub spawner:2620] Deleting pod jupyterhub/jupyter-cma11")=="cma11"
assert extractNetid("[I 2021-11-20 17:38:24.712 JupyterHub log:189] 200 POST /hub/api/users/mafudge/activity (mafudge@10.42.4.154) 37.28ms") == "mafudge"
assert extractNetid("[I 2021-11-20 17:38:41.143 JupyterHub log:189] 200 GET /hub/error/503?url=%2Fuser%2Fcma110%2Fapi%2Fkernels%2Fd29763ae-64e9-4e58-ad2f-3b547108ec73%2Fchannels%3Fsession_id%3De4af78f1784e49db807c3bbb91dc00b5 (@10.42.11.53) 6.13ms") == "cma110"
assert extractNetid("[I 2021-11-20 17:39:22.077 JupyterHub log:189] 302 GET /hub/ -> /hub/spawn (enbilyns@45.47.85.5) 24.18ms")=='enbilyns'
assert extractNetid("[I 2021-11-20 17:39:26.137 JupyterHub spawner:2361] PVC claim-enbilyns already exists, so did not create new pvc")=='enbilyns'
assert extractNetid("[I 2021-11-20 17:39:26.137 JupyterHub spawner:2361] PVC claim-enbilyns already exists, so did not create new pvc")=='enbilyns'
assert extractNetid("[W 2021-11-20 22:17:53.102 JupyterHub base:1148] User dlnosky: server is slow to stop (timeout=10)")=='dlnosky'
assert extractNetid("[I 2021-11-20 22:17:53.102 JupyterHub log:189] 202 DELETE /hub/api/users/dlnosky/server (cull-idle@127.0.0.1) 10015.55ms")=='dlnosky'
assert extractNetid("[I 2021-11-20 22:17:58.573 JupyterHub base:1115] User dlnosky server took 15.472 seconds to stop")=='dlnosky'



In [None]:
# TODO Extract Netid and also the type of Action label START, STOP, CULL, LOGIN, PROXY, ADMIN 


In [None]:
txt = "[W 2021-11-20 22:17:53.102 JupyterHub base:1148] User dlnosky: server is slow to stop (timeout=10)"
x = re.search("User \w+\: server is slow to stop", txt)
print(x)

In [None]:
user = "dlnosky"
for line in lines:
    if line.find(user) >= 0:
        tokens = line.strip().split()
        level = extractLevel(tokens)
        timestamp = extractTimeStamp(tokens)
        netid = extractNetid(line)
        if level is not None and timestamp is not None:
            print(f"[{level}] {timestamp} {netid} DATA={line.strip()}")

In [None]:
# Sessions
user = "dlnosky"
events_to_track =  { "LOGIN": 'User logged in:' , "START" :  "seconds to start", "STOP":"seconds to stop", "CULL":"Culling server" } #, "ACTIVITY":f"/hub/api/users/{user}/activity"}
sessionStartTimestamp = None 
for line in lines:
    if line.startswith("[I") and line.find(user) >= 0:
        event = None
        elapsedTime = None
        if line.find(events_to_track["LOGIN"])>=0:
            event="LOGIN"
            loginTimestamp = extractTimeStamp(line)
        elif line.find(events_to_track["START"])>=0:
            event="START"
            sessionStartTimestamp = extractTimeStamp(line)
        elif line.find(events_to_track["STOP"])>=0:
            event="STOP"
            sessionStopTimestamp = extractTimeStamp(line)
            elapsedTime = sessionStopTimestamp - sessionStartTimestamp if sessionStartTimestamp is not None else None
        elif line.find(events_to_track["CULL"])>=0:
            event="CULL"
            
        if event is not None:
            print(f"[** {event:8} **] {line.strip()}")
        if elapsedTime is not None:
            print(f"SESSION LENGTH: {elapsedTime}")
            
            
            
        

In [None]:
TODO: Exercise with line by line feeding to topic.
INFO|WARN|ERROR datetimestamp username data

- KSQL make a stream
- KSQL Code to find the START sessions
- KSQL Code to find the CULL sessions
- KSQL to count the errors each hour 

In [None]:
user = "dlnosky"
skiplines = ['[W ','[E ','/hub/oauth_login',f'/hub/api/users/{user}/activity','hub/api/authorizations/token/']

events_to_track = [ { "LOGIN", 'User logged in:' } , {"START","seconds to start"}, {"STOP","seconds to stop"}, {"CULL","Culling server"},{"ACTIVITY",f"/hub/api/users/{user}/activity"}]

for line in lines:
    skip = False
    if line.find(user) >= 0:
        if line.find("User logged in:") >=0:
            char = "[LOGIN] =======> "
        elif line.find("seconds to start") >=0:
            char = "[START] =======> "
        elif line.find("seconds to stop") >=0:
            char = "[STOP ] =======> "
        elif line.find("Culling server") >=0:
            char = "[CULL ] =======> "
        else:
            char = ""
            
        for skipline in skiplines:
            if line.find(skipline) >=0:
                skip = True

        if not skip:
            print(char+ line.strip())
        

In [1]:
import pyspark
from pyspark.sql import SparkSession
import json
from confluent_kafka import Consumer

In [2]:
spark = SparkSession.builder \
    .master("local") \
    .appName('jupyter-pyspark') \
    .getOrCreate()
sc = spark.sparkContext
sc.setLogLevel("ERROR")

22/04/20 00:02:38 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
consumer = Consumer({'bootstrap.servers' : 'broker:29092', 'group.id' : '*'})
consumer.subscribe(["weblogs"])
count = 0
maxcount = 10
df = None
try:
    while True:
        msg = consumer.poll(1.0)

        if msg is None:
            continue
        if msg.error():
            print(f"Consumer error: {msg.error()}")
            continue

        raw = msg.value().decode('utf-8')
        payload = json.loads(raw)
        row = spark.createDataFrame([payload])
        if df is None:
            df = row
        else:
            df = df.union(row)
            count = count + 1
        print(f"Received message: {payload}")
        print(count)
        if count == maxcount:
            print("***** Write to file...")
            count = 0 
            df.write.mode("append").json("file:///home/jovyan/datasets/test.json")
            
            
except KeyboardInterrupt:
    consumer.close() 

Received message: {'Uri': '/about', 'User': 'ida', 'TimeStamp': 1650412155000, 'Browser': 'chrome', 'OS': 'win'}
0
Received message: {'Uri': '/blog', 'User': 'gigi', 'TimeStamp': 1650412157000, 'Browser': 'chrome', 'OS': 'win'}
1
Received message: {'Uri': '/about', 'User': 'patty', 'TimeStamp': 1650412158000, 'Browser': 'firefox', 'OS': 'win'}
2
Received message: {'Uri': '/blog', 'User': 'tosh', 'TimeStamp': 1650412159000, 'Browser': 'firefox', 'OS': 'win'}
3
Received message: {'Uri': '/blog', 'User': 'walt', 'TimeStamp': 1650412160000, 'Browser': 'edge', 'OS': 'win'}
4
Received message: {'Uri': '/', 'User': 'lisa', 'TimeStamp': 1650412161000, 'Browser': 'chrome', 'OS': 'osx'}
5
Received message: {'Uri': '/blog', 'User': 'mike', 'TimeStamp': 1650412163000, 'Browser': 'firefox', 'OS': 'osx'}
6
Received message: {'Uri': '/', 'User': 'vaibhav', 'TimeStamp': 1650412164000, 'Browser': 'safari', 'OS': 'osx'}
7
Received message: {'Uri': '/', 'User': 'quinn', 'TimeStamp': 1650412165000, 'Brows

                                                                                

Received message: {'Uri': '/', 'User': 'yolanda', 'TimeStamp': 1650412169000, 'Browser': 'chrome', 'OS': 'osx'}
1
Received message: {'Uri': '/', 'User': 'surah', 'TimeStamp': 1650412171000, 'Browser': 'firefox', 'OS': 'win'}
2
Received message: {'Uri': '/products', 'User': 'ida', 'TimeStamp': 1650412172000, 'Browser': 'chrome', 'OS': 'win'}
3
Received message: {'Uri': '/', 'User': 'quinn', 'TimeStamp': 1650412173000, 'Browser': 'chrome', 'OS': 'win'}
4
Received message: {'Uri': '/', 'User': 'ida', 'TimeStamp': 1650412174000, 'Browser': 'chrome', 'OS': 'win'}
5
Received message: {'Uri': '/', 'User': 'chris', 'TimeStamp': 1650412175000, 'Browser': 'chrome', 'OS': 'win'}
6
Received message: {'Uri': '/', 'User': 'fred', 'TimeStamp': 1650412176000, 'Browser': 'safari', 'OS': 'osx'}
7
Received message: {'Uri': '/', 'User': 'mike', 'TimeStamp': 1650412178000, 'Browser': 'firefox', 'OS': 'osx'}
8
Received message: {'Uri': '/', 'User': 'quinn', 'TimeStamp': 1650412179000, 'Browser': 'chrome', '

                                                                                

Received message: {'Uri': '/services', 'User': 'mike', 'TimeStamp': 1650412183000, 'Browser': 'firefox', 'OS': 'osx'}
1
Received message: {'Uri': '/', 'User': 'gigi', 'TimeStamp': 1650412185000, 'Browser': 'chrome', 'OS': 'win'}
2
Received message: {'Uri': '/blog', 'User': 'quinn', 'TimeStamp': 1650412187000, 'Browser': 'chrome', 'OS': 'win'}
3
Received message: {'Uri': '/', 'User': 'ida', 'TimeStamp': 1650412188000, 'Browser': 'chrome', 'OS': 'win'}
4
Received message: {'Uri': '/contact', 'User': 'abby', 'TimeStamp': 1650412189000, 'Browser': 'chrome', 'OS': 'osx'}
5
Received message: {'Uri': '/', 'User': 'rose', 'TimeStamp': 1650412190000, 'Browser': 'chrome', 'OS': 'osx'}
6
Received message: {'Uri': '/', 'User': 'rose', 'TimeStamp': 1650412192000, 'Browser': 'chrome', 'OS': 'osx'}
7
Received message: {'Uri': '/', 'User': 'elle', 'TimeStamp': 1650412194000, 'Browser': 'chrome', 'OS': 'win'}
8
Received message: {'Uri': '/services', 'User': 'hank', 'TimeStamp': 1650412196000, 'Browser'

                                                                                

Received message: {'Uri': '/blog', 'User': 'vaibhav', 'TimeStamp': 1650412199000, 'Browser': 'safari', 'OS': 'osx'}
1
Received message: {'Uri': '/', 'User': 'bob', 'TimeStamp': 1650412201000, 'Browser': 'firefox', 'OS': 'win'}
2
Received message: {'Uri': '/', 'User': 'otto', 'TimeStamp': 1650412202000, 'Browser': 'safari', 'OS': 'osx'}
3
Received message: {'Uri': '/blog', 'User': 'xavier', 'TimeStamp': 1650412204000, 'Browser': 'chrome', 'OS': 'win'}
4
Received message: {'Uri': '/contact', 'User': 'quinn', 'TimeStamp': 1650412205000, 'Browser': 'chrome', 'OS': 'win'}
5
Received message: {'Uri': '/', 'User': 'lisa', 'TimeStamp': 1650412206000, 'Browser': 'chrome', 'OS': 'osx'}
6
Received message: {'Uri': '/contact', 'User': 'fred', 'TimeStamp': 1650412208000, 'Browser': 'safari', 'OS': 'osx'}
7
Received message: {'Uri': '/contact', 'User': 'gigi', 'TimeStamp': 1650412210000, 'Browser': 'chrome', 'OS': 'win'}
8
Received message: {'Uri': '/about', 'User': 'ida', 'TimeStamp': 1650412212000

                                                                                

Received message: {'Uri': '/about', 'User': 'karley', 'TimeStamp': 1650412215000, 'Browser': 'chrome', 'OS': 'win'}
1
Received message: {'Uri': '/blog', 'User': 'ida', 'TimeStamp': 1650412217000, 'Browser': 'chrome', 'OS': 'win'}
2
Received message: {'Uri': '/products', 'User': 'walt', 'TimeStamp': 1650412219000, 'Browser': 'edge', 'OS': 'win'}
3
Received message: {'Uri': '/', 'User': 'elle', 'TimeStamp': 1650412220000, 'Browser': 'chrome', 'OS': 'win'}
4
Received message: {'Uri': '/', 'User': 'xavier', 'TimeStamp': 1650412222000, 'Browser': 'chrome', 'OS': 'win'}
5
Received message: {'Uri': '/', 'User': 'lisa', 'TimeStamp': 1650412223000, 'Browser': 'chrome', 'OS': 'osx'}
6
Received message: {'Uri': '/products', 'User': 'zeke', 'TimeStamp': 1650412224000, 'Browser': 'firefox', 'OS': 'win'}
7
Received message: {'Uri': '/products', 'User': 'xavier', 'TimeStamp': 1650412226000, 'Browser': 'chrome', 'OS': 'win'}
8
Received message: {'Uri': '/products', 'User': 'nancy', 'TimeStamp': 165041

                                                                                

Received message: {'Uri': '/', 'User': 'karley', 'TimeStamp': 1650412230000, 'Browser': 'chrome', 'OS': 'win'}
1
Received message: {'Uri': '/products', 'User': 'karley', 'TimeStamp': 1650412232000, 'Browser': 'chrome', 'OS': 'win'}
2
Received message: {'Uri': '/', 'User': 'otto', 'TimeStamp': 1650412234000, 'Browser': 'safari', 'OS': 'osx'}
3
Received message: {'Uri': '/', 'User': 'elle', 'TimeStamp': 1650412235000, 'Browser': 'chrome', 'OS': 'win'}
4
Received message: {'Uri': '/', 'User': 'gigi', 'TimeStamp': 1650412237000, 'Browser': 'chrome', 'OS': 'win'}
5
Received message: {'Uri': '/', 'User': 'karley', 'TimeStamp': 1650412239000, 'Browser': 'chrome', 'OS': 'win'}
6
Received message: {'Uri': '/', 'User': 'ida', 'TimeStamp': 1650412241000, 'Browser': 'chrome', 'OS': 'win'}
7
Received message: {'Uri': '/products', 'User': 'gigi', 'TimeStamp': 1650412242000, 'Browser': 'chrome', 'OS': 'win'}
8
Received message: {'Uri': '/products', 'User': 'surah', 'TimeStamp': 1650412243000, 'Browse

                                                                                

Received message: {'Uri': '/blog', 'User': 'chris', 'TimeStamp': 1650412246000, 'Browser': 'chrome', 'OS': 'win'}
1
Received message: {'Uri': '/', 'User': 'patty', 'TimeStamp': 1650412248000, 'Browser': 'firefox', 'OS': 'win'}
2
Received message: {'Uri': '/', 'User': 'devin', 'TimeStamp': 1650412249000, 'Browser': 'edge', 'OS': 'win'}
3
Received message: {'Uri': '/services', 'User': 'patty', 'TimeStamp': 1650412251000, 'Browser': 'firefox', 'OS': 'win'}
4
Received message: {'Uri': '/', 'User': 'patty', 'TimeStamp': 1650412252000, 'Browser': 'firefox', 'OS': 'win'}
5
Received message: {'Uri': '/', 'User': 'quinn', 'TimeStamp': 1650412254000, 'Browser': 'chrome', 'OS': 'win'}
6
Received message: {'Uri': '/contact', 'User': 'fred', 'TimeStamp': 1650412256000, 'Browser': 'safari', 'OS': 'osx'}
7
Received message: {'Uri': '/about', 'User': 'abby', 'TimeStamp': 1650412258000, 'Browser': 'chrome', 'OS': 'osx'}
8
Received message: {'Uri': '/services', 'User': 'chris', 'TimeStamp': 165041226000

                                                                                

Received message: {'Uri': '/', 'User': 'ida', 'TimeStamp': 1650412264000, 'Browser': 'chrome', 'OS': 'win'}
1
Received message: {'Uri': '/about', 'User': 'karley', 'TimeStamp': 1650412265000, 'Browser': 'chrome', 'OS': 'win'}
2
Received message: {'Uri': '/', 'User': 'bob', 'TimeStamp': 1650412266000, 'Browser': 'firefox', 'OS': 'win'}
3
Received message: {'Uri': '/about', 'User': 'surah', 'TimeStamp': 1650412267000, 'Browser': 'firefox', 'OS': 'win'}
4
Received message: {'Uri': '/about', 'User': 'vaibhav', 'TimeStamp': 1650412269000, 'Browser': 'safari', 'OS': 'osx'}
5
Received message: {'Uri': '/', 'User': 'chris', 'TimeStamp': 1650412271000, 'Browser': 'chrome', 'OS': 'win'}
6
Received message: {'Uri': '/', 'User': 'quinn', 'TimeStamp': 1650412273000, 'Browser': 'chrome', 'OS': 'win'}
7
Received message: {'Uri': '/products', 'User': 'quinn', 'TimeStamp': 1650412274000, 'Browser': 'chrome', 'OS': 'win'}
8
Received message: {'Uri': '/', 'User': 'fred', 'TimeStamp': 1650412275000, 'Brow

                                                                                

In [45]:
df.show()

+-------+---+-------------+---------+-------+
|Browser| OS|    TimeStamp|      Uri|   User|
+-------+---+-------------+---------+-------+
|firefox|win|1650412138000|/services|   zeke|
| safari|osx|1650412140000|   /about|   otto|
|   edge|win|1650412141000|/products|   hank|
| chrome|win|1650412142000|        /|  nancy|
|firefox|win|1650412144000| /contact|  patty|
|firefox|win|1650412145000|        /|    bob|
|firefox|win|1650412147000|        /|  surah|
|firefox|win|1650412149000|   /about|   tosh|
| safari|osx|1650412150000|        /|   otto|
| chrome|win|1650412151000|   /about| karley|
| chrome|osx|1650412153000|/services|yolanda|
+-------+---+-------------+---------+-------+



In [35]:
df = None

In [41]:
row = spark.createDataFrame([payload])
if df is None:
    df = row
else:
    df = df.union(row)
    



In [42]:
df.show()

+-------+---+-------------+---+-----+
|Browser| OS|    TimeStamp|Uri| User|
+-------+---+-------------+---+-----+
|firefox|win|1650412136000|  /|patty|
|firefox|win|1650412136000|  /|patty|
|firefox|win|1650412136000|  /|patty|
+-------+---+-------------+---+-----+



In [23]:
df = spark.createDataFrame([payload])
df2 = spark.createDataFrame([payload])

In [24]:
df = df.union(df2)
df.show()

+-------+---+-------------+---+-----+
|Browser| OS|    TimeStamp|Uri| User|
+-------+---+-------------+---+-----+
|firefox|win|1650412136000|  /|patty|
|firefox|win|1650412136000|  /|patty|
+-------+---+-------------+---+-----+

