<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [None]:
from pyspark import SparkContext, SparkConf, SQLContext
from pyspark.sql import SparkSession
import json
from os import listdir
from os.path import isfile, join
import platform
import pandas as pd
from itertools import chain

In [None]:
# Directory holding the log files to analyse; it should contain nothing else.
LOGS_DIR_PATH = ""
# Message types kept by the preprocessing step.
MSG_TYPES = [
    'callListUpdate',
    'rosterUpdate',
    'callInfoUpdate',
]

In [None]:
def get_absolute_path(log_file):
    """Return the path of ``log_file`` inside ``LOGS_DIR_PATH``.

    The path is built with ``os.path.join`` so the separator always matches
    the running platform. The previous per-platform lookup returned an empty
    string on anything other than Linux or Windows (e.g. macOS).

    The result is only absolute when ``LOGS_DIR_PATH`` itself is absolute.

    Args:
        log_file: Name of a file located in the logs directory.

    Returns:
        ``LOGS_DIR_PATH`` joined with ``log_file``.
    """
    return join(LOGS_DIR_PATH, log_file)

In [None]:
def transform_msg(message):
    """Attach the date to a message payload.

    ``message`` is indexed as a pair: item 0 is the payload mapping and
    item 1 is its date. The payload is modified in place (its ``"date"``
    key is set) and that same object is returned.
    """
    payload = message[0]
    payload["date"] = message[1]
    return payload

In [None]:
def transform_call_list_or_roster_update(msg, date, bridge_num):
    """Flatten a message carrying an ``"updates"`` list into its entries.

    Each entry of ``msg["updates"]`` is tagged in place with the message's
    ``type``, ``messageId`` and ``subscriptionIndex`` plus the given
    ``date`` and ``bridge_num``.

    Returns:
        A new list containing the (mutated) update entries in order.
    """
    message_id = msg["messageId"]
    kind = msg["type"]
    subscription_index = msg["subscriptionIndex"]

    # Fields copied onto every entry, in the order they are written.
    common_fields = (
        ("type", kind),
        ("messageId", message_id),
        ("subscriptionIndex", subscription_index),
        ("date", date),
        ("bridge_num", bridge_num),
    )

    flattened = []
    for entry in msg["updates"]:
        for key, value in common_fields:
            entry[key] = value
        flattened.append(entry)
    return flattened

In [None]:
def transform_call_info(msg, date, bridge_num):
    """Turn a message carrying a ``"callInfo"`` mapping into a one-item list.

    ``msg["callInfo"]`` is tagged in place with the message's ``type``,
    ``messageId`` and ``subscriptionIndex`` plus the given ``date`` and
    ``bridge_num``.

    Returns:
        A single-element list holding the (mutated) call-info mapping.
    """
    message_id = msg["messageId"]
    kind = msg["type"]
    subscription_index = msg["subscriptionIndex"]
    info = msg["callInfo"]

    tagged_fields = (
        ("type", kind),
        ("messageId", message_id),
        ("subscriptionIndex", subscription_index),
        ("date", date),
        ("bridge_num", bridge_num),
    )
    for key, value in tagged_fields:
        info[key] = value
    return [info]

In [None]:
def transform(msg, date, bridge_num):
    """Dispatch ``msg`` to the transformer matching its ``"type"``.

    ``callInfoUpdate`` messages go to ``transform_call_info``; ``rosterUpdate``
    and ``callListUpdate`` messages go to
    ``transform_call_list_or_roster_update``. Any other type raises
    ``KeyError``.

    Returns:
        Whatever the selected transformer returns.
    """
    handlers = {
        "callInfoUpdate": transform_call_info,
        "rosterUpdate": transform_call_list_or_roster_update,
        "callListUpdate": transform_call_list_or_roster_update,
    }
    handler = handlers[msg["type"]]
    return handler(msg, date, bridge_num)

In [None]:
def preprocess_data(data, bridge_num):
    """Extract and flatten the relevant messages from one log's entries.

    Only entries whose ``"type"`` is ``"message"`` and whose inner message
    type is listed in ``MSG_TYPES`` are kept; each is passed through
    ``transform`` and the resulting lists are concatenated.

    Returns:
        A flat list with the transformed records, in input order.
    """
    flattened = []
    for entry in data:
        if entry["type"] != "message":
            continue
        if entry["message"]["type"] not in MSG_TYPES:
            continue
        flattened.extend(transform(entry["message"], entry["date"], bridge_num))
    return flattened

In [None]:
def join_all_logs(files):
    """Load every JSON log file and collect all preprocessed records.

    Each file's position in ``files`` is passed to ``preprocess_data`` as
    its ``bridge_num``. The number of top-level entries of every file and
    the final record count are printed as a progress indication.

    Args:
        files: Iterable of paths to JSON log files.

    Returns:
        A single list with the preprocessed records of all files.
    """
    all_data = []

    for bridge_num, log_file in enumerate(files):
        # Decode explicitly as UTF-8 instead of relying on the platform
        # default encoding (which is not UTF-8 on every system).
        with open(log_file, encoding="utf-8") as f:
            data = json.load(f)
        # The file is fully parsed at this point, so it is closed before
        # the records are processed.
        messages = preprocess_data(data, bridge_num)
        all_data.extend(messages)
        print(len(data))

    print(len(all_data))
    return all_data

In [None]:
# Resolve every entry of the logs directory to a path, then load them all.
log_files = list(map(get_absolute_path, listdir(LOGS_DIR_PATH)))
data = join_all_logs(log_files)
data

In [None]:
# Split the flattened records into one DataFrame per message type.
df_info, df_list, df_roster = (
    pd.DataFrame([record for record in data if record["type"] == kind])
    for kind in ("callInfoUpdate", "callListUpdate", "rosterUpdate")
)

In [None]:
# conf = SparkConf().setAppName("LogsAnalysis").setMaster("local[*]")
# sc = SparkContext.getOrCreate(conf=conf)
# ss = SparkSession.builder.getOrCreate()
# sqlc = SQLContext(sc)

In [None]:
# Inspect the callListUpdate records.
df_list

In [None]:
# Distinct values of the "call" and "callCorrelator" columns.
calls = list(pd.unique(df_list["call"]))
call_correlators = list(pd.unique(df_list["callCorrelator"]))
# Values that occur in both columns.
list(set(calls).intersection(call_correlators))

In [None]:
# Build one DataFrame per distinct call: its rows ordered by date and
# restricted to the columns listed below.
dfs_calls = []
call_columns = ["call", "updateType", "name", "participants", "streaming", "recording", "date", "reason"]

for call in calls:
    # .copy() gives an independent frame, so the column assignment below
    # cannot trigger pandas' chained-assignment warning.
    call_data = df_list[df_list["call"] == call].sort_values(by=["date"])[call_columns].copy()
    # Rows without a name take the most recent earlier name of the same call.
    # Series.ffill() replaces the deprecated fillna(method="ffill").
    call_data["name"] = call_data["name"].ffill()
    dfs_calls.append(call_data)

In [None]:
# Inspect the frame of the first call.
dfs_calls[0]

In [None]:
# Inspect the frame at index 3; raises IndexError if there are fewer than four calls.
dfs_calls[3]

In [None]:
# Stack the per-call frames into a single DataFrame and display it.
all_calls = pd.concat(dfs_calls)
all_calls

In [None]:
# Export without the index column; `index` is documented as a boolean, so
# pass False explicitly instead of relying on None being falsy.
all_calls.to_csv("all_calls.csv", index=False)

In [None]:
# Inspect the rosterUpdate records.
df_roster

In [None]:
# Build one DataFrame per distinct participant: its rows ordered by date and
# restricted to the columns listed below.
participants = pd.unique(df_roster["participant"])
roster_columns = ["participant", "updateType", "name", "uri", "state", "activeSpeaker", "presenter", "date"]

dfs_roster = []

for p in participants:
    # .copy() gives an independent frame, so the column assignment below
    # cannot trigger pandas' chained-assignment warning.
    df_part = df_roster[df_roster["participant"] == p].sort_values(by=["date"])[roster_columns].copy()
    # Rows without a name take the most recent earlier name of the same
    # participant. Series.ffill() replaces the deprecated fillna(method="ffill").
    df_part["name"] = df_part["name"].ffill()
    dfs_roster.append(df_part)

In [None]:
# Inspect the frame of the first participant.
dfs_roster[0]

In [None]:
# Inspect the frame at index 10; raises IndexError if there are fewer than eleven participants.
dfs_roster[10]

In [None]:
# Stack the per-participant frames into a single DataFrame and display it.
all_participants = pd.concat(dfs_roster)
all_participants

In [None]:
# Export without the index column; `index` is documented as a boolean, so
# pass False explicitly instead of relying on None being falsy.
all_participants.to_csv("all_participants.csv", index=False)

In [None]:
# Inspect the callInfoUpdate records.
df_info