In [124]:
from pyspark import SparkContext, SparkConf, SQLContext
from pyspark.sql import SparkSession
import json
from os import listdir
from os.path import isfile, join
import platform
import pandas as pd
from itertools import chain

In [107]:
# Directory that holds the log files; it must contain log files only,
# because every directory entry is loaded as a log.
LOGS_DIR_PATH = ""

# Message types kept during preprocessing; messages of any other type are dropped.
MSG_TYPES = [
    "callListUpdate",
    "rosterUpdate",
    "callInfoUpdate",
]

In [108]:
def get_absolute_path(log_file):
    """Return the path of ``log_file`` inside ``LOGS_DIR_PATH``.

    The path is built with ``os.path.join`` so the separator always matches
    the platform the notebook runs on, including platforms other than Linux
    and Windows (e.g. macOS).

    Args:
        log_file: Name of a file located in ``LOGS_DIR_PATH``.

    Returns:
        ``LOGS_DIR_PATH`` joined with ``log_file``. The result is absolute
        only when ``LOGS_DIR_PATH`` itself is absolute.
    """
    return join(LOGS_DIR_PATH, log_file)

In [109]:
def transform_msg(message):
    """Attach the date to a message dict and return that dict.

    ``message`` is indexed positionally: item 0 is the message dict and
    item 1 is its date. The dict is modified in place (its ``"date"`` key is
    set) and the very same object is returned.
    """
    payload = message[0]
    payload["date"] = message[1]
    return payload

In [110]:
def transform_call_list_or_roster_update(msg, date, bridge_num):
    """Flatten a message carrying an ``"updates"`` list into tagged records.

    Every entry of ``msg["updates"]`` is tagged in place with the parent
    message's ``type``, ``messageId`` and ``subscriptionIndex`` plus the
    supplied ``date`` and ``bridge_num``.

    Args:
        msg: Message dict with ``messageId``, ``type``, ``subscriptionIndex``
            and ``updates`` keys.
        date: Value stored under ``"date"`` on every update.
        bridge_num: Value stored under ``"bridge_num"`` on every update.

    Returns:
        A new list holding the (mutated) update dicts in their original order.
    """
    message_id = msg["messageId"]
    kind = msg["type"]
    sub_index = msg["subscriptionIndex"]

    # Fields shared by every update of this message, in the order they are written.
    shared = {
        "type": kind,
        "messageId": message_id,
        "subscriptionIndex": sub_index,
        "date": date,
        "bridge_num": bridge_num,
    }

    enriched = []
    for entry in msg["updates"]:
        entry.update(shared)
        enriched.append(entry)
    return enriched

In [111]:
def transform_call_info(msg, date, bridge_num):
    """Turn a message carrying a ``"callInfo"`` dict into a one-record list.

    The ``msg["callInfo"]`` dict is tagged in place with the parent message's
    ``type``, ``messageId`` and ``subscriptionIndex`` plus the supplied
    ``date`` and ``bridge_num``.

    Args:
        msg: Message dict with ``messageId``, ``type``, ``subscriptionIndex``
            and ``callInfo`` keys.
        date: Value stored under ``"date"``.
        bridge_num: Value stored under ``"bridge_num"``.

    Returns:
        A single-element list containing the (mutated) ``callInfo`` dict.
    """
    message_id = msg["messageId"]
    kind = msg["type"]
    sub_index = msg["subscriptionIndex"]
    info = msg["callInfo"]
    info.update(
        {
            "type": kind,
            "messageId": message_id,
            "subscriptionIndex": sub_index,
            "date": date,
            "bridge_num": bridge_num,
        }
    )
    return [info]

In [112]:
def transform(msg, date, bridge_num):
    """Dispatch ``msg`` to the transformer registered for its ``"type"``.

    ``callInfoUpdate`` messages go to ``transform_call_info``;
    ``rosterUpdate`` and ``callListUpdate`` messages go to
    ``transform_call_list_or_roster_update``.

    Returns:
        Whatever the selected transformer returns for ``(msg, date, bridge_num)``.

    Raises:
        KeyError: If ``msg["type"]`` is not one of the three handled types.
    """
    handlers = {
        "callInfoUpdate": transform_call_info,
        "rosterUpdate": transform_call_list_or_roster_update,
        "callListUpdate": transform_call_list_or_roster_update,
    }
    handler = handlers[msg["type"]]
    return handler(msg, date, bridge_num)

In [113]:
def preprocess_data(data, bridge_num):
    """Filter raw log entries and flatten them into one list of records.

    Only entries whose ``"type"`` is ``"message"`` and whose inner message
    type is listed in ``MSG_TYPES`` are kept. Each kept entry is passed to
    ``transform`` together with its ``"date"`` and ``bridge_num``; the lists
    returned by ``transform`` are concatenated in input order.

    Args:
        data: Iterable of log entry dicts.
        bridge_num: Value forwarded to ``transform`` for every entry.

    Returns:
        A flat list with all records produced by ``transform``.
    """
    records = []
    for el in data:
        if el["type"] != "message":
            continue
        body = el["message"]
        if body["type"] not in MSG_TYPES:
            continue
        records.extend(transform(body, el["date"], bridge_num))
    return records

In [114]:
def join_all_logs(files):
    """Load every JSON log file and merge the preprocessed records.

    Each file's position in ``files`` is passed to ``preprocess_data`` as the
    bridge number, so the order of ``files`` determines the ``bridge_num``
    stored on the resulting records.

    Args:
        files: Iterable of paths to JSON log files.

    Returns:
        One flat list with the preprocessed records of all files, in file order.

    Side effects:
        Prints the number of raw entries of each file, then the total number
        of records kept.
    """
    all_data = []

    for bridge_num, log_file in enumerate(files):
        # Read explicitly as UTF-8: the logs contain non-ASCII names and the
        # locale default encoding (e.g. cp125x on Windows) would garble them
        # or fail to decode.
        with open(log_file, encoding="utf-8") as f:
            data = json.load(f)
        # The file is already closed here; only the parsed data is needed.
        messages = preprocess_data(data, bridge_num)
        all_data.extend(messages)
        print(len(data))

    print(len(all_data))
    return all_data

In [115]:
# os.listdir returns entries in arbitrary order. Sort them so that the
# bridge_num assigned by join_all_logs (the file's position in this list)
# is the same on every run and on every machine.
log_files = [get_absolute_path(file) for file in sorted(listdir(LOGS_DIR_PATH))]
data = join_all_logs(log_files)
data

55403
52051
65893
55759
49699


[{'call': '835b808c-eb3d-4b6f-b82b-c312eb86cba0',
  'updateType': 'add',
  'callCorrelator': 'ffe3cfd2-deb1-4050-9133-f02e57af45b6',
  'name': '[PWSZ] Elektryczność i elektronika (GZ-C)',
  'participants': 0,
  'distributedInstances': 0,
  'streaming': 'inactive',
  'recording': 'active',
  'endpointRecording': 'inactive',
  'lockState': 'locked',
  'callType': 'coSpace',
  'type': 'callListUpdate',
  'messageId': 3,
  'subscriptionIndex': 3,
  'date': '2020-06-02T09:17:55.428965',
  'bridge_num': 0},
 {'participant': '596ffb52-b285-4e55-b812-d718b8021cb4',
  'updateType': 'add',
  'name': 'Przemysław Sułek',
  'uri': 'guest265363688@meet.mche.edu.pl',
  'state': 'initial',
  'direction': 'outgoing',
  'canMove': False,
  'audioMuted': False,
  'videoMuted': False,
  'importance': None,
  'layout': 'onePlusFive',
  'activeSpeaker': False,
  'presenter': False,
  'endpointRecording': 'inactive',
  'type': 'rosterUpdate',
  'messageId': 6,
  'subscriptionIndex': 1,
  'date': '2020-06-02T

In [120]:
# Split the flat record list into one DataFrame per message type.
df_info, df_list, df_roster = (
    pd.DataFrame([el for el in data if el["type"] == msg_type])
    for msg_type in ("callInfoUpdate", "callListUpdate", "rosterUpdate")
)

In [125]:
# conf = SparkConf().setAppName("LogsAnalysis").setMaster("local[*]")
# sc = SparkContext.getOrCreate(conf=conf)
# ss = SparkSession.builder.getOrCreate()
# sqlc = SQLContext(sc)

In [180]:
# Display the frame built from the "callListUpdate" records.
df_list

Unnamed: 0,call,updateType,callCorrelator,name,participants,distributedInstances,streaming,recording,endpointRecording,lockState,callType,type,messageId,subscriptionIndex,date,bridge_num,reason
0,835b808c-eb3d-4b6f-b82b-c312eb86cba0,add,ffe3cfd2-deb1-4050-9133-f02e57af45b6,[PWSZ] Elektryczność i elektronika (GZ-C),0.0,0.0,inactive,active,inactive,locked,coSpace,callListUpdate,3,3,2020-06-02T09:17:55.428965,0,
1,835b808c-eb3d-4b6f-b82b-c312eb86cba0,update,,,,1.0,,,,,,callListUpdate,8,3,2020-06-02T09:17:55.553975,0,
2,835b808c-eb3d-4b6f-b82b-c312eb86cba0,update,,,,2.0,,,,,,callListUpdate,10,3,2020-06-02T09:17:55.571976,0,
3,835b808c-eb3d-4b6f-b82b-c312eb86cba0,update,,,,3.0,,,,notLocked,,callListUpdate,12,3,2020-06-02T09:17:55.588977,0,
4,835b808c-eb3d-4b6f-b82b-c312eb86cba0,update,,,2.0,,,,,,,callListUpdate,14,3,2020-06-02T09:17:55.762990,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4591,107ff145-3c39-43dd-becd-c272a6bd2827,update,,,,,,inactive,,,,callListUpdate,20,3,2020-06-09T15:53:11.608802,3,
4592,107ff145-3c39-43dd-becd-c272a6bd2827,update,,,2.0,,,,,,,callListUpdate,22,3,2020-06-09T15:53:11.610804,3,
4593,107ff145-3c39-43dd-becd-c272a6bd2827,add,4b44f254-ce83-41e6-abf1-30d58c019692,Dawid Suder - wirtualny pokój,2.0,0.0,inactive,inactive,inactive,notLocked,coSpace,callListUpdate,3,3,2020-06-09T15:53:14.897030,3,
4594,107ff145-3c39-43dd-becd-c272a6bd2827,update,,,1.0,,,,,,,callListUpdate,12,3,2020-06-09T15:53:19.552363,3,


In [129]:
# Distinct call ids and call correlator ids seen in the call-list updates.
calls = list(df_list["call"].unique())
call_correlators = list(df_list["callCorrelator"].unique())

# Sanity check: values that occur both as a call id and as a correlator id.
list(set(calls).intersection(call_correlators))

[]

In [154]:
# Columns kept for the per-call history.
call_columns = ["call", "updateType", "name", "participants", "streaming", "recording", "date", "reason"]

# One frame per call id: that call's updates ordered by date.
dfs_calls = []

for call in calls:
    call_data = (
        df_list[df_list["call"] == call]
        .sort_values(by=["date"])[call_columns]
        # Rows without a name inherit the most recent name seen for this call.
        # .ffill() replaces fillna(method="ffill"), which is deprecated in
        # recent pandas; .assign() avoids writing into a derived slice.
        .assign(name=lambda frame: frame["name"].ffill())
    )
    dfs_calls.append(call_data)

In [161]:
# Inspect the date-ordered update history of the first call in `calls`.
dfs_calls[0]

Unnamed: 0,call,updateType,name,participants,streaming,recording,date,reason
0,835b808c-eb3d-4b6f-b82b-c312eb86cba0,add,[PWSZ] Elektryczność i elektronika (GZ-C),0.0,inactive,active,2020-06-02T09:17:55.428965,
1,835b808c-eb3d-4b6f-b82b-c312eb86cba0,update,[PWSZ] Elektryczność i elektronika (GZ-C),,,,2020-06-02T09:17:55.553975,
2,835b808c-eb3d-4b6f-b82b-c312eb86cba0,update,[PWSZ] Elektryczność i elektronika (GZ-C),,,,2020-06-02T09:17:55.571976,
3,835b808c-eb3d-4b6f-b82b-c312eb86cba0,update,[PWSZ] Elektryczność i elektronika (GZ-C),,,,2020-06-02T09:17:55.588977,
4,835b808c-eb3d-4b6f-b82b-c312eb86cba0,update,[PWSZ] Elektryczność i elektronika (GZ-C),2.0,,,2020-06-02T09:17:55.762990,
5,835b808c-eb3d-4b6f-b82b-c312eb86cba0,update,[PWSZ] Elektryczność i elektronika (GZ-C),3.0,,,2020-06-02T09:17:55.925014,
6,835b808c-eb3d-4b6f-b82b-c312eb86cba0,update,[PWSZ] Elektryczność i elektronika (GZ-C),9.0,,,2020-06-02T09:17:55.961013,
7,835b808c-eb3d-4b6f-b82b-c312eb86cba0,add,[PWSZ] Elektryczność i elektronika (GZ-C),9.0,inactive,active,2020-06-02T09:18:24.821187,
8,835b808c-eb3d-4b6f-b82b-c312eb86cba0,remove,[PWSZ] Elektryczność i elektronika (GZ-C),,,,2020-06-02T09:18:39.819324,allLeft


In [162]:
# Inspect the date-ordered update history of another call (position 3 in `calls`).
dfs_calls[3]

Unnamed: 0,call,updateType,name,participants,streaming,recording,date,reason
68,626fb875-1c90-4cf2-ae70-c99367fe19df,add,[PK] Budownictwo (GZ-B),1.0,inactive,active,2020-06-04T07:27:44.618918,
69,626fb875-1c90-4cf2-ae70-c99367fe19df,update,[PK] Budownictwo (GZ-B),,,,2020-06-04T07:27:44.788930,
70,626fb875-1c90-4cf2-ae70-c99367fe19df,add,[PK] Budownictwo (GZ-B),1.0,inactive,active,2020-06-04T07:28:04.009397,
71,626fb875-1c90-4cf2-ae70-c99367fe19df,add,[PK] Budownictwo (GZ-B),1.0,inactive,active,2020-06-04T07:28:46.106561,
72,626fb875-1c90-4cf2-ae70-c99367fe19df,add,[PK] Budownictwo (GZ-B),1.0,inactive,active,2020-06-04T07:29:28.211744,
73,626fb875-1c90-4cf2-ae70-c99367fe19df,add,[PK] Budownictwo (GZ-B),1.0,inactive,active,2020-06-04T07:30:10.312921,
74,626fb875-1c90-4cf2-ae70-c99367fe19df,add,[PK] Budownictwo (GZ-B),1.0,inactive,active,2020-06-04T07:30:52.417108,
75,626fb875-1c90-4cf2-ae70-c99367fe19df,add,[PK] Budownictwo (GZ-B),1.0,inactive,active,2020-06-04T07:31:34.517293,
76,626fb875-1c90-4cf2-ae70-c99367fe19df,add,[PK] Budownictwo (GZ-B),1.0,inactive,active,2020-06-04T07:32:16.618464,
77,626fb875-1c90-4cf2-ae70-c99367fe19df,remove,[PK] Budownictwo (GZ-B),,,,2020-06-04T07:32:17.462525,allLeft


In [159]:
# Stack the per-call frames into one frame; rows stay grouped call by call
# and keep their original index labels.
all_calls = pd.concat(dfs_calls)
all_calls

Unnamed: 0,call,updateType,name,participants,streaming,recording,date,reason
0,835b808c-eb3d-4b6f-b82b-c312eb86cba0,add,[PWSZ] Elektryczność i elektronika (GZ-C),0.0,inactive,active,2020-06-02T09:17:55.428965,
1,835b808c-eb3d-4b6f-b82b-c312eb86cba0,update,[PWSZ] Elektryczność i elektronika (GZ-C),,,,2020-06-02T09:17:55.553975,
2,835b808c-eb3d-4b6f-b82b-c312eb86cba0,update,[PWSZ] Elektryczność i elektronika (GZ-C),,,,2020-06-02T09:17:55.571976,
3,835b808c-eb3d-4b6f-b82b-c312eb86cba0,update,[PWSZ] Elektryczność i elektronika (GZ-C),,,,2020-06-02T09:17:55.588977,
4,835b808c-eb3d-4b6f-b82b-c312eb86cba0,update,[PWSZ] Elektryczność i elektronika (GZ-C),2.0,,,2020-06-02T09:17:55.762990,
...,...,...,...,...,...,...,...,...
4591,107ff145-3c39-43dd-becd-c272a6bd2827,update,Dawid Suder - wirtualny pokój,,,inactive,2020-06-09T15:53:11.608802,
4592,107ff145-3c39-43dd-becd-c272a6bd2827,update,Dawid Suder - wirtualny pokój,2.0,,,2020-06-09T15:53:11.610804,
4593,107ff145-3c39-43dd-becd-c272a6bd2827,add,Dawid Suder - wirtualny pokój,2.0,inactive,inactive,2020-06-09T15:53:14.897030,
4594,107ff145-3c39-43dd-becd-c272a6bd2827,update,Dawid Suder - wirtualny pokój,1.0,,,2020-06-09T15:53:19.552363,


In [158]:
# Export the combined call history without the index column.
# `index` is documented as a bool, so pass False explicitly instead of
# relying on None being falsy.
all_calls.to_csv("all_calls.csv", index=False)

In [164]:
# Display the frame built from the "rosterUpdate" records.
df_roster

Unnamed: 0,participant,updateType,name,uri,state,direction,canMove,audioMuted,videoMuted,importance,...,presenter,endpointRecording,type,messageId,subscriptionIndex,date,bridge_num,reason,movedParticipant,movedParticipantCallBridge
0,596ffb52-b285-4e55-b812-d718b8021cb4,add,Przemysław Sułek,guest265363688@meet.mche.edu.pl,initial,outgoing,False,False,False,,...,False,inactive,rosterUpdate,6,1,2020-06-02T09:17:55.478968,0,,,
1,be4da92e-43c7-4400-b801-1e975bce91cb,add,s.porebski,s.porebski@meet.mche.edu.pl,connected,outgoing,False,False,False,,...,False,inactive,rosterUpdate,15,1,2020-06-02T09:17:55.763991,0,,,
2,2ce58d89-b582-43fe-b790-3f1800011d4a,add,Mateusz Kącki,guest2973839663@meet.mche.edu.pl,connected,outgoing,False,False,False,,...,False,inactive,rosterUpdate,15,1,2020-06-02T09:17:55.763991,0,,,
3,ba63969a-2c0e-49a9-9f4b-5fb174e56e6c,add,j.cabaj,j.cabaj@meet.mche.edu.pl,connected,outgoing,False,False,False,,...,False,inactive,rosterUpdate,18,1,2020-06-02T09:17:55.926011,0,,,
4,d027781d-5ff9-4a7c-97e5-b4da23e0ccf3,add,Tomasz Cąber,guest68489067@meet.mche.edu.pl,initial,outgoing,False,False,False,,...,False,inactive,rosterUpdate,18,1,2020-06-02T09:17:55.926011,0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40832,7b275709-1ce2-4ac9-8fca-a5105fa315a1,add,,mik.sondej@free.calls.webex.com,connected,outgoing,True,False,False,,...,False,inactive,rosterUpdate,6,1,2020-06-09T15:53:14.901030,3,,e0bfff2a-ee7f-0000-3e81-8a9cca550000,ab000000-0000-0000-2070-43aeca550000
40833,70159f3f-2fa4-4fd1-ab5b-b2c44a4893a1,update,,,,,,,,,...,,,rosterUpdate,8,1,2020-06-09T15:53:15.989108,3,,,
40834,70159f3f-2fa4-4fd1-ab5b-b2c44a4893a1,update,,,,,,,,,...,,,rosterUpdate,9,1,2020-06-09T15:53:16.982176,3,,,
40835,70159f3f-2fa4-4fd1-ab5b-b2c44a4893a1,update,,,,,,,,,...,,,rosterUpdate,10,1,2020-06-09T15:53:19.549360,3,,,


In [174]:
participants = pd.unique(df_roster["participant"])

# Columns kept for the per-participant history.
roster_columns = ["participant", "updateType", "name", "uri", "state", "activeSpeaker", "presenter"]

# One frame per participant id: that participant's updates ordered by date.
dfs_roster = []

for p in participants:
    df_part = (
        df_roster[df_roster["participant"] == p]
        # Sort before selecting columns: "date" is not among the kept columns.
        .sort_values(by=["date"])[roster_columns]
        # Rows without a name inherit the most recent name seen for this
        # participant. .ffill() replaces fillna(method="ffill"), which is
        # deprecated in recent pandas; .assign() avoids writing into a
        # derived slice.
        .assign(name=lambda frame: frame["name"].ffill())
    )
    dfs_roster.append(df_part)

In [179]:
# Inspect the date-ordered update history of the first participant in `participants`.
dfs_roster[0]

Unnamed: 0,participant,updateType,name,uri,state,activeSpeaker,presenter
0,596ffb52-b285-4e55-b812-d718b8021cb4,add,Przemysław Sułek,guest265363688@meet.mche.edu.pl,initial,False,False
7756,596ffb52-b285-4e55-b812-d718b8021cb4,add,Przemysław Sułek,guest265363688@meet.mche.edu.pl,initial,False,False
31237,596ffb52-b285-4e55-b812-d718b8021cb4,add,Przemysław Sułek,guest265363688@meet.mche.edu.pl,initial,False,False
16303,596ffb52-b285-4e55-b812-d718b8021cb4,add,Przemysław Sułek,guest265363688@meet.mche.edu.pl,initial,False,False
31248,596ffb52-b285-4e55-b812-d718b8021cb4,add,Przemysław Sułek,guest265363688@meet.mche.edu.pl,initial,False,False
16314,596ffb52-b285-4e55-b812-d718b8021cb4,add,Przemysław Sułek,guest265363688@meet.mche.edu.pl,initial,False,False
11,596ffb52-b285-4e55-b812-d718b8021cb4,add,Przemysław Sułek,guest265363688@meet.mche.edu.pl,initial,False,False
7767,596ffb52-b285-4e55-b812-d718b8021cb4,add,Przemysław Sułek,guest265363688@meet.mche.edu.pl,initial,False,False
23,596ffb52-b285-4e55-b812-d718b8021cb4,remove,Przemysław Sułek,,,,
16316,596ffb52-b285-4e55-b812-d718b8021cb4,remove,Przemysław Sułek,,,,


In [181]:
# Inspect the date-ordered update history of another participant (position 10 in `participants`).
dfs_roster[10]

Unnamed: 0,participant,updateType,name,uri,state,activeSpeaker,presenter
7744,7bd8b3fa-c8df-4c9f-9ac2-fa2e946590d6,add,k.kolin,k.kolin@meet.mche.edu.pl,initial,False,False
16291,7bd8b3fa-c8df-4c9f-9ac2-fa2e946590d6,add,k.kolin,k.kolin@meet.mche.edu.pl,initial,False,False
31235,7bd8b3fa-c8df-4c9f-9ac2-fa2e946590d6,add,k.kolin,k.kolin@meet.mche.edu.pl,initial,False,False
7745,7bd8b3fa-c8df-4c9f-9ac2-fa2e946590d6,update,k.kolin,,connected,,
31236,7bd8b3fa-c8df-4c9f-9ac2-fa2e946590d6,update,k.kolin,,connected,,
...,...,...,...,...,...,...,...
219,7bd8b3fa-c8df-4c9f-9ac2-fa2e946590d6,update,k.kolin,,initial,,
31455,7bd8b3fa-c8df-4c9f-9ac2-fa2e946590d6,update,k.kolin,,initial,,
16520,7bd8b3fa-c8df-4c9f-9ac2-fa2e946590d6,remove,k.kolin,,,,
220,7bd8b3fa-c8df-4c9f-9ac2-fa2e946590d6,remove,k.kolin,,,,


In [183]:
# Stack the per-participant frames into one frame; rows stay grouped
# participant by participant and keep their original index labels.
all_participants = pd.concat(dfs_roster)
all_participants

Unnamed: 0,participant,updateType,name,uri,state,activeSpeaker,presenter
0,596ffb52-b285-4e55-b812-d718b8021cb4,add,Przemysław Sułek,guest265363688@meet.mche.edu.pl,initial,False,False
7756,596ffb52-b285-4e55-b812-d718b8021cb4,add,Przemysław Sułek,guest265363688@meet.mche.edu.pl,initial,False,False
31237,596ffb52-b285-4e55-b812-d718b8021cb4,add,Przemysław Sułek,guest265363688@meet.mche.edu.pl,initial,False,False
16303,596ffb52-b285-4e55-b812-d718b8021cb4,add,Przemysław Sułek,guest265363688@meet.mche.edu.pl,initial,False,False
31248,596ffb52-b285-4e55-b812-d718b8021cb4,add,Przemysław Sułek,guest265363688@meet.mche.edu.pl,initial,False,False
...,...,...,...,...,...,...,...
40792,7b275709-1ce2-4ac9-8fca-a5105fa315a1,add,,mik.sondej@free.calls.webex.com,connected,False,False
40797,7b275709-1ce2-4ac9-8fca-a5105fa315a1,add,,mik.sondej@free.calls.webex.com,connected,False,False
40808,7b275709-1ce2-4ac9-8fca-a5105fa315a1,add,,mik.sondej@free.calls.webex.com,connected,False,False
40816,7b275709-1ce2-4ac9-8fca-a5105fa315a1,add,,mik.sondej@free.calls.webex.com,connected,False,False


In [184]:
# Export the combined participant history without the index column.
# `index` is documented as a bool, so pass False explicitly instead of
# relying on None being falsy.
all_participants.to_csv("all_participants.csv", index=False)

In [185]:
# Display the frame built from the "callInfoUpdate" records.
df_info

Unnamed: 0,name,participants,streaming,recording,endpointRecording,joinAudioMuteOverride,lockState,callType,callCorrelator,distributedInstances,type,messageId,subscriptionIndex,date,bridge_num
0,[PWSZ] Elektryczność i elektronika (GZ-C),0.0,inactive,active,inactive,,locked,coSpace,ffe3cfd2-deb1-4050-9133-f02e57af45b6,0.0,callInfoUpdate,7,2,2020-06-02T09:17:55.479968,0
1,,,,,,,,,,1.0,callInfoUpdate,9,2,2020-06-02T09:17:55.554973,0
2,,,,,,,,,,2.0,callInfoUpdate,11,2,2020-06-02T09:17:55.572975,0
3,,,,,,,notLocked,,,3.0,callInfoUpdate,13,2,2020-06-02T09:17:55.589977,0
4,,2.0,,,,,,,,,callInfoUpdate,16,2,2020-06-02T09:17:55.764991,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4261,Dawid Suder - wirtualny pokój,3.0,inactive,active,inactive,,notLocked,coSpace,4b44f254-ce83-41e6-abf1-30d58c019692,0.0,callInfoUpdate,7,2,2020-06-09T15:52:32.811078,3
4262,,,,inactive,,,,,,,callInfoUpdate,21,2,2020-06-09T15:53:11.610804,3
4263,,2.0,,,,,,,,,callInfoUpdate,24,2,2020-06-09T15:53:11.612801,3
4264,Dawid Suder - wirtualny pokój,2.0,inactive,inactive,inactive,,notLocked,coSpace,4b44f254-ce83-41e6-abf1-30d58c019692,0.0,callInfoUpdate,7,2,2020-06-09T15:53:14.902031,3
