In [1]:
from pyspark import SparkContext, SparkConf, SQLContext
from pyspark.sql import SparkSession
import json
from os import listdir
from os.path import isfile, join
import platform
import pandas as pd
from itertools import chain

In [2]:
LOGS_DIR_PATH = "/run/media/jola/DATA/JOLEG/inzynierka/logs" # path to dir with log files (only)
MSG_TYPES = ['callListUpdate', 'rosterUpdate','callInfoUpdate']

## Basic functions for logs preprocessing

In [3]:
def get_absolute_path(log_file):
    paths = {
        "Linux" : LOGS_DIR_PATH + "/" + log_file,
        "Windows" : LOGS_DIR_PATH + "\\" + log_file
    }
    return paths.get(platform.system(), '')

In [4]:
def transform_msg(message):
    msg = message[0]
    date = message[1]
    msg["date"] = date
    return msg

In [5]:
def transform_call_list_or_roster_update(msg, date, bridge_num):
    msg_id = msg["messageId"]
    msg_type = msg["type"]
    subscriptionIndex = msg["subscriptionIndex"]
    updates = []
    for update in msg["updates"]:
        update["type"] = msg_type
        update["messageId"] = msg_id
        update["subscriptionIndex"] = subscriptionIndex
        update["date"] = date
        update["bridge_num"] = bridge_num
        updates.append(update)
    return updates

In [6]:
def transform_call_info(msg, date, bridge_num):
    msg_id = msg["messageId"]
    msg_type = msg["type"]
    subscriptionIndex = msg["subscriptionIndex"]
    call_info = msg["callInfo"]
    call_info["type"] = msg_type
    call_info["messageId"] = msg_id
    call_info["subscriptionIndex"] = subscriptionIndex
    call_info["date"] = date
    call_info["bridge_num"] = bridge_num
    return [call_info]

In [7]:
def transform(msg, date, bridge_num):
    transformation = {
        "callInfoUpdate" : lambda x: transform_call_info(x, date, bridge_num),
        "rosterUpdate" : lambda x: transform_call_list_or_roster_update(x, date, bridge_num),
        "callListUpdate" : lambda x: transform_call_list_or_roster_update(x, date, bridge_num)
    }
    return transformation[msg["type"]](msg)

In [8]:
def preprocess_data(data, bridge_num):
    messages = [transform(el["message"], el["date"], bridge_num) for el in data if el["type"] == "message" and el["message"]["type"] in MSG_TYPES]
    return list(chain(*messages))

In [9]:
def join_all_logs(files):
    all_data = []
    
    for i,log_file in enumerate(files):
        with open(log_file) as f:
            data = json.load(f)
            messages = preprocess_data(data, i)
            all_data.extend(messages)
            print(len(data))
    
    print(len(all_data))
    return all_data

## Extract all callListUpdate events and save them to file

In [10]:
def get_all_events(files):
    all_data = []
    
    for i,log_file in enumerate(files):
        with open(log_file) as f:
            data = json.load(f)
            data = [el for el in data if el["type"] == "message" and el["message"]["type"] == "callListUpdate"]
            all_data.extend(data)
            print(len(data))
    
    print(len(all_data))
    return all_data

In [11]:
log_files = [get_absolute_path(file) for file in listdir(LOGS_DIR_PATH)]
data = get_all_events(log_files)

618
1004
1634
1177
4433


In [12]:
data[0].keys()

dict_keys(['type', 'message', 'date'])

In [13]:
data[0]["message"].keys()

dict_keys(['messageId', 'type', 'subscriptionIndex', 'updates'])

In [14]:
data[0]["message"]

{'messageId': 3,
 'type': 'callListUpdate',
 'subscriptionIndex': 3,
 'updates': [{'call': '835b808c-eb3d-4b6f-b82b-c312eb86cba0',
   'updateType': 'add',
   'callCorrelator': 'ffe3cfd2-deb1-4050-9133-f02e57af45b6',
   'name': '[PWSZ] Elektryczność i elektronika (GZ-C)',
   'participants': 0,
   'distributedInstances': 0,
   'streaming': 'inactive',
   'recording': 'active',
   'endpointRecording': 'inactive',
   'lockState': 'locked',
   'callType': 'coSpace'}]}

In [15]:
data[0]

{'type': 'message',
 'message': {'messageId': 3,
  'type': 'callListUpdate',
  'subscriptionIndex': 3,
  'updates': [{'call': '835b808c-eb3d-4b6f-b82b-c312eb86cba0',
    'updateType': 'add',
    'callCorrelator': 'ffe3cfd2-deb1-4050-9133-f02e57af45b6',
    'name': '[PWSZ] Elektryczność i elektronika (GZ-C)',
    'participants': 0,
    'distributedInstances': 0,
    'streaming': 'inactive',
    'recording': 'active',
    'endpointRecording': 'inactive',
    'lockState': 'locked',
    'callType': 'coSpace'}]},
 'date': '2020-06-02T09:17:55.428965'}

In [16]:
def write_json(data, filename='data.json'): 
    with open(filename,'w') as f: 
        json.dump(data, f) 

In [17]:
write_json(data, 'calls_data.json')

## Load and preprocess all logs data

### Load all logs data

In [18]:
log_files = [get_absolute_path(file) for file in listdir(LOGS_DIR_PATH)]
data = join_all_logs(log_files)
data

55403
52051
65893
55759
49699


[{'call': '835b808c-eb3d-4b6f-b82b-c312eb86cba0',
  'updateType': 'add',
  'callCorrelator': 'ffe3cfd2-deb1-4050-9133-f02e57af45b6',
  'name': '[PWSZ] Elektryczność i elektronika (GZ-C)',
  'participants': 0,
  'distributedInstances': 0,
  'streaming': 'inactive',
  'recording': 'active',
  'endpointRecording': 'inactive',
  'lockState': 'locked',
  'callType': 'coSpace',
  'type': 'callListUpdate',
  'messageId': 3,
  'subscriptionIndex': 3,
  'date': '2020-06-02T09:17:55.428965',
  'bridge_num': 0},
 {'participant': '596ffb52-b285-4e55-b812-d718b8021cb4',
  'updateType': 'add',
  'name': 'Przemysław Sułek',
  'uri': 'guest265363688@meet.mche.edu.pl',
  'state': 'initial',
  'direction': 'outgoing',
  'canMove': False,
  'audioMuted': False,
  'videoMuted': False,
  'importance': None,
  'layout': 'onePlusFive',
  'activeSpeaker': False,
  'presenter': False,
  'endpointRecording': 'inactive',
  'type': 'rosterUpdate',
  'messageId': 6,
  'subscriptionIndex': 1,
  'date': '2020-06-02T

### Divide logs based on message type

In [19]:
df_info = pd.DataFrame([el for el in data if el["type"] == "callInfoUpdate"])
df_list = pd.DataFrame([el for el in data if el["type"] == "callListUpdate"])
df_roster = pd.DataFrame([el for el in data if el["type"] == "rosterUpdate"])

### Preprocess callInfoUpdate events

In [65]:
calls = list(pd.unique(df_list["call"]))

In [66]:
dfs_calls = []

for call in calls:
    call_data = df_list[df_list["call"] == call].sort_values(by=["date"])[["call", "updateType", "name", "participants", "streaming", "recording", "date", "reason"]]
    call_data["name"] = call_data["name"].fillna(method="ffill")
    dfs_calls.append(call_data)

In [67]:
dfs_calls[0]

Unnamed: 0,call,updateType,name,participants,streaming,recording,date,reason
0,835b808c-eb3d-4b6f-b82b-c312eb86cba0,add,[PWSZ] Elektryczność i elektronika (GZ-C),0.0,inactive,active,2020-06-02T09:17:55.428965,
1,835b808c-eb3d-4b6f-b82b-c312eb86cba0,update,[PWSZ] Elektryczność i elektronika (GZ-C),,,,2020-06-02T09:17:55.553975,
2,835b808c-eb3d-4b6f-b82b-c312eb86cba0,update,[PWSZ] Elektryczność i elektronika (GZ-C),,,,2020-06-02T09:17:55.571976,
3,835b808c-eb3d-4b6f-b82b-c312eb86cba0,update,[PWSZ] Elektryczność i elektronika (GZ-C),,,,2020-06-02T09:17:55.588977,
4,835b808c-eb3d-4b6f-b82b-c312eb86cba0,update,[PWSZ] Elektryczność i elektronika (GZ-C),2.0,,,2020-06-02T09:17:55.762990,
5,835b808c-eb3d-4b6f-b82b-c312eb86cba0,update,[PWSZ] Elektryczność i elektronika (GZ-C),3.0,,,2020-06-02T09:17:55.925014,
6,835b808c-eb3d-4b6f-b82b-c312eb86cba0,update,[PWSZ] Elektryczność i elektronika (GZ-C),9.0,,,2020-06-02T09:17:55.961013,
7,835b808c-eb3d-4b6f-b82b-c312eb86cba0,add,[PWSZ] Elektryczność i elektronika (GZ-C),9.0,inactive,active,2020-06-02T09:18:24.821187,
8,835b808c-eb3d-4b6f-b82b-c312eb86cba0,remove,[PWSZ] Elektryczność i elektronika (GZ-C),,,,2020-06-02T09:18:39.819324,allLeft


In [23]:
dfs_calls[3]

Unnamed: 0,call,updateType,name,participants,streaming,recording,date,reason
68,626fb875-1c90-4cf2-ae70-c99367fe19df,add,[PK] Budownictwo (GZ-B),1.0,inactive,active,2020-06-04T07:27:44.618918,
69,626fb875-1c90-4cf2-ae70-c99367fe19df,update,[PK] Budownictwo (GZ-B),,,,2020-06-04T07:27:44.788930,
70,626fb875-1c90-4cf2-ae70-c99367fe19df,add,[PK] Budownictwo (GZ-B),1.0,inactive,active,2020-06-04T07:28:04.009397,
71,626fb875-1c90-4cf2-ae70-c99367fe19df,add,[PK] Budownictwo (GZ-B),1.0,inactive,active,2020-06-04T07:28:46.106561,
72,626fb875-1c90-4cf2-ae70-c99367fe19df,add,[PK] Budownictwo (GZ-B),1.0,inactive,active,2020-06-04T07:29:28.211744,
73,626fb875-1c90-4cf2-ae70-c99367fe19df,add,[PK] Budownictwo (GZ-B),1.0,inactive,active,2020-06-04T07:30:10.312921,
74,626fb875-1c90-4cf2-ae70-c99367fe19df,add,[PK] Budownictwo (GZ-B),1.0,inactive,active,2020-06-04T07:30:52.417108,
75,626fb875-1c90-4cf2-ae70-c99367fe19df,add,[PK] Budownictwo (GZ-B),1.0,inactive,active,2020-06-04T07:31:34.517293,
76,626fb875-1c90-4cf2-ae70-c99367fe19df,add,[PK] Budownictwo (GZ-B),1.0,inactive,active,2020-06-04T07:32:16.618464,
77,626fb875-1c90-4cf2-ae70-c99367fe19df,remove,[PK] Budownictwo (GZ-B),,,,2020-06-04T07:32:17.462525,allLeft


In [24]:
all_calls = pd.concat(dfs_calls)
all_calls

Unnamed: 0,call,updateType,name,participants,streaming,recording,date,reason
0,835b808c-eb3d-4b6f-b82b-c312eb86cba0,add,[PWSZ] Elektryczność i elektronika (GZ-C),0.0,inactive,active,2020-06-02T09:17:55.428965,
1,835b808c-eb3d-4b6f-b82b-c312eb86cba0,update,[PWSZ] Elektryczność i elektronika (GZ-C),,,,2020-06-02T09:17:55.553975,
2,835b808c-eb3d-4b6f-b82b-c312eb86cba0,update,[PWSZ] Elektryczność i elektronika (GZ-C),,,,2020-06-02T09:17:55.571976,
3,835b808c-eb3d-4b6f-b82b-c312eb86cba0,update,[PWSZ] Elektryczność i elektronika (GZ-C),,,,2020-06-02T09:17:55.588977,
4,835b808c-eb3d-4b6f-b82b-c312eb86cba0,update,[PWSZ] Elektryczność i elektronika (GZ-C),2.0,,,2020-06-02T09:17:55.762990,
...,...,...,...,...,...,...,...,...
4591,107ff145-3c39-43dd-becd-c272a6bd2827,update,Dawid Suder - wirtualny pokój,,,inactive,2020-06-09T15:53:11.608802,
4592,107ff145-3c39-43dd-becd-c272a6bd2827,update,Dawid Suder - wirtualny pokój,2.0,,,2020-06-09T15:53:11.610804,
4593,107ff145-3c39-43dd-becd-c272a6bd2827,add,Dawid Suder - wirtualny pokój,2.0,inactive,inactive,2020-06-09T15:53:14.897030,
4594,107ff145-3c39-43dd-becd-c272a6bd2827,update,Dawid Suder - wirtualny pokój,1.0,,,2020-06-09T15:53:19.552363,


In [25]:
all_calls.to_csv("all_calls.csv", index=None)

### Preprocess rosterUpdate

In [26]:
participants = pd.unique(df_roster["participant"])

dfs_roster = []

for p in participants:
    df_part = df_roster[df_roster["participant"] == p].sort_values(by=["date"])[["participant",
                                                                                 "updateType", 
                                                                                 "name", 
                                                                                 "uri", 
                                                                                 "state", 
                                                                                 "direction", 
                                                                                 "movedParticipant", 
                                                                                 "movedParticipantCallBridge", 
                                                                                 "canMove", 
                                                                                 "audioMuted", 
                                                                                 "videoMuted", 
                                                                                 "importance", 
                                                                                 "layout", 
                                                                                 "activeSpeaker", 
                                                                                 "presenter",
                                                                                 "endpointRecording",
                                                                                 "date"]]
    df_part["name"] = df_part["name"].fillna(method="ffill")
    df_part["uri"] = df_part["uri"].fillna(method="ffill")
    dfs_roster.append(df_part)

In [27]:
dfs_roster[60]

Unnamed: 0,participant,updateType,name,uri,state,direction,movedParticipant,movedParticipantCallBridge,canMove,audioMuted,videoMuted,importance,layout,activeSpeaker,presenter,endpointRecording,date
2671,7e1dc01e-d859-4ce6-88a0-23cd3f445424,add,mcdn-oswi-p1,mcdn-oswi-p1@mche.edu.pl;x-cisco-number=18431,connected,incoming,,,True,False,False,,allEqual,False,False,inactive,2020-06-08T09:02:36.818768
2685,7e1dc01e-d859-4ce6-88a0-23cd3f445424,update,mcdn-oswi-p1,mcdn-oswi-p1@mche.edu.pl;x-cisco-number=18431,,,,,,,,,,True,,,2020-06-08T09:02:50.550814
2687,7e1dc01e-d859-4ce6-88a0-23cd3f445424,update,mcdn-oswi-p1,mcdn-oswi-p1@mche.edu.pl;x-cisco-number=18431,,,,,,,,,,False,,,2020-06-08T09:02:51.551891
2703,7e1dc01e-d859-4ce6-88a0-23cd3f445424,add,mcdn-oswi-p1,mcdn-oswi-p1@mche.edu.pl;x-cisco-number=18431,connected,incoming,,,True,False,False,,allEqual,False,False,inactive,2020-06-08T09:03:18.907962
2729,7e1dc01e-d859-4ce6-88a0-23cd3f445424,add,mcdn-oswi-p1,mcdn-oswi-p1@mche.edu.pl;x-cisco-number=18431,connected,incoming,,,True,False,False,,allEqual,False,False,inactive,2020-06-08T09:04:00.999166
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6151,7e1dc01e-d859-4ce6-88a0-23cd3f445424,update,mcdn-oswi-p1,mcdn-oswi-p1@mche.edu.pl;x-cisco-number=18431,,,,,,,,,,True,,,2020-06-08T10:53:43.724489
6158,7e1dc01e-d859-4ce6-88a0-23cd3f445424,update,mcdn-oswi-p1,mcdn-oswi-p1@mche.edu.pl;x-cisco-number=18431,,,,,,,,,,False,,,2020-06-08T10:53:45.778635
6162,7e1dc01e-d859-4ce6-88a0-23cd3f445424,update,mcdn-oswi-p1,mcdn-oswi-p1@mche.edu.pl;x-cisco-number=18431,onHold,,,,,,,,,,,,2020-06-08T10:53:46.587691
6164,7e1dc01e-d859-4ce6-88a0-23cd3f445424,update,,mcdn-oswi-p1@mche.edu.pl;x-cisco-number=18431,,,,,,,,,,,,,2020-06-08T10:53:46.664705


In [28]:
df_list[(df_list["date"]>"2020-06-02T09:17:20.002731")&(df_list["date"]<"2020-06-02T09:17:27.002731")]

Unnamed: 0,call,updateType,callCorrelator,name,participants,distributedInstances,streaming,recording,endpointRecording,lockState,callType,type,messageId,subscriptionIndex,date,bridge_num,reason
3536,2abbb064-9e7d-4c67-9217-2fd2d43de646,add,ffe3cfd2-deb1-4050-9133-f02e57af45b6,[PWSZ] Elektryczność i elektronika (GZ-C),8.0,2.0,inactive,active,inactive,notLocked,coSpace,callListUpdate,3,3,2020-06-02T09:17:23.947581,3,


In [29]:
meeting = df_list[df_list["call"]=="2abbb064-9e7d-4c67-9217-2fd2d43de646"].sort_values(by=["date"])

In [30]:
pd.unique(meeting["participants"])

array([ 3.,  4.,  5.,  6.,  7.,  8.,  9., nan, 10., 11., 12., 13., 14.,
       15.])

In [31]:
all_participants = pd.concat(dfs_roster)
all_participants

Unnamed: 0,participant,updateType,name,uri,state,direction,movedParticipant,movedParticipantCallBridge,canMove,audioMuted,videoMuted,importance,layout,activeSpeaker,presenter,endpointRecording,date
0,596ffb52-b285-4e55-b812-d718b8021cb4,add,Przemysław Sułek,guest265363688@meet.mche.edu.pl,initial,outgoing,,,False,False,False,,onePlusFive,False,False,inactive,2020-06-02T09:17:55.478968
7756,596ffb52-b285-4e55-b812-d718b8021cb4,add,Przemysław Sułek,guest265363688@meet.mche.edu.pl,initial,outgoing,,,False,False,False,,onePlusFive,False,False,inactive,2020-06-02T09:17:59.659284
31237,596ffb52-b285-4e55-b812-d718b8021cb4,add,Przemysław Sułek,guest265363688@meet.mche.edu.pl,initial,outgoing,,,False,False,False,,onePlusFive,False,False,inactive,2020-06-02T09:17:59.659284
16303,596ffb52-b285-4e55-b812-d718b8021cb4,add,Przemysław Sułek,guest265363688@meet.mche.edu.pl,initial,outgoing,,,False,False,False,,onePlusFive,False,False,inactive,2020-06-02T09:17:59.662285
31248,596ffb52-b285-4e55-b812-d718b8021cb4,add,Przemysław Sułek,guest265363688@meet.mche.edu.pl,initial,outgoing,,,False,False,False,,onePlusFive,False,False,inactive,2020-06-02T09:18:06.044766
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40792,7b275709-1ce2-4ac9-8fca-a5105fa315a1,add,,mik.sondej@free.calls.webex.com,connected,outgoing,e0bfff2a-ee7f-0000-3e81-8a9cca550000,ab000000-0000-0000-2070-43aeca550000,True,False,False,,allEqual,False,False,inactive,2020-06-09T15:50:26.487893
40797,7b275709-1ce2-4ac9-8fca-a5105fa315a1,add,,mik.sondej@free.calls.webex.com,connected,outgoing,e0bfff2a-ee7f-0000-3e81-8a9cca550000,ab000000-0000-0000-2070-43aeca550000,True,False,False,,allEqual,False,False,inactive,2020-06-09T15:51:08.600038
40808,7b275709-1ce2-4ac9-8fca-a5105fa315a1,add,,mik.sondej@free.calls.webex.com,connected,outgoing,e0bfff2a-ee7f-0000-3e81-8a9cca550000,ab000000-0000-0000-2070-43aeca550000,True,False,False,,allEqual,False,False,inactive,2020-06-09T15:51:50.711127
40816,7b275709-1ce2-4ac9-8fca-a5105fa315a1,add,,mik.sondej@free.calls.webex.com,connected,outgoing,e0bfff2a-ee7f-0000-3e81-8a9cca550000,ab000000-0000-0000-2070-43aeca550000,True,False,False,,allEqual,False,False,inactive,2020-06-09T15:52:32.811078


In [32]:
all_participants.to_csv("all_participants.csv", index=None)

### Add call id to callInfoUpdate events

#### Create dataframe with call id, name, start date and end date -> joined

In [33]:
df_info.sort_values(by=["date"], inplace=True)

In [34]:
df_info["name"] = df_info["name"].fillna(method="ffill")

In [35]:
df_names = df_list[["call", "name", "date"]]

In [36]:
df_first = df_names.drop_duplicates(subset=["call"], keep="first")

In [37]:
df_last = df_names.drop_duplicates(subset=["call"], keep="last")

In [38]:
df_first.reset_index(inplace=True)

In [39]:
df_last.reset_index(inplace=True)

In [40]:
df_first.drop(columns=["index"], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [41]:
df_last.drop(columns=["index"], inplace=True)

In [42]:
len(df_first)

113

In [43]:
len(df_last)

113

In [44]:
df_first.head()

Unnamed: 0,call,name,date
0,835b808c-eb3d-4b6f-b82b-c312eb86cba0,[PWSZ] Elektryczność i elektronika (GZ-C),2020-06-02T09:17:55.428965
1,9f41f4f1-2891-42ca-9fe4-b6520a006d62,[PWSZ] Elektryczność i elektronika (GZ-C),2020-06-02T09:19:33.054350
2,1bec1a39-dcc8-4f8c-a7c8-7caadf0d0f8a,[PWSZ] Elektryczność i elektronika (GZ-C),2020-06-02T09:33:12.731069
3,626fb875-1c90-4cf2-ae70-c99367fe19df,[PK] Budownictwo (GZ-B),2020-06-04T07:27:44.618918
4,2b620603-11b4-4c76-833e-3ca16ba11222,[PK] Budownictwo (GZ-B),2020-06-04T07:33:40.823827


In [45]:
df_last.head()

Unnamed: 0,call,name,date
0,835b808c-eb3d-4b6f-b82b-c312eb86cba0,,2020-06-02T09:18:39.819324
1,9f41f4f1-2891-42ca-9fe4-b6520a006d62,,2020-06-02T09:31:40.827068
2,1bec1a39-dcc8-4f8c-a7c8-7caadf0d0f8a,,2020-06-02T09:33:41.453239
3,626fb875-1c90-4cf2-ae70-c99367fe19df,,2020-06-04T07:32:17.462525
4,811246e7-7ae0-4c41-94a0-1e32cb2081a6,,2020-06-04T07:39:47.440768


In [46]:
df_last

Unnamed: 0,call,name,date
0,835b808c-eb3d-4b6f-b82b-c312eb86cba0,,2020-06-02T09:18:39.819324
1,9f41f4f1-2891-42ca-9fe4-b6520a006d62,,2020-06-02T09:31:40.827068
2,1bec1a39-dcc8-4f8c-a7c8-7caadf0d0f8a,,2020-06-02T09:33:41.453239
3,626fb875-1c90-4cf2-ae70-c99367fe19df,,2020-06-04T07:32:17.462525
4,811246e7-7ae0-4c41-94a0-1e32cb2081a6,,2020-06-04T07:39:47.440768
...,...,...,...
108,aa525af3-0afe-4f76-9e8a-18f43e1f92dd,,2020-06-08T10:12:26.806780
109,fc211f14-6b31-422a-be1b-838a70ebd43e,,2020-06-08T12:07:49.113392
110,910c095f-1212-41aa-9239-c0ffc906e84a,,2020-06-08T12:08:13.390225
111,43c7b9dc-51bb-426a-b538-ff4a25b0eb71,,2020-06-09T08:38:06.582230


In [47]:
joined = df_first.merge(df_last, on="call", how="left")

In [48]:
joined.drop(columns=["name_y"], inplace=True)

In [49]:
joined.rename(columns={"name_x": "name", "date_x": "start_date", "date_y": "end_date"}, inplace=True)

#### Merge joined with df_info and add columns with date -> merged

In [50]:
df_info.reset_index(inplace=True)

In [51]:
df_info

Unnamed: 0,index,name,participants,streaming,recording,endpointRecording,joinAudioMuteOverride,lockState,callType,callCorrelator,distributedInstances,type,messageId,subscriptionIndex,date,bridge_num
0,1564,Łukasz Czekierda - wirtualny pokój,0.0,inactive,inactive,inactive,,locked,coSpace,bea1a8ad-9e26-48f6-842e-6f2545864347,0.0,callInfoUpdate,7,2,2020-06-02T00:42:47.985183,2
1,1565,Łukasz Czekierda - wirtualny pokój,1.0,,,,,,,,,callInfoUpdate,10,2,2020-06-02T00:42:48.286209,2
2,1566,Łukasz Czekierda - wirtualny pokój,,,,,,notLocked,,,,callInfoUpdate,12,2,2020-06-02T00:42:48.322213,2
3,587,[PWSZ] Elektryczność i elektronika (GZ-B),0.0,inactive,active,inactive,,notLocked,coSpace,a59b6967-ee91-4f85-b47c-b190ea95b0c8,0.0,callInfoUpdate,7,2,2020-06-02T07:14:15.263637,1
4,588,[PWSZ] Elektryczność i elektronika (GZ-B),1.0,,,,,,,,,callInfoUpdate,11,2,2020-06-02T07:14:16.083694,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4261,3132,Bartosz Kwolek - wirtualny pokój,4.0,,,,,,,,,callInfoUpdate,26,2,2020-06-09T18:26:54.982966,2
4262,3133,Bartosz Kwolek - wirtualny pokój,3.0,,,,,,,,,callInfoUpdate,30,2,2020-06-09T18:26:57.668168,2
4263,3134,Bartosz Kwolek - wirtualny pokój,2.0,,,,,,,,,callInfoUpdate,35,2,2020-06-09T18:27:00.432352,2
4264,3135,Bartosz Kwolek - wirtualny pokój,1.0,,,,,,,,,callInfoUpdate,39,2,2020-06-09T18:27:01.044394,2


In [52]:
merged = df_info.merge(joined, on="name", how="left")

In [53]:
from datetime import datetime

In [54]:
merged["start_date"]

0        2020-06-02T00:42:47.922179
1        2020-06-09T16:50:09.980352
2        2020-06-02T00:42:47.922179
3        2020-06-09T16:50:09.980352
4        2020-06-02T00:42:47.922179
                    ...            
22735    2020-06-09T16:50:09.588321
22736    2020-06-09T16:49:15.169233
22737    2020-06-09T16:50:09.588321
22738    2020-06-09T16:49:15.169233
22739    2020-06-09T16:50:09.588321
Name: start_date, Length: 22740, dtype: object

In [55]:
pattern = '%Y-%m-%dT%H:%M:%S.%f'
merged["start_date_con"] = merged["start_date"].apply(lambda x: datetime.strptime(x, pattern))
merged["end_date_con"] = merged["end_date"].apply(lambda x: datetime.strptime(x, pattern))
merged["date_con"] = merged["date"].apply(lambda x: datetime.strptime(x, pattern))

In [56]:
merged["start_diff"] = abs(merged["date_con"] - merged["start_date_con"])

In [57]:
merged["end_diff"] = abs(merged["end_date_con"] - merged["date_con"])

In [58]:
merged["diff"] = merged.apply(lambda x: min(x["end_diff"], x["start_diff"]), axis=1)

In [59]:
merged.head()

Unnamed: 0,index,name,participants,streaming,recording,endpointRecording,joinAudioMuteOverride,lockState,callType,callCorrelator,...,bridge_num,call,start_date,end_date,start_date_con,end_date_con,date_con,start_diff,end_diff,diff
0,1564,Łukasz Czekierda - wirtualny pokój,0.0,inactive,inactive,inactive,,locked,coSpace,bea1a8ad-9e26-48f6-842e-6f2545864347,...,2,430aa8e9-1d22-425b-9d10-ffa07e7deee7,2020-06-02T00:42:47.922179,2020-06-02T00:42:58.848011,2020-06-02 00:42:47.922179,2020-06-02 00:42:58.848011,2020-06-02 00:42:47.985183,0 days 00:00:00.063004,0 days 00:00:10.862828,0 days 00:00:00.063004
1,1564,Łukasz Czekierda - wirtualny pokój,0.0,inactive,inactive,inactive,,locked,coSpace,bea1a8ad-9e26-48f6-842e-6f2545864347,...,2,69ade151-a198-4bbe-96fa-0bded9feb887,2020-06-09T16:50:09.980352,2020-06-09T16:50:10.152364,2020-06-09 16:50:09.980352,2020-06-09 16:50:10.152364,2020-06-02 00:42:47.985183,7 days 16:07:21.995169,7 days 16:07:22.167181,7 days 16:07:21.995169
2,1565,Łukasz Czekierda - wirtualny pokój,1.0,,,,,,,,...,2,430aa8e9-1d22-425b-9d10-ffa07e7deee7,2020-06-02T00:42:47.922179,2020-06-02T00:42:58.848011,2020-06-02 00:42:47.922179,2020-06-02 00:42:58.848011,2020-06-02 00:42:48.286209,0 days 00:00:00.364030,0 days 00:00:10.561802,0 days 00:00:00.364030
3,1565,Łukasz Czekierda - wirtualny pokój,1.0,,,,,,,,...,2,69ade151-a198-4bbe-96fa-0bded9feb887,2020-06-09T16:50:09.980352,2020-06-09T16:50:10.152364,2020-06-09 16:50:09.980352,2020-06-09 16:50:10.152364,2020-06-02 00:42:48.286209,7 days 16:07:21.694143,7 days 16:07:21.866155,7 days 16:07:21.694143
4,1566,Łukasz Czekierda - wirtualny pokój,,,,,,notLocked,,,...,2,430aa8e9-1d22-425b-9d10-ffa07e7deee7,2020-06-02T00:42:47.922179,2020-06-02T00:42:58.848011,2020-06-02 00:42:47.922179,2020-06-02 00:42:58.848011,2020-06-02 00:42:48.322213,0 days 00:00:00.400034,0 days 00:00:10.525798,0 days 00:00:00.400034


In [60]:
len(df_info)

4266

In [61]:
info_preprocessed = merged.loc[merged.groupby('index').diff.idxmin()].reset_index(drop=True)

AttributeError: 'function' object has no attribute 'idxmin'

In [None]:
len(info_preprocessed)

In [None]:
info_preprocessed.drop(columns=["index", "start_date", "end_date", "start_date_con", "end_date_con", "date_con", "start_diff", "end_diff", "diff"], inplace=True)

In [None]:
dict_info_prep = info_preprocessed.to_dict("records")

In [None]:
dict_info_prep[0]

In [None]:
def transform_event_info(event):
    final_event = dict()
    final_event["type"] = "message"
    callInfo = {
        "name": event["name"],
        "participants": event["participants"],
        "streaming": event["streaming"],
        "recording": event["recording"],
        "endpointRecording": event["endpointRecording"],
        "joinAudioMuteOverride": event["joinAudioMuteOverride"],
        "lockState": event["lockState"],
        "callType": event["callType"],
        "callCorrelator": event["callCorrelator"],
        "distributedInstances": event["distributedInstances"]
    }
    final_event["message"] = {
        "messageId": 1,
        "type": "callInfoUpdate",
        "subscriptionIndex": 2,
        "callInfo": callInfo
    }
    final_event["date"] = event["date"]
    final_event["call"] = event["call"]
    return final_event

In [None]:
mapped = list(map(transform_event_info, dict_info_prep))

In [None]:
write_json(mapped, 'callInfo_data.json')

### Add call id to rosterUpdate events

In [None]:
all_participants.sort_values(by=["date"], inplace=True)

In [None]:
df_users = all_participants[["participant", "date"]]

In [None]:
part_first = df_users.drop_duplicates(subset=["participant"], keep="first")

In [None]:
part_last = df_users.drop_duplicates(subset=["participant"], keep="last")

In [None]:
joined = part_first.merge(part_last, on="participant", how="left")

In [None]:
joined.rename(columns={"date_x": "start_date", "date_y": "end_date"}, inplace=True)

In [None]:
joined

In [None]:
df_names = df_list[["call", "name", "date"]]

In [None]:
df_first = df_names.drop_duplicates(subset=["call"], keep="first")

In [None]:
df_last = df_names.drop_duplicates(subset=["call"], keep="last")

In [None]:
df_first.reset_index(inplace=True)

In [None]:
df_last.reset_index(inplace=True)

In [None]:
joined_calls = df_first.merge(df_last, on="call", how="left")

In [None]:
joined_calls.drop(columns=["name_y"], inplace=True)

In [None]:
joined_calls.rename(columns={"name_x": "name", "date_x": "start_date", "date_y": "end_date"}, inplace=True)

In [None]:
joined_calls.drop(columns=["index_x", "index_y"], inplace=True)

In [None]:
joined_calls = joined_calls[["call", "start_date", "end_date"]]

In [None]:
joined["key"] = 1

In [None]:
joined_calls["key"] = 1

In [None]:
merged = pd.merge(joined,joined_calls,on='key').drop('key',axis=1)

In [None]:
merged.rename(columns={"start_date_x":"start_date_p", "end_date_x":"end_date_p", "start_date_y":"start_date_c", "end_date_y":"end_date_c"}, inplace=True)

In [None]:
merged.columns

In [None]:
pattern = '%Y-%m-%dT%H:%M:%S.%f'
merged["start_date_p_con"] = merged["start_date_p"].apply(lambda x: datetime.strptime(x, pattern))
merged["end_date_p_con"] = merged["end_date_p"].apply(lambda x: datetime.strptime(x, pattern))
merged["start_date_c_con"] = merged["start_date_c"].apply(lambda x: datetime.strptime(x, pattern))
merged["end_date_c_con"] = merged["end_date_c"].apply(lambda x: datetime.strptime(x, pattern))

In [None]:
merged["is_between"] = merged.apply(lambda row: row["start_date_p_con"]>=row["start_date_c_con"] and row["end_date_p_con"]<=row["end_date_c_con"], axis=1)

In [None]:
df_best = merged[merged["is_between"]]

In [None]:
len(pd.unique(df_best["participant"]))

In [None]:
len(pd.unique(df_best["participant"])) == len(pd.unique(joined["participant"]))

In [None]:
rest_participants = [pid for pid in joined["participant"] if pid not in list(df_best["participant"])]

In [None]:
rest_participants

In [None]:
counted = df_best.groupby(by=["participant"]).count()

In [None]:
counted[counted["end_date_p"]>1]

In [None]:
to_add = merged[merged["participant"].isin(rest_participants)]

In [None]:
concatenated = pd.concat([df_best, to_add])

In [None]:
len(concatenated)

In [None]:
concatenated["start_diff"] = abs(concatenated["start_date_p_con"] - concatenated["start_date_c_con"])

In [None]:
concatenated["end_diff"] = abs(concatenated["end_date_p_con"] - concatenated["end_date_c_con"])

In [None]:
concatenated["diff"] = concatenated.apply(lambda x: min(x["end_diff"], x["start_diff"]), axis=1)

In [None]:
concatenated.head()

In [None]:
user_preprocessed = concatenated.loc[concatenated.groupby('participant').diff.idxmin()].reset_index(drop=True)

In [None]:
len(user_preprocessed)

In [None]:
user_preprocessed["diff"].min()

In [None]:
maks = user_preprocessed["diff"].max()

In [None]:
user_preprocessed[user_preprocessed["diff"]==maks]

In [None]:
participant_call = user_preprocessed[["participant", "call"]]

In [None]:
final = all_participants.merge(participant_call, how="left", on="participant")

In [None]:
final.sort_values(by=["date"], inplace=True)

In [None]:
final.reset_index(drop=True)

In [None]:
type("a") == str

In [None]:
def get_value(column, event):
    if not event[column] or (type(event[column])!=str and math.isnan(event[column])):
        return None
    else:
        return event[column]

In [None]:
def transform_event_roster(event):
    final_event = dict()
    final_event["type"] = "message"
    columns = ["participant", "updateType", "name", "uri", "state", "direction", "movedParticipant",
              "movedParticipantCallBridge", "canMove", "audioMuted", "videoMuted", "importance", "layout", 
              "activeSpeaker", "presenter", "endpointRecording"]
    update = dict()
    for column in columns:
        value = get_value(column, event)
        if value:
            update[column] = value
    final_event["message"] = {
        "messageId": 1,
        "type": "rosterUpdate",
        "subscriptionIndex": 1,
        "updates": [update]
    }
    final_event["date"] = event["date"]
    final_event["call"] = event["call"]
    return final_event

In [None]:
mapped = list(map(transform_event_roster, final.to_dict("records")))

In [None]:
mapped

In [None]:
write_json(mapped, 'roster_data.json')