In [None]:
import json
import math
import platform
from itertools import chain
from os import listdir
from os.path import isfile, join

import pandas as pd
from pyspark import SparkContext, SparkConf, SQLContext
from pyspark.sql import SparkSession

In [None]:
# Directory that is expected to contain only the log files.
LOGS_DIR_PATH = r"D:\inzynierka\logs"

# Message types that preprocess_data() keeps; everything else is skipped.
MSG_TYPES = [
    "callListUpdate",
    "rosterUpdate",
    "callInfoUpdate",
]

## Basic functions for logs preprocessing

In [None]:
def get_absolute_path(log_file):
    """Return the path of ``log_file`` inside ``LOGS_DIR_PATH``.

    Uses ``os.path.join`` so the platform's own separator is applied. The
    previous platform lookup table only knew "Linux" and "Windows" and
    silently returned an empty string everywhere else (e.g. on macOS), which
    only surfaced later as a confusing ``open('')`` failure.

    Args:
        log_file: File name relative to ``LOGS_DIR_PATH``.

    Returns:
        ``LOGS_DIR_PATH`` joined with ``log_file``.
    """
    return join(LOGS_DIR_PATH, log_file)

In [None]:
def transform_msg(message):
    """Attach the date of a ``(msg, date)`` pair to the message itself.

    Args:
        message: Indexable whose element 0 is the message dict and whose
            element 1 is the date.

    Returns:
        The same dict object as ``message[0]``, now carrying a ``"date"`` key
        (the dict is modified in place).
    """
    payload, timestamp = message[0], message[1]
    payload["date"] = timestamp
    return payload

In [None]:
def transform_call_list_or_roster_update(msg, date, bridge_num):
    """Flatten a callListUpdate/rosterUpdate message into its update records.

    Every entry of ``msg["updates"]`` is enriched in place with the message
    level fields (type, messageId, subscriptionIndex) plus ``date`` and
    ``bridge_num``.

    Args:
        msg: Message dict with "messageId", "type", "subscriptionIndex" and
            "updates" keys.
        date: Value stored under "date" in every update.
        bridge_num: Value stored under "bridge_num" in every update.

    Returns:
        A new list holding the (mutated) update dicts in their original order.
    """
    message_id = msg["messageId"]
    kind = msg["type"]
    subscription = msg["subscriptionIndex"]
    # Key order here defines the order in which new keys are appended.
    shared_fields = {
        "type": kind,
        "messageId": message_id,
        "subscriptionIndex": subscription,
        "date": date,
        "bridge_num": bridge_num,
    }
    flattened = []
    for entry in msg["updates"]:
        entry.update(shared_fields)
        flattened.append(entry)
    return flattened

In [None]:
def transform_call_info(msg, date, bridge_num):
    """Flatten a callInfoUpdate message into a single-record list.

    ``msg["callInfo"]`` is enriched in place with the message level fields
    (type, messageId, subscriptionIndex) plus ``date`` and ``bridge_num``.

    Args:
        msg: Message dict with "messageId", "type", "subscriptionIndex" and
            "callInfo" keys.
        date: Value stored under "date".
        bridge_num: Value stored under "bridge_num".

    Returns:
        A one-element list containing the (mutated) callInfo dict.
    """
    message_id = msg["messageId"]
    kind = msg["type"]
    subscription = msg["subscriptionIndex"]
    info = msg["callInfo"]
    info.update(
        {
            "type": kind,
            "messageId": message_id,
            "subscriptionIndex": subscription,
            "date": date,
            "bridge_num": bridge_num,
        }
    )
    return [info]

In [None]:
def transform(msg, date, bridge_num):
    """Dispatch a message to the flattening function for its type.

    Args:
        msg: Message dict; ``msg["type"]`` selects the handler.
        date: Forwarded to the handler.
        bridge_num: Forwarded to the handler.

    Returns:
        Whatever the selected handler returns (a list of record dicts).

    Raises:
        KeyError: If ``msg["type"]`` is not one of the three supported types.
    """
    handlers = {
        "callInfoUpdate": transform_call_info,
        "rosterUpdate": transform_call_list_or_roster_update,
        "callListUpdate": transform_call_list_or_roster_update,
    }
    handler = handlers[msg["type"]]
    return handler(msg, date, bridge_num)

In [None]:
def preprocess_data(data, bridge_num):
    """Flatten all supported messages of one log into a single record list.

    Only entries whose "type" is "message" and whose message type is listed
    in ``MSG_TYPES`` are transformed; all other entries are skipped.

    Args:
        data: Iterable of log entries (dicts).
        bridge_num: Passed through to ``transform`` for every kept entry.

    Returns:
        A flat list with the records produced by ``transform``, in log order.
    """
    flattened = []
    for entry in data:
        if entry["type"] != "message":
            continue
        if entry["message"]["type"] not in MSG_TYPES:
            continue
        flattened.extend(transform(entry["message"], entry["date"], bridge_num))
    return flattened

In [None]:
def join_all_logs(files):
    """Load every log file and concatenate its preprocessed records.

    The position of a file in ``files`` is passed to ``preprocess_data`` as
    its ``bridge_num``. The number of raw entries per file and the total
    number of collected records are printed as a progress indication.

    Args:
        files: Iterable of paths to JSON log files.

    Returns:
        A flat list with the records of all files, in file order.
    """
    all_data = []

    for bridge_num, log_file in enumerate(files):
        # JSON text is UTF-8; without an explicit encoding open() falls back
        # to the locale code page (e.g. on Windows) and can garble or reject
        # non-ASCII names.
        with open(log_file, encoding="utf-8") as f:
            data = json.load(f)
        messages = preprocess_data(data, bridge_num)
        all_data.extend(messages)
        print(len(data))

    print(len(all_data))
    return all_data

## Extract all callListUpdate events and save them to file

In [None]:
def get_all_events(files):
    """Collect the raw callListUpdate entries from every log file.

    An entry is kept when its "type" is "message" and its message type is
    "callListUpdate". Entries are returned untouched (not flattened). The
    number of kept entries per file and the overall total are printed.

    Args:
        files: Iterable of paths to JSON log files.

    Returns:
        A list with the kept entries of all files, in file order.
    """
    all_data = []

    for log_file in files:
        # JSON text is UTF-8; without an explicit encoding open() falls back
        # to the locale code page (e.g. on Windows) and can garble or reject
        # non-ASCII names.
        with open(log_file, encoding="utf-8") as f:
            entries = json.load(f)
        call_list_events = [
            el for el in entries
            if el["type"] == "message" and el["message"]["type"] == "callListUpdate"
        ]
        all_data.extend(call_list_events)
        print(len(call_list_events))

    print(len(all_data))
    return all_data

In [None]:
# listdir() returns entries in arbitrary order; sorting makes the order of the
# collected events (and therefore the written file) reproducible across runs.
log_files = [get_absolute_path(file) for file in sorted(listdir(LOGS_DIR_PATH))]
data = get_all_events(log_files)

In [None]:
# Top-level keys of the first extracted event.
data[0].keys()

In [None]:
# Keys of the first event's "message" payload.
data[0]["message"].keys()

In [None]:
# The first event's "message" payload itself.
data[0]["message"]

In [None]:
# The complete first event.
data[0]

In [None]:
def write_json(data, filename='data.json'):
    """Serialize ``data`` as JSON into ``filename``.

    The file is opened in write mode, so existing content is replaced.

    Args:
        data: JSON-serializable object.
        filename: Destination path.
    """
    with open(filename, mode='w') as sink:
        json.dump(data, sink)

In [None]:
# Persist the raw callListUpdate events returned by get_all_events().
write_json(data, 'calls_data.json')

## Load and preprocess all logs data

### Load all logs data

In [None]:
# listdir() returns entries in arbitrary order. join_all_logs() uses a file's
# position as its bridge_num, so sort to give every file a stable number.
log_files = [get_absolute_path(file) for file in sorted(listdir(LOGS_DIR_PATH))]
data = join_all_logs(log_files)
# Show a small sample instead of rendering every record in the output.
data[:5]

### Divide logs based on message type

In [None]:
# One DataFrame per message type, each keeping the record order of ``data``.
df_info, df_list, df_roster = (
    pd.DataFrame([record for record in data if record["type"] == msg_type])
    for msg_type in ("callInfoUpdate", "callListUpdate", "rosterUpdate")
)

### Preprocess callListUpdate events

In [None]:
calls = list(pd.unique(df_list["call"]))

In [None]:
dfs_calls = []

for call in calls:
    call_data = df_list[df_list["call"] == call].sort_values(by=["date"])[["call", "updateType", "name", "participants", "streaming", "recording", "date", "reason"]]
    call_data["name"] = call_data["name"].fillna(method="ffill")
    dfs_calls.append(call_data)

In [None]:
# Peek at the frame built for the first call id.
dfs_calls[0]

In [None]:
# Another sample; index 3 is arbitrary and assumes at least four distinct calls.
dfs_calls[3]

In [None]:
# Stack the per-call frames into a single table.
all_calls = pd.concat(dfs_calls)
all_calls

In [None]:
# index=None is falsy, so presumably the DataFrame index is left out of the CSV.
all_calls.to_csv("all_calls.csv", index=None)

### Preprocess rosterUpdate

In [None]:
# Distinct participant ids, in order of first appearance in df_roster.
participants = pd.unique(df_roster["participant"])

roster_columns = [
    "participant",
    "updateType",
    "name",
    "uri",
    "state",
    "direction",
    "movedParticipant",
    "movedParticipantCallBridge",
    "canMove",
    "audioMuted",
    "videoMuted",
    "importance",
    "layout",
    "activeSpeaker",
    "presenter",
    "endpointRecording",
    "date",
]
dfs_roster = []

for p in participants:
    df_part = df_roster[df_roster["participant"] == p].sort_values(by=["date"])[roster_columns]
    # Series.ffill() replaces the deprecated fillna(method="ffill"), and
    # assign() returns a new frame instead of writing into a slice of
    # df_roster (which triggers SettingWithCopyWarning).
    dfs_roster.append(
        df_part.assign(name=df_part["name"].ffill(), uri=df_part["uri"].ffill())
    )

In [None]:
# Sample one participant's frame; index 60 is arbitrary and assumes more than 60 participants.
dfs_roster[60]

In [None]:
# callListUpdate rows whose date string lies strictly between the two bounds (string comparison).
df_list[(df_list["date"]>"2020-06-02T09:17:20.002731")&(df_list["date"]<"2020-06-02T09:17:27.002731")]

In [None]:
# All callListUpdate rows of one specific call id, ordered by date.
meeting = df_list[df_list["call"]=="2abbb064-9e7d-4c67-9217-2fd2d43de646"].sort_values(by=["date"])

In [None]:
# Distinct "participants" values recorded for that call.
pd.unique(meeting["participants"])

In [None]:
# Stack the per-participant frames into a single table.
all_participants = pd.concat(dfs_roster)
all_participants

In [None]:
# index=None is falsy, so presumably the DataFrame index is left out of the CSV.
all_participants.to_csv("all_participants.csv", index=None)

### Add call id to callInfoUpdate events

#### Create dataframe with call id, name, start date and end date -> joined

In [None]:
df_info.sort_values(by=["date"], inplace=True)

In [None]:
df_info["name"] = df_info["name"].fillna(method="ffill")

In [None]:
# df_list is in log-file order, not chronological order. Sort by date so the
# keep="first"/keep="last" de-duplication that follows really yields each
# call's earliest and latest event (the roster section sorts the same way
# before de-duplicating). mergesort is stable, so ties keep their file order.
df_names = df_list[["call", "name", "date"]].sort_values(by="date", kind="mergesort")

In [None]:
# First row of every call id, in df_names row order.
df_first = df_names.drop_duplicates(subset=["call"], keep="first")

In [None]:
# Last row of every call id, in df_names row order.
df_last = df_names.drop_duplicates(subset=["call"], keep="last")

In [None]:
# reset_index() moves the old index into an "index" column ...
df_first.reset_index(inplace=True)

In [None]:
df_last.reset_index(inplace=True)

In [None]:
# ... which is dropped again right away, leaving a fresh 0..n-1 index.
# Note: not idempotent, re-running this cell raises because "index" is gone.
df_first.drop(columns=["index"], inplace=True)

In [None]:
df_last.drop(columns=["index"], inplace=True)

In [None]:
# Both frames hold one row per call id, so the two lengths should match.
len(df_first)

In [None]:
len(df_last)

In [None]:
df_first.head()

In [None]:
df_last.head()

In [None]:
df_last

In [None]:
# Pair every call's first row (suffix _x) with its last row (suffix _y).
joined = df_first.merge(df_last, on="call", how="left")

In [None]:
# Only the name taken from the first row is kept.
joined.drop(columns=["name_y"], inplace=True)

In [None]:
# date of the first row -> start_date, date of the last row -> end_date.
joined.rename(columns={"name_x": "name", "date_x": "start_date", "date_y": "end_date"}, inplace=True)

#### Merge joined with df_info and add columns with date -> merged

In [None]:
# Move the current index into an "index" column; it acts as a per-row id of
# df_info when the merged rows are grouped later on.
# Note: not idempotent, a second run adds another column instead.
df_info.reset_index(inplace=True)

In [None]:
df_info

In [None]:
# Left join on "name": every callInfoUpdate row is paired with each call in
# `joined` that carries the same name (possibly several, possibly none).
merged = df_info.merge(joined, on="name", how="left")

In [None]:
from datetime import datetime

In [None]:
merged["start_date"]

In [None]:
# Parse the three date strings into datetime values.
# NOTE(review): strptime raises on non-string input, so this presumably relies
# on every row having found a matching call in the left merge - confirm.
pattern = '%Y-%m-%dT%H:%M:%S.%f'
merged["start_date_con"] = merged["start_date"].apply(lambda x: datetime.strptime(x, pattern))
merged["end_date_con"] = merged["end_date"].apply(lambda x: datetime.strptime(x, pattern))
merged["date_con"] = merged["date"].apply(lambda x: datetime.strptime(x, pattern))

In [None]:
# Absolute distance between the event and the candidate call's start.
merged["start_diff"] = abs(merged["date_con"] - merged["start_date_con"])

In [None]:
# Absolute distance between the event and the candidate call's end.
merged["end_diff"] = abs(merged["end_date_con"] - merged["date_con"])

In [None]:
# Smaller of the two distances, computed row by row.
merged["diff"] = merged.apply(lambda x: min(x["end_diff"], x["start_diff"]), axis=1)

In [None]:
merged.head()

In [None]:
len(df_info)

In [None]:
# For every original df_info row (grouped by its "index" id) keep the
# candidate call with the smallest "diff".
# The column must be selected with ["diff"]: attribute access (.diff) on a
# GroupBy resolves to the GroupBy.diff() method, not to the "diff" column, so
# calling .idxmin() on it raises AttributeError.
info_preprocessed = merged.loc[merged.groupby('index')["diff"].idxmin()].reset_index(drop=True)

In [None]:
# One row per df_info row is expected here; compare with len(df_info).
len(info_preprocessed)

In [None]:
# Remove the helper columns that were only needed to pick the closest call.
info_preprocessed.drop(columns=["index", "start_date", "end_date", "start_date_con", "end_date_con", "date_con", "start_diff", "end_diff", "diff"], inplace=True)

In [None]:
# One dict per row, keyed by column name.
dict_info_prep = info_preprocessed.to_dict("records")

In [None]:
dict_info_prep[0]

In [None]:
def transform_event_info(event):
    """Rebuild a message-style callInfoUpdate event from a flat record.

    Args:
        event: Mapping with the callInfo fields listed below plus "date" and
            "call".

    Returns:
        A dict with keys "type", "message", "date" and "call". The message
        always uses messageId 1 and subscriptionIndex 2, and its "callInfo"
        holds the ten callInfo fields copied from ``event``.

    Raises:
        KeyError: If ``event`` lacks any of the required keys.
    """
    call_info_fields = (
        "name",
        "participants",
        "streaming",
        "recording",
        "endpointRecording",
        "joinAudioMuteOverride",
        "lockState",
        "callType",
        "callCorrelator",
        "distributedInstances",
    )
    return {
        "type": "message",
        "message": {
            "messageId": 1,
            "type": "callInfoUpdate",
            "subscriptionIndex": 2,
            "callInfo": {field: event[field] for field in call_info_fields},
        },
        "date": event["date"],
        "call": event["call"],
    }

In [None]:
# Rebuild a message-style event for every preprocessed callInfoUpdate record.
mapped = list(map(transform_event_info, dict_info_prep))

In [None]:
# Persist the rebuilt callInfoUpdate events.
write_json(mapped, 'callInfo_data.json')

### Add call id to rosterUpdate events

In [None]:
# Order all roster rows by date so that first/last per participant below are
# the earliest and latest rows.
all_participants.sort_values(by=["date"], inplace=True)

In [None]:
df_users = all_participants[["participant", "date"]]

In [None]:
# Earliest row per participant.
part_first = df_users.drop_duplicates(subset=["participant"], keep="first")

In [None]:
# Latest row per participant.
part_last = df_users.drop_duplicates(subset=["participant"], keep="last")

In [None]:
# Pair every participant's first date (date_x) with the last one (date_y).
# Note: this rebinds `joined`, which held the per-call table before.
joined = part_first.merge(part_last, on="participant", how="left")

In [None]:
joined.rename(columns={"date_x": "start_date", "date_y": "end_date"}, inplace=True)

In [None]:
joined

In [None]:
# df_list is in log-file order, not chronological order. Sort by date so the
# keep="first"/keep="last" de-duplication that follows really yields each
# call's earliest and latest event, matching how all_participants is sorted
# before its own de-duplication. mergesort is stable, so ties keep file order.
df_names = df_list[["call", "name", "date"]].sort_values(by="date", kind="mergesort")

In [None]:
# First row of every call id, in df_names row order.
df_first = df_names.drop_duplicates(subset=["call"], keep="first")

In [None]:
# Last row of every call id, in df_names row order.
df_last = df_names.drop_duplicates(subset=["call"], keep="last")

In [None]:
# The old index is kept as an "index" column here; after the merge it shows
# up as index_x / index_y and is dropped a few cells further down.
df_first.reset_index(inplace=True)

In [None]:
df_last.reset_index(inplace=True)

In [None]:
# Pair every call's first row (suffix _x) with its last row (suffix _y).
joined_calls = df_first.merge(df_last, on="call", how="left")

In [None]:
joined_calls.drop(columns=["name_y"], inplace=True)

In [None]:
joined_calls.rename(columns={"name_x": "name", "date_x": "start_date", "date_y": "end_date"}, inplace=True)

In [None]:
joined_calls.drop(columns=["index_x", "index_y"], inplace=True)

In [None]:
# Only the call id and its first/last date are needed from here on.
joined_calls = joined_calls[["call", "start_date", "end_date"]]

In [None]:
# A constant key on both sides turns the merge below into a cross join.
joined["key"] = 1

In [None]:
joined_calls["key"] = 1

In [None]:
# Every participant interval is paired with every call interval.
merged = pd.merge(joined,joined_calls,on='key').drop('key',axis=1)

In [None]:
# Suffix _p marks the participant's dates, _c the call's dates.
merged.rename(columns={"start_date_x":"start_date_p", "end_date_x":"end_date_p", "start_date_y":"start_date_c", "end_date_y":"end_date_c"}, inplace=True)

In [None]:
merged.columns

In [None]:
# Parse all four date strings into datetime values.
pattern = '%Y-%m-%dT%H:%M:%S.%f'
merged["start_date_p_con"] = merged["start_date_p"].apply(lambda x: datetime.strptime(x, pattern))
merged["end_date_p_con"] = merged["end_date_p"].apply(lambda x: datetime.strptime(x, pattern))
merged["start_date_c_con"] = merged["start_date_c"].apply(lambda x: datetime.strptime(x, pattern))
merged["end_date_c_con"] = merged["end_date_c"].apply(lambda x: datetime.strptime(x, pattern))

In [None]:
# True when the participant's whole interval lies inside the call's interval.
merged["is_between"] = merged.apply(lambda row: row["start_date_p_con"]>=row["start_date_c_con"] and row["end_date_p_con"]<=row["end_date_c_con"], axis=1)

In [None]:
# Candidate pairs where the call interval contains the participant interval.
df_best = merged[merged["is_between"]]

In [None]:
# Number of participants that have at least one containing call.
len(pd.unique(df_best["participant"]))

In [None]:
# True only if every participant has at least one containing call.
len(pd.unique(df_best["participant"])) == len(pd.unique(joined["participant"]))

In [None]:
# Participants without any containing call. A single vectorized isin() check
# replaces the list comprehension that rebuilt list(df_best["participant"])
# for every participant (quadratic in the number of rows).
rest_participants = joined.loc[~joined["participant"].isin(df_best["participant"]), "participant"].tolist()

In [None]:
rest_participants

In [None]:
# Number of candidate rows per participant among the containing calls.
counted = df_best.groupby(by=["participant"]).count()

In [None]:
# Participants whose interval fits into more than one call.
counted[counted["end_date_p"]>1]

In [None]:
# For participants without a containing call, fall back to all of their
# candidate pairs from the cross join.
to_add = merged[merged["participant"].isin(rest_participants)]

In [None]:
concatenated = pd.concat([df_best, to_add])

In [None]:
len(concatenated)

In [None]:
# Absolute distance between participant start and call start.
concatenated["start_diff"] = abs(concatenated["start_date_p_con"] - concatenated["start_date_c_con"])

In [None]:
# Absolute distance between participant end and call end.
concatenated["end_diff"] = abs(concatenated["end_date_p_con"] - concatenated["end_date_c_con"])

In [None]:
# Smaller of the two distances, computed row by row.
concatenated["diff"] = concatenated.apply(lambda x: min(x["end_diff"], x["start_diff"]), axis=1)

In [None]:
concatenated.head()

In [None]:
# For every participant keep the candidate call with the smallest "diff".
# The column must be selected with ["diff"]: attribute access (.diff) on a
# GroupBy resolves to the GroupBy.diff() method, not to the "diff" column, so
# calling .idxmin() on it raises AttributeError.
user_preprocessed = concatenated.loc[concatenated.groupby('participant')["diff"].idxmin()].reset_index(drop=True)

In [None]:
len(user_preprocessed)

In [None]:
# Best (smallest) distance among the chosen participant/call pairs.
user_preprocessed["diff"].min()

In [None]:
# Worst (largest) distance among the chosen participant/call pairs.
maks = user_preprocessed["diff"].max()

In [None]:
# Row(s) with that worst distance.
user_preprocessed[user_preprocessed["diff"]==maks]

In [None]:
# participant -> call id lookup table.
participant_call = user_preprocessed[["participant", "call"]]

In [None]:
# Attach the chosen call id to every roster row of the participant.
final = all_participants.merge(participant_call, how="left", on="participant")

In [None]:
final.sort_values(by=["date"], inplace=True)

In [None]:
# NOTE(review): the result is only displayed, not assigned, so `final` keeps
# its previous index - confirm whether an assignment was intended.
final.reset_index(drop=True)

In [None]:
# NOTE(review): looks like a leftover scratch check; the result is not stored.
type("a") == str

In [None]:
def get_value(column, event):
    """Return ``event[column]``, or ``None`` when it is empty or NaN.

    Falsy values (``None``, ``""``, ``0``, ``False``, empty containers) and
    numeric NaN are mapped to ``None``; everything else is returned as is.

    NOTE(review): mapping ``False``/``0`` to ``None`` is kept from the original
    behaviour; it means e.g. an explicit ``False`` flag is treated as missing.
    Confirm that this is intended.

    Args:
        column: Key to look up in ``event``.
        event: Mapping holding one record.

    Returns:
        The stored value, or ``None`` as described above.

    Raises:
        KeyError: If ``column`` is not present in ``event``.
    """
    value = event[column]
    if not value:
        return None
    if not isinstance(value, str):
        try:
            if math.isnan(value):
                return None
        except TypeError:
            # Non-numeric values (lists, dicts, ...) cannot be NaN; the
            # unguarded isnan() call used to crash on them.
            pass
    return value

In [None]:
def transform_event_roster(event):
    """Rebuild a message-style rosterUpdate event from a flat record.

    Each roster field is read through ``get_value``; only fields with a
    truthy result end up in the single update of the message.

    Args:
        event: Mapping with the roster fields listed below plus "date" and
            "call".

    Returns:
        A dict with keys "type", "message", "date" and "call". The message
        always uses messageId 1 and subscriptionIndex 1, and its "updates"
        list contains exactly one update dict.
    """
    roster_fields = (
        "participant", "updateType", "name", "uri", "state", "direction",
        "movedParticipant", "movedParticipantCallBridge", "canMove",
        "audioMuted", "videoMuted", "importance", "layout",
        "activeSpeaker", "presenter", "endpointRecording",
    )
    candidates = ((field, get_value(field, event)) for field in roster_fields)
    update = {field: value for field, value in candidates if value}
    return {
        "type": "message",
        "message": {
            "messageId": 1,
            "type": "rosterUpdate",
            "subscriptionIndex": 1,
            "updates": [update],
        },
        "date": event["date"],
        "call": event["call"],
    }

In [None]:
# Rebuild a message-style event for every roster row (now carrying a call id).
# Note: this rebinds `mapped`, which held the callInfoUpdate events before.
mapped = list(map(transform_event_roster, final.to_dict("records")))

In [None]:
mapped

In [None]:
# Persist the rebuilt rosterUpdate events.
write_json(mapped, 'roster_data.json')