In [134]:
import glob
import json
import numpy as np
import pandas as pd
from collections import defaultdict
from os.path import expanduser
from typing import Dict
from typing import List
from typing import Text
from typing import Tuple

import sys
sys.path.insert(0, '../src/')
import utils

In [135]:
# Folder holding the raw exported logs of one session ('~' is expanded
# where the path is used).
directory = '~/Dropbox/PhD/Projects/Balance theory subject study/balance_theory_subject_study/src/testing_log/synthetic1_raw_logs'

# Maps a numeric task id to the name of the question it represents.
# Names follow the pattern GD_<kind>_<topic><round>, where <kind> is one of
# solo / group / influence / appraisal.
taskid2taskname = {
    # ---- asbestos ----
    52: 'GD_solo_asbestos_initial',
    53: 'GD_group_asbestos1',
    62: 'GD_solo_asbestos1',
    57: 'GD_influence_asbestos1',
    167: 'GD_appraisal_asbestos1',
    61: 'GD_group_asbestos2',
    67: 'GD_solo_asbestos2',
    63: 'GD_influence_asbestos2',
    170: 'GD_appraisal_asbestos2',
    68: 'GD_group_asbestos3',
    70: 'GD_solo_asbestos3',
    69: 'GD_influence_asbestos3',
    172: 'GD_appraisal_asbestos3',

    # ---- disaster ----
    71: 'GD_solo_disaster_initial',
    72: 'GD_group_disaster1',
    73: 'GD_solo_disaster1',
    74: 'GD_influence_disaster1',
    193: 'GD_appraisal_disaster1',
    79: 'GD_group_disaster2',
    76: 'GD_solo_disaster2',
    78: 'GD_influence_disaster2',
    194: 'GD_appraisal_disaster2',
    80: 'GD_group_disaster3',
    75: 'GD_solo_disaster3',
    77: 'GD_influence_disaster3',
    195: 'GD_appraisal_disaster3',

    # ---- sports ----
    83: 'GD_solo_sports_initial',
    89: 'GD_group_sports1',
    84: 'GD_solo_sports1',
    92: 'GD_influence_sports1',
    196: 'GD_appraisal_sports1',
    87: 'GD_group_sports2',
    85: 'GD_solo_sports2',
    91: 'GD_influence_sports2',
    197: 'GD_appraisal_sports2',
    88: 'GD_group_sports3',
    86: 'GD_solo_sports3',
    90: 'GD_influence_sports3',
    198: 'GD_appraisal_sports3',

    # ---- school ----
    94: 'GD_solo_school_initial',
    100: 'GD_group_school1',
    95: 'GD_solo_school1',
    104: 'GD_influence_school1',
    199: 'GD_appraisal_school1',
    99: 'GD_group_school2',
    96: 'GD_solo_school2',
    103: 'GD_influence_school2',
    200: 'GD_appraisal_school2',
    98: 'GD_group_school3',
    97: 'GD_solo_school3',
    102: 'GD_influence_school3',
    201: 'GD_appraisal_school3',

    # ---- surgery ----
    105: 'GD_solo_surgery_initial',
    109: 'GD_group_surgery1',
    106: 'GD_solo_surgery1',
    112: 'GD_influence_surgery1',
    202: 'GD_appraisal_surgery1',
    110: 'GD_group_surgery2',
    107: 'GD_solo_surgery2',
    113: 'GD_influence_surgery2',
    203: 'GD_appraisal_surgery2',
    111: 'GD_group_surgery3',
    108: 'GD_solo_surgery3',
    114: 'GD_influence_surgery3',
    204: 'GD_appraisal_surgery3',
}

In [297]:
from typing import List, Text

In [298]:
def remove_later_duplicates(df: pd.DataFrame, list_of_columns_to_groupby: List[Text]) -> pd.DataFrame:
    """Keeps only the most recent row of every group of duplicate rows.

    Two rows are duplicates when they hold the same values in all of
    `list_of_columns_to_groupby`. Within each such group only the row with
    the largest 'timestamp' is kept; note that, despite the function name,
    it is the *earlier* rows that are dropped. When several rows of a group
    share the largest timestamp, the one that appears last in `df` wins.

    Args:
        df: Frame to de-duplicate. Must contain a 'timestamp' column and
            every column named in `list_of_columns_to_groupby`. It is not
            modified.
        list_of_columns_to_groupby: Columns whose combined value identifies
            a group of duplicates.

    Returns:
        A new DataFrame with one row per group, ordered by 'timestamp' and
        carrying a fresh 0..n-1 index.

    Raises:
        KeyError: If 'timestamp' or one of the grouping columns is missing.
    """
    # A stable sort keeps rows with equal timestamps in their input order,
    # which makes "the last row of a group" well defined on ties.
    ordered = df.sort_values(by='timestamp', kind='mergesort')
    # `indices` maps each group key to the row positions (within `ordered`)
    # of its members in ascending order, so the final entry is the latest.
    last_positions = sorted(
        positions[-1]
        for positions in ordered.groupby(
            by=list_of_columns_to_groupby).indices.values())
    # Ascending positions in a timestamp-sorted frame are already in
    # timestamp order, so no second sort (and no in-place edit of a slice)
    # is needed.
    return ordered.iloc[last_positions].reset_index(drop=True)

In [355]:
# Event log: the first file matching the pattern is read, its first line is
# skipped, columns that are entirely empty are dropped and the rows are put
# in chronological order.
logs_filepath = directory + '/EventLog_*.csv'
log_filepath = glob.glob(expanduser(logs_filepath))[0]
logs = (
    pd.read_csv(log_filepath, sep=';', skiprows=1)
    .dropna(axis=1, how='all')
    .sort_values(by='Timestamp'))

# Ignoring all check_in logs.
logs = logs[logs['EventType'] != 'CHECK_IN']

# Loading the completed task logs for learning the order of executed tasks.
task_orders_filepath = directory + '/CompletedTask_*.csv'
task_filepath = glob.glob(expanduser(task_orders_filepath))[0]
task_orders = pd.read_csv(
    task_filepath, sep=';', skiprows=1).dropna(axis=1, how='all')

# Completed-task id -> question name, resolved through the task id table.
completed_taskid2taskname = {
    task_row.Id: taskid2taskname[task_row.TaskId]
    for _, task_row in task_orders.iterrows()}

# Cleaned radio-button string of every influence event, in processing order
# (kept for inspection).
radio_selections = []

# Row accumulators, one list per kind of event. Each entry is a list of
# cell values for one row.
answers_dt = []
messages_dt = []
influences_dt = []
appraisals_dt = []
team_size = 3
# Entry `index` of an influence answer is credited to teammate
# `index // radio_options_per_member`.
# NOTE(review): presumably 11 radio options (0, 10, ..., 100) are offered per
# teammate -- confirm against the task definition.
radio_options_per_member = 11

for _, row in logs.iterrows():
    # Events without a completed task id cannot be mapped to a question.
    # `pd.isna` is used so a non-float id does not raise.
    if pd.isna(row.CompletedTaskId):
        continue
    question_name = completed_taskid2taskname[row.CompletedTaskId]
    sender = row.Sender
    timestamp = row.Timestamp
    event_type = row.EventType
    # The first 9 characters of EventContent are dropped; the remainder is
    # the prefix of the json file holding the event payload.
    content_file_id = row.EventContent[9:]
    json_file_path = '{}/EventLog/{}_*.json'.format(
        directory, content_file_id)
    json_file = glob.glob(expanduser(json_file_path))
    if len(json_file) != 1:
        print(
            'WARNING1: json file for id: {} was not found'
            ' in the EventLog folder.\n'.format(
                content_file_id))
        continue

    # Only the parsing needs the file; close it before processing.
    with open(json_file[0], 'r') as f:
        content = json.load(f)
    if 'type' in content and content['type'] == 'JOINED':
        continue

    if event_type == 'TASK_ATTRIBUTE':
        if question_name.startswith('GD_solo'):
            # The initial solo answer is stored as round 0 of its topic.
            if question_name.endswith('_initial'):
                question_name = question_name.split(
                    '_initial')[0] + '0'
            answers_dt.append(
                [sender, question_name,
                 content['attributeStringValue'], timestamp])
        elif question_name.startswith('GD_influence'):
            # Strip quotes and brackets so only comma-separated entries
            # remain.
            radio_selection = content['attributeStringValue']
            radio_selection = radio_selection.replace(
                '"', '').replace('[', '').replace(']', '')
            # Record the selection only after it has been computed; doing it
            # earlier raised NameError on the first influence event of a
            # fresh kernel and otherwise stored the previous event's value.
            radio_selections.append(radio_selection)
            influence_values = np.zeros(team_size)
            for index, selection in enumerate(radio_selection.split(',')):
                # Entries that are not plain digits are ignored.
                if selection.isdigit():
                    influence_values[
                        index // radio_options_per_member] = float(selection)
            influences_dt.append(
                [sender, question_name, influence_values, timestamp])
        elif question_name.startswith('GD_appraisal'):
            appraisals_dt.append(
                [sender, question_name,
                 content['attributeName'],
                 content['attributeStringValue'], timestamp])
        else:
            print('WARNING4: There was an unexpected '
                  'question: {}\n'.format(question_name))
    elif event_type == 'COMMUNICATION_MESSAGE':
        # Empty chat messages carry no information and are skipped.
        if len(content['message']) > 0:
            messages_dt.append(
                [sender, question_name,
                 content['message'], timestamp])
    else:
        print('WARNING5: There was an unexpected'
              ' EventType: {}\n'.format(event_type))

# Answers: one row per (sender, question), keeping the most recent entry.
answers = remove_later_duplicates(
    df=pd.DataFrame(
        answers_dt,
        columns=['sender', 'question', 'value', 'timestamp']),
    list_of_columns_to_groupby=['sender', 'question'])

# Messages: every non-empty chat message is kept.
messages = pd.DataFrame(
    messages_dt, columns=['sender', 'question', 'text', 'timestamp'])

# Influences: one row per (sender, question), keeping the most recent entry.
influences = remove_later_duplicates(
    df=pd.DataFrame(
        influences_dt,
        columns=['sender', 'question', 'value', 'timestamp']),
    list_of_columns_to_groupby=['sender', 'question'])

# Appraisals arrive as one event per attribute; first keep the most recent
# entry per (sender, question, attribute) ...
sep_appraisals = remove_later_duplicates(
    df=pd.DataFrame(
        appraisals_dt,
        columns=['sender', 'question', 'attribute', 'value', 'timestamp']),
    list_of_columns_to_groupby=['sender', 'question', 'attribute'])
members = np.unique(sep_appraisals['sender'])
questions = np.unique(sep_appraisals['question'])
# ... then fold the attributes of each (sender, question) into one vector.
appraisals_dt = []
for member in members:
    for question in questions:
        df = sep_appraisals[
            (sep_appraisals['sender'] == member)
            & (sep_appraisals['question'] == question)]
        # A member may have no appraisal for some question; there is then no
        # timestamp to report, so the pair is skipped rather than letting
        # `.iloc[-1]` raise IndexError.
        if df.empty:
            continue
        appraisal_values = np.zeros(team_size)
        for _, row in df.iterrows():
            # The last character of the attribute name is read as a 1-based
            # slot number into the vector.
            appraisal_values[int(row.attribute[-1]) - 1] = row.value
        # `sep_appraisals` is ordered by timestamp, so the last row carries
        # the most recent one.
        appraisals_dt.append(
            [member, question, appraisal_values, df.timestamp.iloc[-1]])
appraisals = pd.DataFrame(
    appraisals_dt, columns=['sender', 'question', 'value', 'timestamp'])


In [365]:
# Show the influence table: one row per (sender, question) with the vector of
# influence values assembled above.
influences

Unnamed: 0,sender,question,value,timestamp
0,pogs01,GD_influence_disaster1,"[30.0, 30.0, 30.0]",2020-09-01 20:37:40
1,pogs02,GD_influence_disaster1,"[100.0, 0.0, 0.0]",2020-09-01 20:37:48
2,pogs03,GD_influence_disaster1,"[20.0, 60.0, 20.0]",2020-09-01 20:38:09
3,pogs01,GD_influence_disaster2,"[0.0, 0.0, 100.0]",2020-09-01 20:43:50
4,pogs02,GD_influence_disaster2,"[30.0, 30.0, 30.0]",2020-09-01 20:43:57
5,pogs03,GD_influence_disaster2,"[30.0, 40.0, 30.0]",2020-09-01 20:44:02
6,pogs01,GD_influence_disaster3,"[0.0, 0.0, 100.0]",2020-09-01 20:49:22
7,pogs02,GD_influence_disaster3,"[40.0, 30.0, 30.0]",2020-09-01 20:49:35
8,pogs03,GD_influence_disaster3,"[50.0, 0.0, 50.0]",2020-09-01 20:49:43


In [366]:
# Show the appraisal table: one row per (sender, question) with the vector of
# appraisal values assembled above.
appraisals

Unnamed: 0,sender,question,value,timestamp
0,pogs01,GD_appraisal_disaster1,"[-1.0, 2.0, 0.0]",2020-09-01 20:39:35
1,pogs01,GD_appraisal_disaster2,"[10.0, 10.0, 10.0]",2020-09-01 20:44:42
2,pogs01,GD_appraisal_disaster3,"[10.0, 10.0, 10.0]",2020-09-01 20:50:35
3,pogs02,GD_appraisal_disaster1,"[10.0, -10.0, -10.0]",2020-09-01 20:40:23
4,pogs02,GD_appraisal_disaster2,"[-1.0, -2.0, -3.0]",2020-09-01 20:44:52
5,pogs02,GD_appraisal_disaster3,"[10.0, 1.0, 1.0]",2020-09-01 20:50:56
6,pogs03,GD_appraisal_disaster1,"[0.0, 0.0, -10.0]",2020-09-01 20:40:37
7,pogs03,GD_appraisal_disaster2,"[-10.0, -1.0, 10.0]",2020-09-01 20:45:20
8,pogs03,GD_appraisal_disaster3,"[-10.0, 10.0, -10.0]",2020-09-01 20:51:04


In [367]:
# `DataFrame.as_matrix()` is deprecated and was removed in pandas 1.0;
# `.values` returns the same object array on every pandas version.
influences.values

  """Entry point for launching an IPython kernel.


array([['pogs01', 'GD_influence_disaster1', array([30., 30., 30.]),
        '2020-09-01 20:37:40'],
       ['pogs02', 'GD_influence_disaster1', array([100.,   0.,   0.]),
        '2020-09-01 20:37:48'],
       ['pogs03', 'GD_influence_disaster1', array([20., 60., 20.]),
        '2020-09-01 20:38:09'],
       ['pogs01', 'GD_influence_disaster2', array([  0.,   0., 100.]),
        '2020-09-01 20:43:50'],
       ['pogs02', 'GD_influence_disaster2', array([30., 30., 30.]),
        '2020-09-01 20:43:57'],
       ['pogs03', 'GD_influence_disaster2', array([30., 40., 30.]),
        '2020-09-01 20:44:02'],
       ['pogs01', 'GD_influence_disaster3', array([  0.,   0., 100.]),
        '2020-09-01 20:49:22'],
       ['pogs02', 'GD_influence_disaster3', array([40., 30., 30.]),
        '2020-09-01 20:49:35'],
       ['pogs03', 'GD_influence_disaster3', array([50.,  0., 50.]),
        '2020-09-01 20:49:43']], dtype=object)

In [368]:
# `DataFrame.as_matrix()` is deprecated and was removed in pandas 1.0;
# `.values` returns the same object array on every pandas version.
appraisals.values

  """Entry point for launching an IPython kernel.


array([['pogs01', 'GD_appraisal_disaster1', array([-1.,  2.,  0.]),
        '2020-09-01 20:39:35'],
       ['pogs01', 'GD_appraisal_disaster2', array([10., 10., 10.]),
        '2020-09-01 20:44:42'],
       ['pogs01', 'GD_appraisal_disaster3', array([10., 10., 10.]),
        '2020-09-01 20:50:35'],
       ['pogs02', 'GD_appraisal_disaster1', array([ 10., -10., -10.]),
        '2020-09-01 20:40:23'],
       ['pogs02', 'GD_appraisal_disaster2', array([-1., -2., -3.]),
        '2020-09-01 20:44:52'],
       ['pogs02', 'GD_appraisal_disaster3', array([10.,  1.,  1.]),
        '2020-09-01 20:50:56'],
       ['pogs03', 'GD_appraisal_disaster1', array([  0.,   0., -10.]),
        '2020-09-01 20:40:37'],
       ['pogs03', 'GD_appraisal_disaster2', array([-10.,  -1.,  10.]),
        '2020-09-01 20:45:20'],
       ['pogs03', 'GD_appraisal_disaster3', array([-10.,  10., -10.]),
        '2020-09-01 20:51:04']], dtype=object)