# ===========================================================
# Generating a supervised dataset from the Jeopardy-like logs
# ===========================================================

## Goals:
####   1. Generate different networks from log (sentiment, emotion, and reply duration based)
####   2. Generate text embedding data
####   3. Map all of them to the influence (appraisal) matrix, which is the ground truth to estimate
####   4. Use LSTM to take the order (time) also into account

#### Last update: 02 Dec 2019

# Imports

In [1]:
from __future__ import division, print_function, absolute_import, unicode_literals

import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
import json
import re
import heapq
import imp
import networkx as nx
from collections import defaultdict
import sys
sys.path.insert(0, '../src/')
%matplotlib inline

# import Softmax_Loss
import text_processor
import pogs_jeopardy_log_lib
import broadcast_network_extraction
import utils
from mytimer import Timer

  from ._conv import register_converters as _register_converters


In [2]:
def reload():
    """Reload the project helper modules imported by this notebook.

    Re-executes ``pogs_jeopardy_log_lib``, ``text_processor``, ``utils`` and
    ``broadcast_network_extraction`` (in that order) so that edits to their
    source files take effect without restarting the kernel. Objects that
    were instantiated before the reload keep referring to the old class
    definitions and must be re-created to pick up the changes.
    """
    # ``imp`` is deprecated since Python 3.4 and removed in Python 3.12;
    # ``importlib.reload`` is the supported replacement. Imported locally so
    # this cell does not depend on the notebook's import cell being edited.
    import importlib

    for module in (
            pogs_jeopardy_log_lib,
            text_processor,
            utils,
            broadcast_network_extraction):
        importlib.reload(module)

In [45]:
# Re-import the helper modules so that any edits made to them are picked up.
reload()

In [46]:
# Extractor whose extract_network_from_broadcast() builds the per-team
# networks from the message logs further down in this notebook.
net_extractor = broadcast_network_extraction.NetworkExtraction()
# Translator whose translate_messages() is applied to the message text when
# apply_content_fixer is set. It is constructed from the file at the given
# path (presumably a slang word list -- confirm against text_processor).
content_fixer = text_processor.FormalEnglishTranslator('../bagofwords/slang.txt')

# Parameters

In [69]:
# Dataset root: team.csv is read from here, the log processor is pointed at
# it, and the pickled results are written back into it.
directory = '/home/omid/Datasets/Jeopardy/'
# Passed unchanged as `time_window` to extract_network_from_broadcast().
# NOTE(review): units and the meaning of the two bounds are defined by
# broadcast_network_extraction -- confirm there.
time_window = [2, 10]
# When True, every message log is passed through content_fixer before the
# networks are extracted.
apply_content_fixer = True
# Forwarded as `fix_spelling` to content_fixer.translate_messages().
fix_spelling = False

# Helper functions

# Loading teams' logs

In [6]:
with Timer():
    # Read the team index. Column names are supplied explicitly, so pandas
    # treats the file as having no header row.
    teams = pd.read_csv(
        directory + "team.csv",
        sep=',',
        quotechar="|",
        names=["id", "sessionId", "roundId", "taskId"])
    # team id -> TeamLogProcessor, for every team whose logs could be loaded.
    data = {}
    for team_id in teams.id:
        print("Processing team", team_id, '...')
        try:
            data[team_id] = pogs_jeopardy_log_lib.TeamLogProcessor(
                team_id=team_id, logs_directory_path=directory)
        except pogs_jeopardy_log_lib.EventLogsNotLoadedError:
            # Expected for teams that have no event logs; skip them.
            print('Team {} is not found in the logs. There is nothing we can do.'.format(team_id))
        except Exception as error:
            # Best effort: keep loading the remaining teams, but report what
            # actually went wrong so the "Check." request is actionable
            # instead of silently discarding the cause.
            print('Team {} had some problems. Check. ({}: {})'.format(
                team_id, type(error).__name__, error))

Processing team 7 ...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  event_log_no_message["sender_subject_id"] = pd.to_numeric(event_log_no_message["sender_subject_id"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  elNoMessage["sender_subject_id"] = pd.to_numeric(elNoMessage["sender_subject_id"])


Processing team 10 ...
Processing team 11 ...
Processing team 12 ...
Processing team 13 ...
Processing team 14 ...
Processing team 15 ...
Processing team 16 ...
Processing team 17 ...
Processing team 19 ...
Processing team 20 ...
Processing team 21 ...
Processing team 22 ...
Processing team 23 ...
Processing team 27 ...
Processing team 28 ...
Processing team 30 ...
Processing team 31 ...
Processing team 32 ...
Processing team 33 ...
Processing team 34 ...
Processing team 35 ...
Processing team 36 ...
Processing team 37 ...
Processing team 38 ...
Processing team 39 ...
Processing team 40 ...
Processing team 41 ...
Processing team 42 ...
Processing team 43 ...
Processing team 44 ...
Processing team 45 ...
Processing team 46 ...
Processing team 47 ...
Processing team 48 ...
Processing team 49 ...
Processing team 50 ...
Team 50 is not found in the logs. There is nothing we can do.
Processing team 54 ...
Team 54 is not found in the logs. There is nothing we can do.
Processing team 61 ...
Te

In [10]:
# Save the loaded per-team log objects into the dataset directory.
utils.save_it(data, directory+'Teams_logs.pk', verbose=True)

/home/omid/Datasets/Jeopardy/Teams_logs.pk is successfully saved.


In [11]:
# Number of teams whose logs were loaded successfully ...
print(len(data))
# ... and their ids (displayed as the cell's result).
data.keys()

49


dict_keys([7, 10, 11, 12, 13, 14, 15, 16, 17, 19, 20, 21, 22, 23, 27, 28, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 70, 71, 72, 73, 74, 75, 77, 79, 82, 84, 85, 87, 88])

In [33]:
# for team_id, team_log in data.items():
#     messagesby5 = len(team_log.messages) // 5
#     matrices = len(team_log.member_influences)
#     if messagesby5 != matrices:
#         print(team_id, ': ', messagesby5, matrices)

7 :  8 9
16 :  3 2
27 :  2 1
35 :  1 0
40 :  7 6
70 :  7 8


# Fixing the language of messages

In [94]:
with Timer():
    # Replace every message log of every team, in place, with the result of
    # content_fixer.translate_messages(); nothing is touched when the
    # apply_content_fixer switch is off.
    logs_to_fix = data.values() if apply_content_fixer else []
    for team_log in logs_to_fix:
        message_logs = team_log.messages
        for position in range(len(message_logs)):
            fixed_log = content_fixer.translate_messages(
                messages=message_logs[position],
                message_column_name='event_content',
                fix_spelling=fix_spelling)
            message_logs[position] = fixed_log

It took 1.10 seconds.


# Extracting different networks from the combined logs before reporting appraisal networks

In [96]:
with Timer():
    # Message logs are grouped in consecutive chunks of this size; each chunk
    # is concatenated and turned into one set of networks.
    logs_per_report = 5
    weight_types = broadcast_network_extraction.WeightType
    # Output key -> edge weight definition used for that network.
    # The original cell built the 'sentiment' network with REPLY_DURATION
    # (copy-paste slip), making it a duplicate of 'reply_duration'.
    # NOTE(review): assumes WeightType defines a SENTIMENT member -- confirm
    # in broadcast_network_extraction.
    network_kinds = [
        ('sentiment', weight_types.SENTIMENT),
        ('reply_duration', weight_types.REPLY_DURATION),
        ('emotion_arousal', weight_types.EMOTION_AROUSAL),
        ('emotion_dominance', weight_types.EMOTION_DOMINANCE),
        ('emotion_valence', weight_types.EMOTION_VALENCE)]

    # team id -> list of {network name: network}, one dict per chunk.
    networks = {}
    for team_id, team_log in data.items():
        # Never build more networks than there are reported influence
        # matrices; otherwise networks would exist without a matching
        # ground truth. (This bound was computed but unused before.)
        this_team_number_of_networks = min(
            len(team_log.messages) // logs_per_report,
            len(team_log.member_influences))
        this_team_nets = []
        for report_index in range(this_team_number_of_networks):
            first_log = report_index * logs_per_report
            all_messages_before_appraisal_report = pd.concat(
                [team_log.messages[log_index]
                 for log_index in range(first_log, first_log + logs_per_report)])
            # NOTE(review): the recorded run of this cell ended with
            # "IndexError: single positional indexer is out-of-bounds"; the
            # raising line is not visible here, so it is not handled.
            this_team_nets.append({
                name: net_extractor.extract_network_from_broadcast(
                    communication_data=all_messages_before_appraisal_report,
                    time_window=time_window,
                    weight_type=weight_type,
                    aggregation_type=broadcast_network_extraction.AggregationType.AVERAGE)
                for name, weight_type in network_kinds})
        # NOTE(review): teams with exactly one network are dropped as well;
        # confirm that `> 1` (rather than `> 0`) is intended.
        if len(this_team_nets) > 1:
            networks[team_id] = this_team_nets

It took 35.49 seconds.


IndexError: single positional indexer is out-of-bounds

In [None]:
# Save the extracted per-team networks into the dataset directory.
utils.save_it(networks, directory+'Teams_networks.pk', verbose=True)