# ===========================================================
# Generating a supervised dataset from the Jeopardy-like logs
# ===========================================================

## Goals:
####   1. Generate different networks from log (sentiment, emotion, and reply duration based)
####   2. Generate text embedding data
####   3. Map all to influence (appraisal) matrix as the groundtruth to estimate
####   4. Use LSTM to take the order (time) also into account

#### Last update: 02 Dec 2019

# Imports

In [1]:
from __future__ import division, print_function, absolute_import, unicode_literals

import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
import json
import re
import heapq
import imp
import networkx as nx
from collections import defaultdict
import sys
sys.path.insert(0, '../src/')
%matplotlib inline

# import Softmax_Loss
import text_processor
import pogs_jeopardy_log_lib
import broadcast_network_extraction
import utils
from mytimer import Timer

  from ._conv import register_converters as _register_converters


In [2]:
def reload():
    """Re-import the project helper modules so on-disk edits take effect.

    Reloads ``pogs_jeopardy_log_lib``, ``text_processor``, ``utils`` and
    ``broadcast_network_extraction``, in that order. Objects that were
    instantiated from these modules before the call keep their old class
    definitions and have to be re-created by the caller.
    """
    # ``imp`` is deprecated and no longer ships with Python 3.12+;
    # ``importlib.reload`` is the supported replacement. The import is local
    # so this cell does not depend on a change to the import cell.
    import importlib

    for module in (pogs_jeopardy_log_lib,
                   text_processor,
                   utils,
                   broadcast_network_extraction):
        importlib.reload(module)

In [217]:
# Re-import the project modules listed in reload() before using them below.
reload()

In [218]:
# Shared helper objects used by the cells below:
#   net_extractor - builds the per-team networks from the message logs.
#   content_fixer - rewrites message text; it is constructed with the path of
#                   a slang word list.
net_extractor = broadcast_network_extraction.NetworkExtraction()
content_fixer = text_processor.FormalEnglishTranslator('../bagofwords/slang.txt')

# Parameters

In [99]:
# Base directory that holds team.csv and the event logs; the pickled outputs
# of this notebook are written here as well.
directory = '/home/omid/Datasets/Jeopardy/'
# Passed as ``time_window`` to every extract_network_from_broadcast call.
# NOTE(review): units and exact meaning of the two bounds are defined by
# broadcast_network_extraction - confirm there.
time_window = [2, 10]
# When True, every message frame is passed through content_fixer first.
apply_content_fixer = True
# Forwarded to content_fixer.translate_messages as ``fix_spelling``.
fix_spelling = False

# Helper functions

# Loading teams' logs

In [6]:
with Timer():
    # Explicit column names are supplied, so pandas reads the first row of
    # team.csv as data rather than as a header.
    teams = pd.read_csv(
        directory + "team.csv",
        sep=',',
        quotechar="|",
        names=["id", "sessionId", "roundId", "taskId"])

    # team id -> TeamLogProcessor, only for teams whose logs loaded cleanly.
    data = {}
    for team_id in teams.id:
        print("Processing team", team_id, '...')
        try:
            data[team_id] = pogs_jeopardy_log_lib.TeamLogProcessor(
                team_id=team_id, logs_directory_path=directory)
        except pogs_jeopardy_log_lib.EventLogsNotLoadedError:
            # Expected for teams without event logs: skip them.
            print('Team {} is not found in the logs. There is nothing we can do.'.format(team_id))
        except Exception as err:
            # Deliberately best-effort: one bad team must not abort the whole
            # load, but the cause is reported so that it can be investigated.
            print('Team {} had some problems. Check.'.format(team_id))
            print('    {}: {}'.format(type(err).__name__, err))

Processing team 7 ...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  event_log_no_message["sender_subject_id"] = pd.to_numeric(event_log_no_message["sender_subject_id"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  elNoMessage["sender_subject_id"] = pd.to_numeric(elNoMessage["sender_subject_id"])


Processing team 10 ...
Processing team 11 ...
Processing team 12 ...
Processing team 13 ...
Processing team 14 ...
Processing team 15 ...
Processing team 16 ...
Processing team 17 ...
Processing team 19 ...
Processing team 20 ...
Processing team 21 ...
Processing team 22 ...
Processing team 23 ...
Processing team 27 ...
Processing team 28 ...
Processing team 30 ...
Processing team 31 ...
Processing team 32 ...
Processing team 33 ...
Processing team 34 ...
Processing team 35 ...
Processing team 36 ...
Processing team 37 ...
Processing team 38 ...
Processing team 39 ...
Processing team 40 ...
Processing team 41 ...
Processing team 42 ...
Processing team 43 ...
Processing team 44 ...
Processing team 45 ...
Processing team 46 ...
Processing team 47 ...
Processing team 48 ...
Processing team 49 ...
Processing team 50 ...
Team 50 is not found in the logs. There is nothing we can do.
Processing team 54 ...
Team 54 is not found in the logs. There is nothing we can do.
Processing team 61 ...
Te

In [10]:
# Persist the loaded team logs as Teams_logs.pk inside the dataset directory.
utils.save_it(data, directory+'Teams_logs.pk', verbose=True)

/home/omid/Datasets/Jeopardy/Teams_logs.pk is successfully saved.


In [11]:
# Number of teams whose logs were loaded, followed by their ids.
print(len(data))
data.keys()

49


dict_keys([7, 10, 11, 12, 13, 14, 15, 16, 17, 19, 20, 21, 22, 23, 27, 28, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 70, 71, 72, 73, 74, 75, 77, 79, 82, 84, 85, 87, 88])

In [33]:
# for team_id, team_log in data.items():
#     messagesby5 = len(team_log.messages) // 5
#     matrices = len(team_log.member_influences)
#     if messagesby5 != matrices:
#         print(team_id, ': ', messagesby5, matrices)

7 :  8 9
16 :  3 2
27 :  2 1
35 :  1 0
40 :  7 6
70 :  7 8


# Fixing the language of messages

In [94]:
with Timer():
    # Replace every message frame of every team, in place, with the version
    # returned by the content fixer. Skipped when the switch is off.
    if apply_content_fixer:
        for log in data.values():
            frames = log.messages
            for position in range(len(frames)):
                frames[position] = content_fixer.translate_messages(
                    messages=frames[position],
                    message_column_name='event_content',
                    fix_spelling=fix_spelling)

It took 1.10 seconds.


# Combining logs before reporting the appraisal matrices

In [254]:
# Each influence (appraisal) matrix is paired with this many consecutive
# entries of ``team_log.messages``.
logs_per_report = 5

# team id -> list of DataFrames, one per appraisal report, each being the
# concatenation of the message frames grouped with that report.
combined_logs = {}
for team_id, team_log in data.items():
    print("Processing team", team_id, '...')
    # Only build as many groups as there are both complete message groups
    # and reported influence matrices.
    this_team_number_of_networks = min(
        len(team_log.messages) // logs_per_report,
        len(team_log.member_influences))
    all_messages_before_appraisal_reports = []
    for report_index in range(this_team_number_of_networks):
        first_log = report_index * logs_per_report
        all_messages_before_appraisal_reports.append(
            pd.concat(
                [team_log.messages[log_index]
                 for log_index in range(first_log, first_log + logs_per_report)]))
    if all_messages_before_appraisal_reports:
        combined_logs[team_id] = all_messages_before_appraisal_reports
    else:
        print('Team', team_id, 'does not have enough logs.')

Processing team 7 ...
Processing team 10 ...
Processing team 11 ...
Processing team 12 ...
Processing team 13 ...
Processing team 14 ...
Processing team 15 ...
Processing team 16 ...
Processing team 17 ...
Processing team 19 ...
Processing team 20 ...
Processing team 21 ...
Processing team 22 ...
Processing team 23 ...
Processing team 27 ...
Processing team 28 ...
Processing team 30 ...
Processing team 31 ...
Processing team 32 ...
Processing team 33 ...
Processing team 34 ...
Processing team 35 ...
Team 35 does not have enough logs.
Processing team 36 ...
Processing team 37 ...
Processing team 38 ...
Processing team 39 ...
Processing team 40 ...
Processing team 41 ...
Processing team 42 ...
Processing team 43 ...
Processing team 44 ...
Processing team 45 ...
Processing team 46 ...
Processing team 47 ...
Processing team 48 ...
Processing team 49 ...
Processing team 70 ...
Processing team 71 ...
Processing team 72 ...
Team 72 does not have enough logs.
Processing team 73 ...
Team 73 doe

# Extracting different networks from the combined logs

In [308]:
with Timer():
    # team id -> list of dicts (one per appraisal report) holding the five
    # networks extracted from that report's combined messages.
    networks = {}

    # NOTE(review): the recorded output of this cell ends with
    # "TypeError: extract_network_from_broadcast() got an unexpected keyword
    # argument 'node_list'" - confirm the current signature in
    # broadcast_network_extraction before re-running.
    weight_types = broadcast_network_extraction.WeightType
    average = broadcast_network_extraction.AggregationType.AVERAGE

    # (result key, weight type, extra keyword arguments), in call order.
    # Only the reply-duration network takes ``gamma``.
    extraction_plan = (
        ('reply_duration', weight_types.REPLY_DURATION, {'gamma': 0.15}),
        ('sentiment', weight_types.SENTIMENT, {}),
        ('emotion_arousal', weight_types.EMOTION_AROUSAL, {}),
        ('emotion_dominance', weight_types.EMOTION_DOMINANCE, {}),
        ('emotion_valence', weight_types.EMOTION_VALENCE, {}),
    )
    # Key order of the stored per-report dictionary.
    stored_keys = (
        'sentiment',
        'reply_duration',
        'emotion_arousal',
        'emotion_dominance',
        'emotion_valence',
    )

    for team_id, report_logs in combined_logs.items():
        print("Processing team", team_id, '...')
        this_team_nets = []
        for report_log in report_logs:
            extracted = {}
            for key, weight_type, extra_kwargs in extraction_plan:
                extracted[key] = net_extractor.extract_network_from_broadcast(
                    communication_data=report_log,
                    time_window=time_window,
                    weight_type=weight_type,
                    aggregation_type=average,
                    node_list=data[team_id].members,
                    **extra_kwargs)
            # A report is kept only when its reply-duration network has nodes.
            if len(extracted['reply_duration'].nodes()) > 0:
                this_team_nets.append(
                    {key: extracted[key] for key in stored_keys})
        if len(this_team_nets) > 0:
            networks[team_id] = this_team_nets
        else:
            print('Team', team_id, 'did not have enough networks.')

Processing team 7 ...
It took 0.02 seconds.


TypeError: extract_network_from_broadcast() got an unexpected keyword argument 'node_list'

In [293]:
# Persist the extracted networks as Teams_networks.pk inside the dataset directory.
utils.save_it(networks, directory+'Teams_networks.pk', verbose=True)

/home/omid/Datasets/Jeopardy/Teams_networks.pk is successfully saved.


In [294]:
# Number of teams that ended up with at least one set of networks.
len(networks)

46

In [295]:
# Teams whose logs were loaded but that have no entry in ``networks``.
print('Theses teams did not have networks: ',
      set(data.keys()) - set(networks.keys()))

Theses teams did not have networks:  {72, 73, 35}


# Extracting content of all texts that every person sent from combined logs

In [276]:
# team id -> one list per appraisal report; each inner list holds, for every
# member in sorted order, that member's messages joined by single spaces
# (an empty string when the member sent nothing in that report's window).
contents = {}
for team_id, report_logs in combined_logs.items():
    print("Processing team", team_id, '...')
    ordered_members = sorted(data[team_id].members)
    contents[team_id] = [
        [' '.join(
            report_log[report_log.sender_subject_id == member].event_content)
         for member in ordered_members]
        for report_log in report_logs]

Processing team 7 ...
Processing team 10 ...
Processing team 11 ...
Processing team 12 ...
Processing team 13 ...
Processing team 14 ...
Processing team 15 ...
Processing team 16 ...
Processing team 17 ...
Processing team 19 ...
Processing team 20 ...
Processing team 21 ...
Processing team 22 ...
Processing team 23 ...
Processing team 27 ...
Processing team 28 ...
Processing team 30 ...
Processing team 31 ...
Processing team 32 ...
Processing team 33 ...
Processing team 34 ...
Processing team 36 ...
Processing team 37 ...
Processing team 38 ...
Processing team 39 ...
Processing team 40 ...
Processing team 41 ...
Processing team 42 ...
Processing team 43 ...
Processing team 44 ...
Processing team 45 ...
Processing team 46 ...
Processing team 47 ...
Processing team 48 ...
Processing team 49 ...
Processing team 70 ...
Processing team 71 ...
Processing team 74 ...
Processing team 75 ...
Processing team 77 ...
Processing team 79 ...
Processing team 82 ...
Processing team 84 ...
Processing t

In [280]:
# Persist the per-member message texts as Teams_contents.pk inside the dataset directory.
utils.save_it(contents, directory+'Teams_contents.pk', verbose=True)

/home/omid/Datasets/Jeopardy/Teams_contents.pk is successfully saved.


# Generating the dataset

In [298]:
# Keys of the per-report network dictionaries that become feature matrices.
network_keys = (
    'reply_duration',
    'sentiment',
    'emotion_arousal',
    'emotion_dominance',
    'emotion_valence',
)

# One sample per (team, appraisal report): the reported influence matrix as
# the ground truth plus the dense adjacency matrix of every network.
supervised_data = []
for team_id, team_log in data.items():
    print("Processing team", team_id, '...')
    if team_id not in networks:
        continue
    # The influence matrix is re-ordered with the argsort of the member ids.
    # NOTE(review): the adjacency matrices below use each graph's own node
    # order - confirm that the extractor yields nodes in sorted member order,
    # otherwise features and ground truth are not aligned.
    member_order = np.argsort(team_log.members)
    # NOTE(review): ``networks[team_id]`` only contains reports whose
    # reply-duration network had nodes, so ``index`` may not line up with
    # ``member_influences`` if an earlier report was dropped - verify.
    for index, network in enumerate(networks[team_id]):
        sample = {
            'influence_matrix': utils.shuffle_matrix_in_given_order(
                matrix=np.matrix(team_log.member_influences[index]),
                order=member_order)}
        for key in network_keys:
            # ``nx.adj_matrix`` was a deprecated alias that NetworkX 3.0
            # removed; ``adjacency_matrix`` is the function it pointed to.
            sample[key] = nx.adjacency_matrix(network[key]).todense()
        supervised_data.append(sample)

Processing team 7 ...
Processing team 10 ...
Processing team 11 ...
Processing team 12 ...
Processing team 13 ...
Processing team 14 ...
Processing team 15 ...
Processing team 16 ...
Processing team 17 ...
Processing team 19 ...
Processing team 20 ...
Processing team 21 ...
Processing team 22 ...
Processing team 23 ...
Processing team 27 ...
Processing team 28 ...
Processing team 30 ...
Processing team 31 ...
Processing team 32 ...
Processing team 33 ...
Processing team 34 ...
Processing team 35 ...
Processing team 36 ...
Processing team 37 ...
Processing team 38 ...
Processing team 39 ...
Processing team 40 ...
Processing team 41 ...
Processing team 42 ...
Processing team 43 ...
Processing team 44 ...
Processing team 45 ...
Processing team 46 ...
Processing team 47 ...
Processing team 48 ...
Processing team 49 ...
Processing team 70 ...
Processing team 71 ...
Processing team 72 ...
Processing team 73 ...
Processing team 74 ...
Processing team 75 ...
Processing team 77 ...
Processing t

In [302]:
# Total number of generated samples.
len(supervised_data)

325

In [305]:
# Count the samples whose reply-duration adjacency matrix does not have
# exactly 4 rows.
cnt = sum(
    1 for sample in supervised_data
    if sample['reply_duration'].shape[0] != 4)

In [307]:
# Fraction of samples counted in ``cnt``. The denominator is taken from the
# dataset itself instead of a hard-coded 325, so the ratio stays correct when
# the dataset is regenerated with a different number of samples.
cnt / len(supervised_data)

0.19076923076923077