# ===========================================================
# Generating a supervised dataset from the Jeopardy-like logs
# ===========================================================

## Goals:
####   1. Generate different networks from log (sentiment, emotion, and reply duration based)
####   2. Generate text embedding data
####   3. Map all to influence (appraisal) matrix as the groundtruth to estimate
####   4. Use LSTM to take the order (time) also into account

#### Last update: 04 Dec 2019

# Imports

In [1]:
from __future__ import division, print_function, absolute_import, unicode_literals

import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
import imp
import networkx as nx
from collections import defaultdict
import sys
sys.path.insert(0, '../src/')
%matplotlib inline

# import Softmax_Loss
import text_processor
import pogs_jeopardy_log_lib
import broadcast_network_extraction
import utils
from mytools import Timer

  from ._conv import register_converters as _register_converters
  from pandas.core import datetools


In [2]:
def reload():
    """Re-import the project modules so that source edits take effect.

    Reloads, in this order: ``pogs_jeopardy_log_lib``, ``text_processor``,
    ``utils`` and ``broadcast_network_extraction``. Objects that were created
    from these modules before the call keep referring to the old code.
    """
    # ``imp`` is deprecated (and removed in Python 3.12); ``importlib.reload``
    # is its direct replacement. Imported locally so this cell is
    # self-contained.
    import importlib

    for module in (pogs_jeopardy_log_lib,
                   text_processor,
                   utils,
                   broadcast_network_extraction):
        importlib.reload(module)

In [3]:
reload()

In [4]:
# Shared helper objects used by the cells below:
#   net_extractor -- provides extract_network_from_broadcast(), used to build
#                    the per-window networks.
#   content_fixer -- provides translate_messages(), used to clean the message
#                    text; it is initialised from the slang word list file
#                    (presumably mapping slang to formal English -- TODO confirm).
net_extractor = broadcast_network_extraction.NetworkExtraction()
content_fixer = text_processor.FormalEnglishTranslator('../bagofwords/slang.txt')

# Parameters

In [5]:
# Root folder that every input CSV / pickle is read from and every output
# pickle is written to (paths are built with plain string concatenation, so
# the trailing slash is required).
directory = '/home/omid/Datasets/Jeopardy/'
# Passed unchanged as `time_window` to extract_network_from_broadcast().
# NOTE(review): units and the meaning of the two bounds are defined by the
# extractor, not here -- confirm before changing.
time_window = [2, 10]
# When True, every team's messages are run through content_fixer.
apply_content_fixer = True
# Forwarded as `fix_spelling` to content_fixer.translate_messages().
fix_spelling = False
# NOTE(review): not referenced by any cell visible in this notebook.
start_index = 0
# When True, an influence matrix is skipped unless the sum of the matching
# `member_influences_from_data` entry equals 16.
skip_matrices_not_completely_from_members = True

# Helper functions

In [34]:
# Shortcut cell: load artifacts pickled by earlier runs of this notebook
# instead of recomputing them with the cells below.
# NOTE(review): two file names here differ from what the later cells use --
# the dataset is saved as 'supervised_data_Jan28.pk' and the embeddings cell
# loads 'content_embeddings_with_bert_large_clssep_added.pk'. Confirm which
# versions are intended before relying on this cell.
data = utils.load_it(directory+'Teams_logs.pk')
contents = utils.load_it(directory+'Teams_contents.pk')
networks = utils.load_it(directory+'Teams_networks.pk')
supervised_data = utils.load_it(directory+'supervised_data.pk')
contents_embeddings = utils.load_it(directory + 'content_embeddings_with_bert_base.pk')
individual_performance_rates = utils.load_it(directory + 'Teams_individual_performance.pk')

# Loading teams' logs

In [6]:
# Build `data`: team id -> TeamLogProcessor, for every team listed in
# team.csv whose logs can be processed. Teams that fail are reported and
# left out of `data`.
with Timer():
    teams = pd.read_csv(
        directory+"team.csv",
        sep=',',
        quotechar="|",
        names=["id","sessionId","roundId", "taskId"])
    data = {}
    for team_id in teams.id:
        print("Processing team", team_id, '...')
        try:
            data[team_id] = pogs_jeopardy_log_lib.TeamLogProcessor(
                team_id=team_id, logs_directory_path=directory)
        except pogs_jeopardy_log_lib.EventLogsNotLoadedError:
            print('Team {} is not found in the logs. There is nothing we can do.'.format(team_id))
        except Exception as error:
            # Deliberately broad: one malformed team must not abort the whole
            # batch. Report what went wrong so the "Check." is actionable.
            print('Team {} had some problems. Check.'.format(team_id))
            print('    {}: {}'.format(type(error).__name__, error))

Processing team 7 ...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  event_log_no_message["sender_subject_id"] = pd.to_numeric(event_log_no_message["sender_subject_id"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  elNoMessage["sender_subject_id"] = pd.to_numeric(elNoMessage["sender_subject_id"])


Processing team 10 ...
Processing team 11 ...
Processing team 12 ...
Processing team 13 ...
Processing team 14 ...
Processing team 15 ...
Processing team 16 ...
Processing team 17 ...
Processing team 19 ...
Processing team 20 ...
Processing team 21 ...
Processing team 22 ...
Processing team 23 ...
Processing team 27 ...
Processing team 28 ...
Processing team 30 ...
Processing team 31 ...
Processing team 32 ...
Processing team 33 ...
Processing team 34 ...
Processing team 35 ...
Processing team 36 ...
Processing team 37 ...
Processing team 38 ...
Processing team 39 ...
Processing team 40 ...
Processing team 41 ...
Processing team 42 ...
Processing team 43 ...
Processing team 44 ...
Processing team 45 ...
Processing team 46 ...
Processing team 47 ...
Processing team 48 ...
Processing team 49 ...
Processing team 50 ...
Team 50 is not found in the logs. There is nothing we can do.
Processing team 54 ...
Team 54 is not found in the logs. There is nothing we can do.
Processing team 61 ...
Te

In [159]:
# Persist the parsed team logs under the file name that the loading cell
# near the top of the notebook reads ('Teams_logs.pk').
utils.save_it(data, directory+'Teams_logs.pk', verbose=True)

/home/omid/Datasets/Jeopardy/Teams_logs.pk is successfully saved.


In [160]:
# Sanity check: number of teams that were loaded, then their ids.
print(len(data))
data.keys()

49


dict_keys([7, 10, 11, 12, 13, 14, 15, 16, 17, 19, 20, 21, 22, 23, 27, 28, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 70, 71, 72, 73, 74, 75, 77, 79, 82, 84, 85, 87, 88])

In [33]:
# for team_id, team_log in data.items():
#     messagesby5 = len(team_log.messages) // 5
#     matrices = len(team_log.member_influences)
#     if messagesby5 != matrices:
#         print(team_id, ': ', messagesby5, matrices)

7 :  8 9
16 :  3 2
27 :  2 1
35 :  1 0
40 :  7 6
70 :  7 8


# Fixing the language of messages

In [41]:
# Clean the text of every message log in place (only when
# `apply_content_fixer` is set); the Timer reports the elapsed time.
with Timer():
    logs_to_fix = data.values() if apply_content_fixer else ()
    for team_log in logs_to_fix:
        # Assign by position so the cleaned log replaces the raw one.
        for position in range(len(team_log.messages)):
            raw_messages = team_log.messages[position]
            team_log.messages[position] = content_fixer.translate_messages(
                messages=raw_messages,
                message_column_name='event_content',
                fix_spelling=fix_spelling)

It took 1.20 seconds.


# Combining logs before reporting the appraisal matrices

In [42]:
# Build `combined_logs`: team id -> list of DataFrames, one per appraisal
# report, each the concatenation of the consecutive message logs that
# precede that report.

# Number of consecutive entries of `team_log.messages` merged per appraisal
# report (presumably one entry per question, with an appraisal reported after
# every five questions -- TODO confirm).
LOGS_PER_APPRAISAL_REPORT = 5

combined_logs = {}
for team_id, team_log in data.items():
    print("Processing team", team_id, '...')
    # Only keep windows that have both a full set of logs and a reported
    # influence matrix.
    this_team_number_of_networks = min(
        len(team_log.messages) // LOGS_PER_APPRAISAL_REPORT,
        len(team_log.member_influences))
    all_messages_before_appraisal_reports = []
    for window in range(this_team_number_of_networks):
        first_log = window * LOGS_PER_APPRAISAL_REPORT
        window_logs = [
            team_log.messages[log_index]
            for log_index in range(first_log, first_log + LOGS_PER_APPRAISAL_REPORT)]
        all_messages_before_appraisal_reports.append(pd.concat(window_logs))
    if all_messages_before_appraisal_reports:
        combined_logs[team_id] = all_messages_before_appraisal_reports
    else:
        print('Team', team_id, 'does not have enough logs.')

Processing team 7 ...
Processing team 10 ...
Processing team 11 ...
Processing team 12 ...
Processing team 13 ...
Processing team 14 ...
Processing team 15 ...
Processing team 16 ...
Processing team 17 ...
Processing team 19 ...
Processing team 20 ...
Processing team 21 ...
Processing team 22 ...
Processing team 23 ...
Processing team 27 ...
Processing team 28 ...
Processing team 30 ...
Processing team 31 ...
Processing team 32 ...
Processing team 33 ...
Processing team 34 ...
Processing team 35 ...
Team 35 does not have enough logs.
Processing team 36 ...
Processing team 37 ...
Processing team 38 ...
Processing team 39 ...
Processing team 40 ...
Processing team 41 ...
Processing team 42 ...
Processing team 43 ...
Processing team 44 ...
Processing team 45 ...
Processing team 46 ...
Processing team 47 ...
Processing team 48 ...
Processing team 49 ...
Processing team 70 ...
Processing team 71 ...
Processing team 72 ...
Team 72 does not have enough logs.
Processing team 73 ...
Team 73 doe

# Extracting different networks from the combined logs

In [12]:
# Build `networks`: team id -> list (one entry per combined log window) of
# dicts mapping a network name to the graph extracted for that window.
with Timer():
    # (result key, weight type, extra keyword arguments), listed in the order
    # the networks are extracted. Only the reply-duration network takes
    # `gamma`.
    network_specs = [
        ('reply_duration',
         broadcast_network_extraction.WeightType.REPLY_DURATION,
         {'gamma': 0.15}),
        ('sentiment',
         broadcast_network_extraction.WeightType.SENTIMENT,
         {}),
        ('emotion_arousal',
         broadcast_network_extraction.WeightType.EMOTION_AROUSAL,
         {}),
        ('emotion_dominance',
         broadcast_network_extraction.WeightType.EMOTION_DOMINANCE,
         {}),
        ('emotion_valence',
         broadcast_network_extraction.WeightType.EMOTION_VALENCE,
         {}),
    ]

    networks = {}
    for team_id, all_messages_before_appraisal_reports in combined_logs.items():
        print("Processing team", team_id, '...')
        this_team_nets = []
        for all_messages_before_appraisal_report in all_messages_before_appraisal_reports:
            window_nets = {}
            for net_name, weight_type, extra_kwargs in network_specs:
                window_nets[net_name] = net_extractor.extract_network_from_broadcast(
                    communication_data=all_messages_before_appraisal_report,
                    time_window=time_window,
                    weight_type=weight_type,
                    aggregation_type=broadcast_network_extraction.AggregationType.SUM,
                    node_list=data[team_id].members,
                    **extra_kwargs)
            # A window is kept only if its reply-duration network has nodes.
            # NOTE(review): dropping a window here shifts the positions of the
            # later windows in `this_team_nets`, while the dataset cell
            # indexes networks and influence matrices with the same index --
            # confirm no window is ever dropped in practice.
            if len(window_nets['reply_duration'].nodes()) > 0:
                this_team_nets.append(window_nets)
        if this_team_nets:
            networks[team_id] = this_team_nets
        else:
            print('Team', team_id, 'did not have enough networks.')

Processing team 7 ...
Processing team 10 ...
Processing team 11 ...
Processing team 12 ...
Processing team 13 ...
Processing team 14 ...
Processing team 15 ...
Processing team 16 ...
Processing team 17 ...
Processing team 19 ...
Processing team 20 ...
Processing team 21 ...
Processing team 22 ...
Processing team 23 ...
Processing team 27 ...
Processing team 28 ...
Processing team 30 ...
Processing team 31 ...
Processing team 32 ...
Processing team 33 ...
Processing team 34 ...
Processing team 36 ...
Processing team 37 ...
Processing team 38 ...
Processing team 39 ...
Processing team 40 ...
Processing team 41 ...
Processing team 42 ...
Processing team 43 ...
Processing team 44 ...
Processing team 45 ...
Processing team 46 ...
Processing team 47 ...
Processing team 48 ...
Processing team 49 ...
Processing team 70 ...
Processing team 71 ...
Processing team 74 ...
Processing team 75 ...
Processing team 77 ...
Processing team 79 ...
Processing team 82 ...
Processing team 84 ...
Processing t

In [376]:
# Persist the extracted networks under the file name that the loading cell
# near the top of the notebook reads ('Teams_networks.pk').
utils.save_it(networks, directory+'Teams_networks.pk', verbose=True)

/home/omid/Datasets/Jeopardy/Teams_networks.pk is successfully saved.


In [377]:
# Number of teams for which at least one set of networks was extracted.
len(networks)

46

In [378]:
# Teams present in `data` but absent from `networks`.
print('Theses teams did not have networks: ',
      set(data.keys()) - set(networks.keys()))

Theses teams did not have networks:  {72, 73, 35}


### (Just saving the appraisal matrices and performances)

In [15]:
# Compact per-team summary: only the scores and the reported influence
# matrices, keyed by team id.
short_data = {
    team_id: {
        'scores': team_log.score,
        'influences': team_log.member_influences,
    }
    for team_id, team_log in data.items()
}

In [16]:
# Persist the compact scores / influence-matrix summary.
utils.save_it(short_data, directory+'Appraisals_and_scores_data.pk')

# Extracting content of all texts that every person sent from combined logs

In [47]:
# Build `contents`: team id -> one list per combined log window -> one string
# per member (members in sorted order). Each string is that member's messages
# in the window, wrapped as '[CLS] msg [SEP] msg [SEP] ... [SEP]'.
contents = {}
for team_id, report_windows in combined_logs.items():
    print("Processing team", team_id, '...')
    ordered_members = sorted(data[team_id].members)
    contents[team_id] = [
        [
            '[CLS] ' + ' [SEP] '.join(
                window[window.sender_subject_id == member].event_content) + ' [SEP]'
            for member in ordered_members
        ]
        for window in report_windows
    ]

Processing team 7 ...
Processing team 10 ...
Processing team 11 ...
Processing team 12 ...
Processing team 13 ...
Processing team 14 ...
Processing team 15 ...
Processing team 16 ...
Processing team 17 ...
Processing team 19 ...
Processing team 20 ...
Processing team 21 ...
Processing team 22 ...
Processing team 23 ...
Processing team 27 ...
Processing team 28 ...
Processing team 30 ...
Processing team 31 ...
Processing team 32 ...
Processing team 33 ...
Processing team 34 ...
Processing team 36 ...
Processing team 37 ...
Processing team 38 ...
Processing team 39 ...
Processing team 40 ...
Processing team 41 ...
Processing team 42 ...
Processing team 43 ...
Processing team 44 ...
Processing team 45 ...
Processing team 46 ...
Processing team 47 ...
Processing team 48 ...
Processing team 49 ...
Processing team 70 ...
Processing team 71 ...
Processing team 74 ...
Processing team 75 ...
Processing team 77 ...
Processing team 79 ...
Processing team 82 ...
Processing team 84 ...
Processing t

In [48]:
# Persist the per-member concatenated texts under the file name that the
# loading cell near the top of the notebook reads ('Teams_contents.pk').
utils.save_it(contents, directory+'Teams_contents.pk', verbose=True)

/home/omid/Datasets/Jeopardy/Teams_contents.pk is successfully saved.


## The embedding vectors for the contents were generated with a BERT model in Colab; the resulting pickled file is loaded below

In [49]:
# Load the precomputed content embeddings. Exactly one line should be active;
# the commented-out lines are alternative embedding files.
# The dataset cell indexes the result as contents_embeddings[team_id][index].
contents_embeddings = utils.load_it(directory + 'content_embeddings_with_bert_large_clssep_added.pk')
# contents_embeddings = utils.load_it(directory + 'content_embeddings_with_bert_base_clssep_added.pk')
# contents_embeddings = utils.load_it(directory + 'content_embeddings_with_bert_base.pk')
# contents_embeddings = utils.load_it(directory + 'content_embeddings_with_roberta_base.pk')

# Extracting individual performance (skills) to add to the dataset

In [10]:
# Build `individual_performance_rates`: team id -> list with one entry per
# completed block of five questions. Each entry maps a member to the
# cumulative fraction of questions answered correctly so far, both plain and
# weighted by question hardness.
individual_performance_rates = defaultdict(list)

# Weight of a question in the hardness-weighted rate.
hardness_weights = {
    pogs_jeopardy_log_lib.Level.EASY: 1,
    pogs_jeopardy_log_lib.Level.MEDIUM: 2,
    pogs_jeopardy_log_lib.Level.HARD: 3}
# NOTE(review): the question bank is read from team 37 only and then used for
# every team; this presumes all teams share one question set -- confirm.
questions = data[37].game_info.questions
for team_id, team_log in data.items():
    # Running per-member counters, created on first access.
    this_team_members_performance = defaultdict(
        lambda: {'#correct': 0,
                 '#questions': 0,
                 '#hardness_weighted_correct': 0,
                 '#hardness_weighted_questions': 0})
    for index, qid in enumerate(team_log.question_order):
        question = questions[qid]
        question_hardness_weight = hardness_weights[question.level]
        correct_answer = question.answer
        for member, member_answer in team_log.individual_answers_chosen[qid].items():
            member_stats = this_team_members_performance[member]
            member_stats['#questions'] += 1
            member_stats['#hardness_weighted_questions'] += question_hardness_weight
            if member_answer == correct_answer:
                member_stats['#correct'] += 1
                member_stats['#hardness_weighted_correct'] += question_hardness_weight
        # Take a snapshot after every fifth question.
        if (index + 1) % 5 == 0:
            so_far_individual_performance = {}
            for member in team_log.members:
                member_stats = this_team_members_performance[member]
                if member_stats['#questions'] == 0:
                    # A listed member with no recorded answers used to raise
                    # ZeroDivisionError here. Their rate is undefined, so it
                    # is reported and stored as NaN instead of aborting.
                    print('Team {}: member {} has no recorded answers so far; '
                          'rates set to NaN.'.format(team_id, member))
                    correct_rate = float('nan')
                    hardness_weighted_correct_rate = float('nan')
                else:
                    correct_rate = (
                        member_stats['#correct'] / member_stats['#questions'])
                    hardness_weighted_correct_rate = (
                        member_stats['#hardness_weighted_correct']
                        / member_stats['#hardness_weighted_questions'])
                so_far_individual_performance[member] = {
                    'correct_rate_so_far': correct_rate,
                    'hardness_weighted_correct_rate_so_far': hardness_weighted_correct_rate}
            individual_performance_rates[team_id].append(so_far_individual_performance)

In [11]:
# Number of teams that have at least one performance snapshot.
len(individual_performance_rates)

47

In [225]:
# Persist the performance snapshots under the file name that the loading cell
# near the top of the notebook reads ('Teams_individual_performance.pk').
utils.save_it(individual_performance_rates, directory+'Teams_individual_performance.pk', verbose=True)

/home/omid/Datasets/Jeopardy/Teams_individual_performance.pk is successfully saved.


# Generating the dataset

In [55]:
def _row_stochastic_influence(raw_influence_matrix, member_order):
    """Reorder a reported influence matrix and make it row-stochastic.

    The raw matrix is permuted with ``member_order``, divided by 100, and
    passed through ``utils.make_matrix_row_stochastic``.

    Args:
        raw_influence_matrix: One entry of ``team_log.member_influences``.
        member_order: Permutation handed to
            ``utils.shuffle_matrix_in_given_order``.

    Returns:
        The resulting matrix as an ``np.matrix``.
    """
    normalized = utils.shuffle_matrix_in_given_order(
        matrix=np.matrix(raw_influence_matrix),
        order=member_order) / 100
    return np.matrix(utils.make_matrix_row_stochastic(normalized))


# Build the supervised dataset. For every team that has networks, the first
# usable influence matrix seeds the "first / previous / average" features;
# every later usable matrix yields one (X, y) sample.
with Timer():
    X = []
    y = []
    for team_id, team_log in data.items():
        if team_id not in networks:
            continue
        print("Processing team", team_id, '...')
        team_networks = networks[team_id]
        member_order = np.argsort(team_log.members)
        sorted_members = sorted(team_log.members)

        # First influence matrix: the earliest index that is not skipped.
        first_index = None
        for candidate_index in range(len(team_networks)):
            if skip_matrices_not_completely_from_members and np.sum(team_log.member_influences_from_data[candidate_index]) != 16:
                print('E1: Index: {} was skipped.'.format(candidate_index))
                continue
            first_index = candidate_index
            break
        if first_index is None:
            # Previously this fell through and either raised NameError or
            # reused the matrices left over from the previous team.
            print('Team {} has no usable first influence matrix; skipped.'.format(team_id))
            continue
        first_row_stochastic_normalized_influence_matrix = _row_stochastic_influence(
            team_log.member_influences[first_index], member_order)
        previous_row_stochastic_normalized_influence_matrix = (
            first_row_stochastic_normalized_influence_matrix.copy())

        # Running sum and count of the usable matrices seen so far; their
        # ratio is the "average of previous influence matrices" feature.
        previous_influence_matrices_cnt = 1
        average_of_previous_influence_matrices = (
            first_row_stochastic_normalized_influence_matrix.copy())
        for index in range(first_index + 1, len(team_networks)):
            if skip_matrices_not_completely_from_members and np.sum(team_log.member_influences_from_data[index]) != 16:
                print('E2: Index: {} was skipped.'.format(index))
                continue

            # Individual performance, one slot per member in sorted order.
            # NOTE(review): the length 4 and the sum 16 above presume
            # four-member teams -- confirm.
            individual_performance = np.zeros(4)
            individual_performance_hardness_weighted = np.zeros(4)
            perf_rates = individual_performance_rates[team_id][index]
            for i, member in enumerate(sorted_members):
                individual_performance[i] = perf_rates[member]['correct_rate_so_far']
                individual_performance_hardness_weighted[i] = perf_rates[member]['hardness_weighted_correct_rate_so_far']

            # Networks:
            network = team_networks[index]

            # Contents:
            contents_embedding = contents_embeddings[team_id][index]

            # Target influence matrix for this sample:
            row_stochastic_normalized_influence_matrix = _row_stochastic_influence(
                team_log.member_influences[index], member_order)

            # Multi-class classification (who is (are) the most influential individual(s)):
            most_influentials = utils.most_influential_on_others(
                influence_matrix=row_stochastic_normalized_influence_matrix,
                remove_self_influence=True)

            # Combining all features together.
            # NOTE(review): the adjacency matrices follow each graph's own
            # node order, whereas the other features use sorted member order;
            # confirm the extractor inserts nodes in sorted order.
            # `nx.adjacency_matrix` replaces the alias `nx.adj_matrix`, which
            # was removed in networkx 3.0.
            y.append({
                'influence_matrix': row_stochastic_normalized_influence_matrix,
                'most_influentials': most_influentials})
            X.append({
                'individual_performance': individual_performance,
                'individual_performance_hardness_weighted': individual_performance_hardness_weighted,
                'content_embedding_matrix': contents_embedding,
                'first_influence_matrix': first_row_stochastic_normalized_influence_matrix,
                'previous_influence_matrix': previous_row_stochastic_normalized_influence_matrix,
                'average_of_previous_influence_matrices': average_of_previous_influence_matrices / previous_influence_matrices_cnt,
                'reply_duration': nx.adjacency_matrix(network['reply_duration']).todense(),
                'sentiment': nx.adjacency_matrix(network['sentiment']).todense(),
                'emotion_arousal': nx.adjacency_matrix(network['emotion_arousal']).todense(),
                'emotion_dominance': nx.adjacency_matrix(network['emotion_dominance']).todense(),
                'emotion_valence': nx.adjacency_matrix(network['emotion_valence']).todense()})
            previous_row_stochastic_normalized_influence_matrix = row_stochastic_normalized_influence_matrix.copy()
            average_of_previous_influence_matrices += row_stochastic_normalized_influence_matrix
            previous_influence_matrices_cnt += 1

    supervised_data = {'X': X, 'y': y}

Processing team 7 ...
E1: Index: 0 was skipped.
E1: Index: 1 was skipped.
E1: Index: 2 was skipped.
E1: Index: 3 was skipped.
E2: Index: 5 was skipped.
E2: Index: 6 was skipped.
E2: Index: 7 was skipped.
Processing team 10 ...
Processing team 11 ...
E2: Index: 1 was skipped.
Processing team 12 ...
Processing team 13 ...
Processing team 14 ...
E2: Index: 2 was skipped.
Processing team 15 ...
E1: Index: 0 was skipped.
E2: Index: 2 was skipped.
Processing team 16 ...
Processing team 17 ...
Processing team 19 ...
Processing team 20 ...
Processing team 21 ...
E2: Index: 3 was skipped.
E2: Index: 4 was skipped.
E2: Index: 6 was skipped.
Processing team 22 ...
E2: Index: 3 was skipped.
Processing team 23 ...
Processing team 27 ...
Processing team 28 ...
Processing team 30 ...
E2: Index: 4 was skipped.
Processing team 31 ...
Processing team 32 ...
Processing team 33 ...
E2: Index: 1 was skipped.
E2: Index: 8 was skipped.
Processing team 34 ...
E2: Index: 1 was skipped.
Processing team 36 ...
P

In [56]:
# Number of samples in the generated dataset.
len(supervised_data['y'])

264

In [57]:
# Number of features in the first sample, then the feature names.
print(len(supervised_data['X'][0].keys()))
supervised_data['X'][0].keys()

11


dict_keys(['individual_performance_hardness_weighted', 'previous_influence_matrix', 'reply_duration', 'emotion_valence', 'average_of_previous_influence_matrices', 'sentiment', 'individual_performance', 'content_embedding_matrix', 'emotion_arousal', 'first_influence_matrix', 'emotion_dominance'])

In [60]:
# Persist the dataset.
# NOTE(review): the loading cell near the top of the notebook reads
# 'supervised_data.pk', not this dated file name -- confirm which is intended.
utils.save_it(supervised_data, directory+'supervised_data_Jan28.pk', verbose=True)

/home/omid/Datasets/Jeopardy/supervised_data_Jan28.pk is successfully saved.


In [16]:
# Second copy of the dataset written with pickle protocol 2, the highest
# protocol that Python 2 can read (presumably for a Python 2 consumer --
# TODO confirm).
import pickle as pk
with open(directory+'supervised_data_Jan28_pickle2.pk', 'wb') as handle:
    pk.dump(supervised_data, handle, protocol=2)