In [145]:
import pandas as pd
import numpy as np
import glob
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import json
from collections import Counter

In [94]:
def load_json_data(path):
    """Read a JSON file and return its parsed content.

    Args:
        path: Location of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file's top-level value.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default text encoding.
    with open(path, encoding="utf-8") as f:
        return json.load(f)

In [100]:
# Folder prefix for the aggregated annotation files read by this notebook.
# NOTE(review): the value ends with "/" and the cells that use it prepend
# another "/" to each file name, so the resulting paths contain "//" —
# confirm this is intended.
DATA_FOLDER = "./data/insq/agg/"
# Folder prefix used when globbing for the raw training spreadsheets.
RAW_TRAIN_DATA = "./data/insq/train/"

In [138]:
# Load the three annotation splits stored under DATA_FOLDER, in the order
# test, dev, train.
test_data, dev_data, train_data = [
    load_json_data(DATA_FOLDER + file_name)
    for file_name in ("/test_valid.json", "/dev_valid.json", "/train.json")
]

In [97]:
# Pool the test and dev annotations into one list for the statistics below.
# A new list is built instead of aliasing `test_data` and extending it in
# place: the in-place version silently grew `test_data` itself and appended
# `dev_data` again every time this cell was re-run.
total_annotation = [*test_data, *dev_data]

In [98]:
# One tally per label type: how many annotations lack a majority vote
# ("count") and which annotations those are ("cases").
even_vote_counts = {
    label_name: {"count": 0, "cases": []}
    for label_name in (
        "informational motive",
        "social motive",
        "coordinative motive",
        "dialogue act",
        "target speaker(s)",
    )
}


In [99]:
def is_vote_even(votes):
    """Report whether no single vote value holds a strict majority.

    Args:
        votes: Mapping whose values are the individual votes; only the
            values are compared, the keys are just counted.

    Returns:
        False when the most frequent value accounts for more than half of
        the entries in ``votes``; True otherwise (an exact 50% share counts
        as even).

    Raises:
        IndexError: If ``votes`` is empty.
    """
    tally = Counter(votes.values())
    # most_common(1) yields [(value, count)] for the most frequent value.
    _, top_count = tally.most_common(1)[0]
    majority_share = top_count / len(votes)
    return majority_share <= 0.5

In [76]:
# Tally, per label type, the annotations whose votes have no strict majority.
# The tallies are cleared first so that re-running this cell starts from zero
# instead of adding to the counts left by a previous run.
for tally in even_vote_counts.values():
    tally["count"] = 0
    tally["cases"] = []

for a in total_annotation:
    human_a = a["answer"]["human"]
    motives = human_a["motives"]
    # "dialogue act" and "target speaker(s)" are read from the human answer
    # itself; the three motive labels are read from its "motives" entry.
    label_sources = (
        ("dialogue act", human_a),
        ("target speaker(s)", human_a),
        ("informational motive", motives),
        ("social motive", motives),
        ("coordinative motive", motives),
    )
    for label_name, source in label_sources:
        if is_vote_even(source[label_name]["vote"]):
            even_vote_counts[label_name]["count"] += 1
            even_vote_counts[label_name]["cases"].append(a)
    

In [79]:
# For each label type, print its name, the number of annotations without a
# majority vote, and that number as a fraction of all pooled annotations.
annotation_total = len(total_annotation)
for label_name, tally in even_vote_counts.items():
    without_majority = tally["count"]
    print(f"{label_name}, {without_majority}, {without_majority / annotation_total}")

informational motive, 422, 0.09330090647800132
social motive, 226, 0.049966836170683177
coordinative motive, 548, 0.12115852310413443
dialogue act, 881, 0.19478222418748617
target speaker(s), 489, 0.10811408357284988


In [87]:
# Frequency of each human "dialogue act" label across the pooled annotations.
das = [
    annotation["answer"]["human"]["dialogue act"]["label"]
    for annotation in total_annotation
]
das_counter = Counter(das)
das_counter

Counter({5: 712, 4: 1451, 3: 400, 0: 1107, 1: 171, 2: 682})

In [89]:
# Frequency of each human "informational motive" label across the pooled annotations.
ims = [
    annotation["answer"]["human"]["motives"]["informational motive"]["label"]
    for annotation in total_annotation
]
ims_counter = Counter(ims)
ims_counter

Counter({0.0: 2809, 1.0: 1714})

In [90]:
# Frequency of each human "social motive" label across the pooled annotations.
sms = [
    annotation["answer"]["human"]["motives"]["social motive"]["label"]
    for annotation in total_annotation
]
sms_counter = Counter(sms)
sms_counter

Counter({1.0: 517, 0.0: 4006})

In [91]:
# Frequency of each human "coordinative motive" label across the pooled annotations.
cms = [
    annotation["answer"]["human"]["motives"]["coordinative motive"]["label"]
    for annotation in total_annotation
]
cms_counter = Counter(cms)
cms_counter

Counter({0.0: 2302, 1.0: 2221})

In [None]:
# Frequency of each human "target speaker(s)" label across the pooled annotations.
tss = [
    annotation["answer"]["human"]["target speaker(s)"]["label"]
    for annotation in total_annotation
]
tss_counter = Counter(tss)
tss_counter

In [101]:
# Dev split in spreadsheet form; the sheet's first column becomes the index.
valid_dev = pd.read_excel(
    DATA_FOLDER + "/dev_valid_agg.xlsx",
    index_col=0,
)

In [117]:
# Show how often each value occurs in the "dialogue act gpt" column of the dev sheet.
valid_dev["dialogue act gpt"].value_counts()

2    609
0    377
5    259
4    185
1    184
3     95
Name: dialogue act gpt, dtype: int64

In [129]:
# Keep the rows where the "dialogue act gpt" column is 2 while the
# "dialogue act" column holds a different value.
gpt_is_2 = valid_dev["dialogue act gpt"] == 2
labels_differ = valid_dev["dialogue act"] != valid_dev["dialogue act gpt"]
df = valid_dev[labels_differ & gpt_is_2]


In [130]:
# Show how often each "dialogue act" value occurs among the rows kept in `df`.
df["dialogue act"].value_counts()

4    190
0     76
5     59
3      8
1      3
Name: dialogue act, dtype: int64

In [133]:
# Narrow `df` to the rows whose "dialogue act" value is 4, keeping only the
# columns needed to inspect each case.
inspect_columns = [
    "id",
    "target",
    "dialogue act",
    "dialogue act vote",
    "dialogue act gpt",
    "gpt promp",
    "gpt reason",
]
act_is_4 = df[df["dialogue act"] == 4]
tdf = act_is_4[inspect_columns]

In [None]:
# Print each selected row as a separator line followed by its id, target and
# vote breakdown, one value per line.
for _, row in tdf.iterrows():
    print(
        "................................",
        row.id,
        row.target,
        row["dialogue act vote"],
        sep="\n",
    )

In [144]:
# Index the training records by their "id" field; if an id repeats, the later
# record replaces the earlier one.
train_data_dic = {
    record["id"]: record
    for record in train_data
}

In [None]:
# Collect the raw training spreadsheets under RAW_TRAIN_DATA.
# The pattern previously ended in ".xlsl" (a typo for ".xlsx"), which made
# glob return an empty list without raising any error. The result is sorted
# because glob does not guarantee an order.
raw_train_files = sorted(glob.glob(RAW_TRAIN_DATA + "*.xlsx"))