In [None]:
import collections
import glob
import json
import pandas as pd

dataset_dir = "../data_prep/final_dataset/"

SUBSETS = "train dev test".split()

# Maps subset name -> list of parsed JSON objects, one per file found in
# that subset's directory. A subset only gets a key once a file is loaded.
datasets = collections.defaultdict(list)

for subset in SUBSETS:
    # glob returns paths in arbitrary (filesystem-dependent) order; sorting
    # makes the per-subset lists identical across runs and machines.
    for filename in sorted(glob.glob(dataset_dir + subset + "/*")):
        # JSON text is UTF-8; do not rely on the platform's locale default.
        with open(filename, 'r', encoding="utf-8") as f:
            datasets[subset].append(json.load(f))

# Overall statistics of the dataset

In [None]:
def count_dataset(pairs, subset):
    """Summarize one subset of the dataset as a dict of counts.

    Args:
        pairs: Sized iterable of dicts. Each dict is read at
            ``["metadata"]["forum_id"]``, ``["metadata"]["annotator"]``,
            ``["review_sentences"]`` and ``["rebuttal_sentences"]``.
        subset: Label stored unchanged under the ``"subset"`` key.

    Returns:
        A dict with keys ``subset``, ``pairs`` (number of pairs), ``forums``
        (number of distinct forum ids), ``adjudicated`` (number of pairs whose
        annotator is ``"anno0"``), ``review_sentences`` and
        ``rebuttal_sentences`` (total sentence counts over all pairs).
    """
    # TODO: Add double-annotated and adjudicated
    return {
        "subset": subset,
        "pairs": len(pairs),
        "forums": len({pair["metadata"]["forum_id"] for pair in pairs}),
        "adjudicated": sum(
            1 for pair in pairs if pair["metadata"]["annotator"] == "anno0"
        ),
        # Add up per-pair lengths directly rather than concatenating every
        # sentence list into one big list just to measure it.
        "review_sentences": sum(len(pair["review_sentences"]) for pair in pairs),
        "rebuttal_sentences": sum(len(pair["rebuttal_sentences"]) for pair in pairs),
    }
# Distribution of examples over sets
# One summary dict per loaded subset; transposed so each subset is a column.
df_dicts = [count_dataset(datasets[name], name) for name in datasets]
pd.DataFrame(df_dicts).T

# Distribution over types

In [None]:
# def review_getter(obj):
#     my_counter = collections.Counter()
#     for sent in obj["reviewlabels"]:
#         my_counter[sent["labels"]["coarse"]] += 1
#         my_counter[sent["labels"]["fine"]] += 1
#         if sent["labels"]["asp"]:
#             my_counter[sent["labels"]["asp"]] += 1
#     return my_counter

    

In [None]:
# JSON text is UTF-8; pass the encoding explicitly instead of relying on the
# platform's locale default.
with open('../final_data_dump/orda_annotations_0516.json', 'r', encoding='utf-8') as f:
    p = json.load(f)

# Distinct review ids having at least one sentence annotation whose
# "initials" field is "TJO".
review_set = {
    annotation["fields"]["review_id"]
    for annotation in p["reviewsentenceannotation"]
    if annotation["fields"]["initials"] == "TJO"
}

In [None]:
# Number of distinct review ids collected in review_set.
len(review_set)

In [None]:
# subset name -> forum id -> set of review ids present for that forum.
forum_map = collections.defaultdict(lambda: collections.defaultdict(set))

for subset, pairs in datasets.items():
    for pair in pairs:
        metadata = pair["metadata"]
        reviews_in_forum = forum_map[subset][metadata["forum_id"]]
        reviews_in_forum.add(metadata["review_id"])

In [None]:
import openreview
# Client constructed with only a base URL; no credentials are passed here.
guest_client = openreview.Client(baseurl='https://api.openreview.net')
def get_total_reviews(forum_id):
    """Return the set of ids of reviewer notes posted directly on a forum.

    Fetches the forum's notes through ``guest_client`` and keeps those that
    reply to ``forum_id`` itself and whose first signature contains
    ``"AnonReviewer"``.
    """
    review_ids = set()
    for note in guest_client.get_notes(forum=forum_id):
        # Only direct replies to the forum root are candidates.
        if note.replyto != forum_id:
            continue
        if "AnonReviewer" in note.signatures[0]:
            review_ids.add(note.id)
    return review_ids


# For every train forum, print each reviewer note id that the forum has
# upstream but that is absent from the loaded dataset.
for forum_id, review_ids in forum_map["train"].items():
    for missing_id in get_total_reviews(forum_id) - review_ids:
        print(missing_id, "MAD")