All moved to [notebook](https://github.com/nnkennard/lab-notebook/blob/master/08/2021-08-14.md)

In [1]:
import collections
import itertools
import glob
import json
import pandas as pd
import seaborn as sns

In [2]:
def get_annotator(annotation):
    return annotation["metadata"]["anno"]

def get_rebuttal_alignments(annotation):
    return [set(x["labels"]["alignments"]) for x in annotation["rebuttallabels"]]

dataset_path = "/Users/nnayak/Downloads/05-16-provisional-emnlp-release-unsplit/"

results = collections.defaultdict(list)
adjudicated_results = collections.defaultdict(list)
all_results = collections.defaultdict(list)



for filename in glob.glob(dataset_path + "/*"):
    with open(filename, 'r') as f:
        obj = json.load(f)
    review = obj["metadata"]["review"]
    if get_annotator(obj) == 'anno0':
        adjudicated_results[review].append(obj)
    else:
        results[review].append(obj)
    all_results[review].append(obj)


In [3]:
matches = []
partial_matches = []
js_distances = []

class OverlapType(object):
    BOTH_NONE = "Agree none"
    ONE_NONE = "Disagree none"
    PARTIAL = "Partial match"
    NO_OVERLAP = "No overlap"
    EXACT = "Exact match"
    
    ALL = [
        EXACT,
    BOTH_NONE,
    PARTIAL,
    ONE_NONE,
    NO_OVERLAP,
    ]

def js_distance(set1, set2):
    return len(set1.intersection(set2))/len(set1.union(set2))

overlap_counter = collections.Counter()
exact_counter = collections.Counter()

for review, annotations in results.items():
    if len(annotations) == 1:
        continue
    else:
        annotators = [get_annotator(x) for x in annotations]
        for annotation_1, annotation_2 in itertools.combinations(annotations, 2):
            for ali_1, ali_2 in zip(get_rebuttal_alignments(annotation_1), get_rebuttal_alignments(annotation_2)):
                if not (ali_1.union(ali_2)):
                    overlap_counter[OverlapType.BOTH_NONE] += 1
                elif not ali_1 or not ali_2:
                    overlap_counter[OverlapType.ONE_NONE] += 1
                else:
                    if ali_1 == ali_2:
                        overlap_counter[OverlapType.EXACT] += 1
                        exact_counter[len(ali_1)] += 1
                    elif not (ali_1.intersection(ali_2)):
                        overlap_counter[OverlapType.NO_OVERLAP] += 1
                    else:
                        overlap_counter[OverlapType.PARTIAL] += 1
                        js_distances.append(js_distance(ali_1, ali_2))

barplot_dicts = []
for k, v in overlap_counter.items():
    barplot_dicts.append({
        "Overlap type": k,
        "Count": v
    })
overlap_type_df = pd.DataFrame.from_dict(barplot_dicts)

print(overlap_type_df)

#overlap_type_df
sns.barplot(data=overlap_type_df, x = "Overlap type", y="Count")


sns.histplot(js_distances)

Empty DataFrame
Columns: []
Index: []


In [None]:
class ContextType(object):
    NO_CONTEXT = "No context"
    GLOBAL_CONTEXT = "Global context"
    SINGLE_SENTENCE = "Single sentence"
    CONTIGUOUS_SENTENCES = "Contiguous sentences"
    NONCONTIGUOUS_SENTENCES = "Non-contiguous sentences"
    MYSTERIOUS = "Mysterious"
    
def is_contiguous(alignments):
    relevant_range = list(range(min(alignments), max(alignments) + 1))
    return relevant_range == list(sorted(alignments))

for review, annotations in adjudicated_results.items():
    for annotation in annotations:
        if not get_annotator(annotation) == "anno0":
            continue
        for label in annotation["rebuttallabels"]:
            alignment = label["labels"]["alignments"]
            if not alignment:
                context_counter[ContextType.MYSTERIOUS] += 1
            elif len(alignment) == 1:
                context_counter[ContextType.SINGLE_SENTENCE] += 1
            else:
                if is_contiguous(alignment):
                    context_counter[ContextType.CONTIGUOUS_SENTENCES] += 1
                else:
                    context_counter[ContextType.NONCONTIGUOUS_SENTENCES] += 1


In [None]:
# Count double annotations
adjudicated_test_dir = "/Users/nnayak/Downloads/0517_split_2/test/"

annotators = collections.Counter()
for filename in glob.glob(adjudicated_test_dir+"/*"):
    annotator = filename.split(".")[-2]
    annotators[annotator] += 1

#print(annotators, sum(annotators.values()))

In [None]:
# j = collections.defaultdict(lambda:collections.Counter())

# for example in sum(all_results.values(), []):
#     review_coarse_labels = [label["labels"]["coarse"] for label in example["reviewlabels"]]
#     for rebuttal_label in example["rebuttallabels"]:
#         coarse = rebuttal_label["labels"]["responsetype"]
#         for aligned_idx in rebuttal_label["labels"]["alignments"]:
#             j[coarse][review_coarse_labels[aligned_idx]] += 1
            
# print(j)

In [None]:
# import matplotlib.pyplot as plt

# fig, axes = plt.subplots(nrows=22, ncols=1, figsize=(10,50))

# review_types = "Request Evaluative Fact Structuring Other".split()

# for i, key in enumerate(sorted(j.keys())):

#     vals = j[key]
# #     print(vals)
# #     print(i, key)
#     axes[i].bar(review_types, [vals[i] for i in review_types])
#     axes[i].set_ylabel(key)


In [None]:
# for example in sum(all_results.values(), []):
#     for i, rebuttal_label in enumerate(example["rebuttallabels"]):
#         if rebuttal_label["labels"]["responsetype"] == "followup":
#             print(example["rebuttal"][i]['sentence'])

In [None]:
# import csv

# combis = collections.defaultdict(lambda:collections.Counter())


# agrees = collections.defaultdict(lambda:collections.Counter())
# disagrees = collections.defaultdict(lambda:collections.Counter())

# total_sentence_count = collections.Counter()

# FIELDS = "annotators label_type label_1 label_2 review_id sent_index sentence".split()
# DisagreementSentence = collections.namedtuple("DisagreementSentence", FIELDS)

# with open("disagree_sentences.tsv", 'w') as f:
#     writer = csv.DictWriter(f, fieldnames=FIELDS, delimiter="\t")
#     writer.writeheader()

#     for rev_id, annotations in all_results.items():
#         if len(annotations) == 1:
#             anno = annotations[0]["metadata"]["anno"]
#             total_sentence_count[anno] += len(annotations[0]["review"])
#         else:
#             for ann_1, ann_2 in itertools.combinations(annotations, 2):
#                 annos = sorted([ann_1["metadata"]["anno"], ann_2["metadata"]["anno"]], key=lambda x:int(x[4:]))
#                 rev_1 = [i["labels"] for i in ann_1["reviewlabels"]]
#                 rev_2 = [i["labels"] for i in ann_2["reviewlabels"]]
#                 sentences = [i["sentence"] for i in ann_1["review"]]
#                 assert len(rev_1) == len(rev_2)
#                 for i, (labels_1, labels_2, sent) in enumerate(zip(rev_1, rev_2, sentences)):
#                     for label_type in "coarse fine asp pol".split():
#                         labels = sorted(
#                                 [labels_1[label_type], labels_2[label_type]])
#                         combis[label_type][tuple(labels)] += 1
#                         if len(set(labels)) > 1:
#                             disagrees[annos[0]][annos[1]]  += 1
#                             writer.writerow(
#                                 DisagreementSentence(
#                                     "_".join(annos), label_type, labels[0], labels[1], rev_id, i, sent
#                                 )._asdict()
#                             )
#                         else:
#                             assert len(set(labels)) == 1
#                             agrees[annos[0]][annos[1]]  += 1
                        
                        

In [None]:

# import numpy as np
# annotators = ["anno{0}".format(i) for i in range(17)]
# annotators= "anno0 anno2 anno3 anno10 anno13 anno14".split()

# disagrees_df = np.array([[disagrees[i][j]/(agrees[i][j] + disagrees[i][j] + 1) for j in annotators] for i in annotators])
# sns.heatmap(data=disagrees_df)


In [None]:
# agrees_total = 0
# for k, v in agrees.items():
#     agrees_total += sum(v.values())
# disagrees_total = 0
# for k, v in disagrees.items():
#     disagrees_total += sum(v.values())

# print(agrees_total, disagrees_total)

# for anno1, annos in agrees.items():
#     for anno2, count in annos.items():
#         print(anno1, anno2, count, disagrees[anno1][anno2])

# for k, v in total_sentence_count.items():
#     print(k, v)