In [1]:
import statistics
import sys

sys.path.append("../")
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn.metrics as sm
import src.data.prepare_data as prepdata
import src.features.aggregation_helper as ha
import src.features.annotation_aggregator as aa
from sklearn.model_selection import train_test_split
from src.features.annotation_aggregator import (
    DawidSkeneAggregator,
    MaceAggregator,
    MajorityVoteAggregator,
)
from tqdm.notebook import tqdm

## Wikipedia dataset

In [2]:
# Dataset identifier; later cells use it for the Wikipedia-specific text merge
# and as part of the output file names.
dataset_name = "Wikipedia"
# Load the class mapping, the gold labels, and the annotator labels (wide and long layout).
(
    class_mapping,
    gold_labels,
    annotator_labels_wide,
    annotator_labels_long,
) = prepdata.get_annotator_and_gold_labels_Wikipedia()
# Keys of the class mapping (presumably the class identifiers -- confirm in prepare_data).
label = class_mapping.keys()

Loaded data
----------------------------------
115864 comments
4053 annotators
2 classes: [0.0, 1.0]
----------------------------------
Running majority voting
Completed majority_voting in 0:00:01.374915 seconds



## Getting the bias matrices of the annotators

In [3]:
# Bias matrix of every annotator against the gold labels for the classes 0.0 and 1.0,
# computed with the helper's default `normalize` setting.
annotator_ids, annotator_bias_matrices = ha.get_bias_matrix_per_annotator(
    annotator_labels_wide, gold_labels, [0.0, 1.0]
)
# Same call with normalize=None (presumably raw counts instead of rates -- confirm in
# aggregation_helper). NOTE(review): the `_2` results are not referenced in the
# visible part of this notebook.
annotator_ids_2, annotator_bias_matrices_2 = ha.get_bias_matrix_per_annotator(
    annotator_labels_wide, gold_labels, [0.0, 1.0], normalize=None
)

In [4]:
# Flatten each annotator's bias matrix into a single row so the matrices fit into a dataframe
matrix_shape = annotator_bias_matrices.shape
vec_length = matrix_shape[1] * matrix_shape[2]
annotator_bias_matrices_flattend = annotator_bias_matrices.reshape(-1, vec_length)

# One row per annotator; `reset_index` moves the annotator ids out of the index and into
# a regular column (read as "index" by the grouping cell further down)
characteristic_columns = ["Reliability 1", "Pessimistic", "Optimistic", "Reliability 2"]
df_annotator_characteristics = pd.DataFrame(
    data=annotator_bias_matrices_flattend,
    index=annotator_ids,
    columns=characteristic_columns,
).reset_index()

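## Group the annotators
The grouping used in the paper is "Group 1", which is assigned by `group_by_linear_function`. "Group 2" uses a different grouping function (`group_by_qudrant`, quadrants around the median values).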

In [5]:
# Medians of the "Pessimistic" and "Optimistic" columns over all annotators;
# `group_by_qudrant` reads them as its thresholds
median_pessimistic, median_optimistic = (
    statistics.median(df_annotator_characteristics[column])
    for column in ("Pessimistic", "Optimistic")
)


def group_by_qudrant(p, o, median_p=None, median_o=None):
    """Assign an annotator to one of four quadrants around two thresholds.

    Args:
        p: The annotator's value from the "Pessimistic" column.
        o: The annotator's value from the "Optimistic" column.
        median_p: Threshold for ``p``. When omitted, the notebook-level
            ``median_pessimistic`` is used.
        median_o: Threshold for ``o``. When omitted, the notebook-level
            ``median_optimistic`` is used.

    Returns:
        0 if ``p`` and ``o`` are both below their thresholds,
        1 if only ``p`` is below its threshold,
        2 if only ``o`` is below its threshold,
        3 if both are at or above their thresholds,
        None if a value cannot be ordered against its threshold (e.g. NaN).
    """
    # Fall back to the notebook globals so existing two-argument calls keep working
    if median_p is None:
        median_p = median_pessimistic
    if median_o is None:
        median_o = median_optimistic

    if p < median_p:
        if o < median_o:
            return 0
        if o >= median_o:
            return 1
    elif p >= median_p:
        if o < median_o:
            return 2
        if o >= median_o:
            return 3
    # Only reached when one of the comparisons is undefined (NaN value or threshold)
    return None


def group_by_linear_function(p, o, fact=3):
    """Group an annotator by comparing ``p`` and ``o`` up to a constant factor.

    Args:
        p: The annotator's value from the "Pessimistic" column.
        o: The annotator's value from the "Optimistic" column.
        fact: Factor by which one value has to exceed the other to leave the
            middle group. Defaults to 3, the value that was hard-coded before.

    Returns:
        0 if ``p >= fact * o``,
        2 if that does not hold and ``o >= fact * p``,
        1 otherwise.

    Note:
        The first check wins on ties, so ``p == o == 0`` yields group 0.
    """
    if p >= fact * o:
        return 0
    if o >= fact * p:
        return 2
    return 1


# Add one group column per grouping function, evaluated row by row on the
# "Pessimistic" and "Optimistic" values of each annotator
for group_column_name, grouping_function in (
    ("Group 1", group_by_linear_function),
    ("Group 2", group_by_qudrant),
):
    df_annotator_characteristics[group_column_name] = df_annotator_characteristics.apply(
        lambda row: grouping_function(row["Pessimistic"], row["Optimistic"]),
        axis=1,
    )

## Calculate the number of documents annotated by all groups

In [7]:
group_type = "Group 1"
group_column = df_annotator_characteristics[group_type]

# One set of annotator ids per distinct group value; the values are kept in a parallel list
groups = []
group_labels = []
for group_value in set(group_column.to_list()):
    member_ids = df_annotator_characteristics.loc[group_column == group_value, "index"]
    groups.append(set(member_ids.to_list()))
    group_labels.append(group_value)

print("Number of comments in groups:")
min_threshold = 1
# Start with every comment id and keep only those annotated by each (large enough) group
overlapping_comments = set(annotator_labels_long["id"].to_list())
for group_value, annotator_group in zip(group_labels, groups):
    # Groups with fewer than `min_threshold` annotators do not restrict the overlap
    if len(annotator_group) < min_threshold:
        continue
    in_group = annotator_labels_long["annotator"].isin(annotator_group)
    group_comment_ids = set(annotator_labels_long.loc[in_group, "id"].to_list())
    overlapping_comments = overlapping_comments & group_comment_ids
    # Columns: group value, number of annotators, number of distinct comments
    print(group_value, "\t", len(annotator_group), "\t", len(group_comment_ids))
print("Overlapping documents:", len(overlapping_comments))

Number of comments in groups:
0 	 1312 	 93350
1 	 1033 	 112221
2 	 1708 	 115757
Overlapping documents: 90019


## Calculate the labels for each annotator group
The label of a group is based on the majority vote of the annotators that are in the group

In [11]:
def get_gold_labels(text_ids, annotator_ids, df):
    """Aggregate the annotations of selected annotators by majority vote.

    Args:
        text_ids: Ids of the texts to keep (matched against the "id" column).
        annotator_ids: Ids of the annotators to keep (matched against the
            "annotator" column).
        df: Annotations with at least the columns "id" and "annotator"; the
            notebook passes ``annotator_labels_long``.

    Returns:
        The result of ``MajorityVoteAggregator.get_aggregation(kind="df",
        return_text=False)`` for the selected annotations, which are handed
        to the aggregator sorted by "id".
    """
    keep = df["id"].isin(text_ids) & df["annotator"].isin(annotator_ids)
    selected = df.loc[keep].sort_values(by=["id"])
    aggregator = MajorityVoteAggregator()
    aggregator.aggregate(selected)
    return aggregator.get_aggregation(kind="df", return_text=False)

In [12]:
# Gold labels of the comments annotated by every group, ordered by comment id
is_overlapping = gold_labels["id"].isin(overlapping_comments)
biased_annotations = gold_labels[is_overlapping].sort_values(by=["id"])
biased_annotations = biased_annotations.astype({"label": int})

if dataset_name == "Wikipedia":
    # Attach the comment text, taken from the first annotation row of each comment
    comment_texts = annotator_labels_long.drop_duplicates(subset=["id"])[["id", "text"]]
    biased_annotations = biased_annotations.merge(comment_texts, on="id", how="left")

for group_label, group in zip(group_labels, groups):
    group_column = f"label_{group_label}"
    group_gold = get_gold_labels(overlapping_comments, group, annotator_labels_long)
    # NOTE(review): the labels are assigned by position. This assumes the aggregation
    # returns one row per overlapping comment in ascending id order (get_gold_labels
    # sorts its input by id) -- TODO confirm the aggregator keeps that order.
    biased_annotations[group_column] = group_gold["label"].to_list()
    # Store the group labels as integers, like the gold label column
    biased_annotations = biased_annotations.astype({group_column: int})

biased_annotations.head()

----------------------------------
90019 comments
1312 annotators
2 classes: [0.0, 1.0]
----------------------------------
Running majority voting
Completed majority_voting in 0:00:00.827969 seconds

----------------------------------
90019 comments
1033 annotators
2 classes: [0.0, 1.0]
----------------------------------
Running majority voting
Completed majority_voting in 0:00:00.839944 seconds

----------------------------------
90019 comments
1708 annotators
2 classes: [0.0, 1.0]
----------------------------------
Running majority voting
Completed majority_voting in 0:00:00.908655 seconds



Unnamed: 0,id,label,text,label_0,label_1,label_2
0,37675,0,`- This is not ``creative``. Those are the di...,0,0,0
1,44816,0,` the term ``standard model`` is itself less...,0,0,0
2,89320,0,"Next, maybe you could work on being less cond...",0,1,0
3,93890,0,This page will need disambiguation.,0,0,0
4,102817,0,- Important note for all sysops There is a b...,0,0,0


## Calculate the agreement between the labels of the different groups (macro-averaged F1 score)

In [10]:
# Columns to compare: the overall gold label plus one column per annotator group.
# They are derived from `group_labels` so that every group is covered, whichever
# grouping is selected in `group_type` (the quadrant grouping can yield four groups).
labels = ["label"] + [f"label_{group_label}" for group_label in group_labels]
label_values = {name: biased_annotations[name].to_list() for name in labels}

# Pairwise macro-averaged F1 score; the row label is passed as first argument (y_true)
# and the column label as second argument (y_pred)
results = [
    [
        sm.f1_score(label_values[lab_0], label_values[lab_1], average="macro")
        for lab_1 in labels
    ]
    for lab_0 in labels
]

pd.DataFrame(data=results, index=labels, columns=labels)

Unnamed: 0,label,label_0,label_1,label_2
label,1.0,0.740325,0.857091,0.882751
label_0,0.740325,1.0,0.759019,0.65411
label_1,0.857091,0.759019,1.0,0.760798
label_2,0.882751,0.65411,0.760798,1.0


## Split the data into train, validation, and test sets and store them

In [24]:
# Seed shared by both splits so that the stored datasets are reproducible
split_seed = 5

# Hold out 20% of the comments as test set
df_train_validation, df_test = train_test_split(
    biased_annotations, test_size=0.2, shuffle=True, random_state=split_seed
)
# Take a validation set with as many rows as the test set out of the remainder.
# This split was unseeded before, so train/validation differed on every run.
df_train, df_validation = train_test_split(
    df_train_validation, test_size=len(df_test), random_state=split_seed
)

# File name prefix, e.g. "Wikipedia_Group_1"
path_name = f"{dataset_name}_{group_type}".replace(" ", "_")
for split_name, df_split in (
    ("train", df_train),
    ("test", df_test),
    ("validation", df_validation),
):
    df_split.to_pickle(f"../tmp/{path_name}_bias_{split_name}.pkl")