In [None]:
from transformers import AutoTokenizer, AutoModel, AutoModelForQuestionAnswering, pipeline
import torch
import json
import pandas as pd
from tqdm import tqdm
from collections import Counter

# Vulnerable Identities Recognition Corpus (VIRC)

## Utils

This section contains all the helper functions used with the datasets. It must be run before any other section of this notebook.

In [None]:

def full_agreement_per_sentence(disaggregated_annotations_aux, gold_annotator_id_aux, predictions_annotator_id_aux):
    """
    Count the sentences on which the two annotators agree for every label.

    The annotations are grouped by ``text_id``. For each group the per-label F1
    scores are obtained through ``recall``, ``precision`` and ``calculate_f1``.
    A sentence lands in the ``correct`` bucket unless one of its labels has an
    F1 score lower than or equal to 0.

    Parameters:
    disaggregated_annotations_aux (pandas.DataFrame): Annotations of both annotators.
    gold_annotator_id_aux: Id of the annotator used as reference.
    predictions_annotator_id_aux: Id of the annotator compared against the reference.

    Returns:
    dict: ``{'correct': int, 'incorrect': int}`` with the number of sentences per bucket.
    """
    agreed_sentences, disagreed_sentences = [], []

    for sentence_key, sentence_rows in disaggregated_annotations_aux.groupby(['text_id']):
        recall_rows, _ = recall(sentence_rows, gold_annotator_id_aux, predictions_annotator_id_aux)
        precision_rows, _ = precision(sentence_rows, gold_annotator_id_aux, predictions_annotator_id_aux)
        label_scores = calculate_f1(recall_rows, precision_rows)

        # One label without a positive F1 is enough to reject the whole sentence
        has_failed_label = any(score["f1_score"] <= 0 for score in label_scores)
        if has_failed_label:
            disagreed_sentences.append(sentence_key)
        else:
            agreed_sentences.append(sentence_key)

    # Only the number of sentences in each bucket is reported
    return {
        'correct': len(agreed_sentences),
        'incorrect': len(disagreed_sentences),
    }

def full_agreement_per_annotation(disaggregated_annotations_aux, gold_annotator_id_aux, predictions_annotator_id_aux):
    """
    Count, for every label, the sentences with and without agreement on that label.

    The annotations are grouped by ``(text_id, label)`` and scored with
    ``recall``, ``precision`` and ``calculate_f1``. A sentence is counted as
    ``correct`` for a label when the F1 score is above 0 and as ``incorrect``
    when it is lower than or equal to 0. Each sentence is counted once per bucket.

    Parameters:
    disaggregated_annotations_aux (pandas.DataFrame): Annotations of both annotators.
    gold_annotator_id_aux: Id of the annotator used as reference.
    predictions_annotator_id_aux: Id of the annotator compared against the reference.

    Returns:
    pandas.DataFrame: One row per label (sorted index) with the columns ``correct`` and ``incorrect``.
    """
    sentences_per_label = {}

    for (text_id, _group_label), rows in disaggregated_annotations_aux.groupby(['text_id', 'label']):
        recall_rows, _ = recall(rows, gold_annotator_id_aux, predictions_annotator_id_aux)
        precision_rows, _ = precision(rows, gold_annotator_id_aux, predictions_annotator_id_aux)

        for score in calculate_f1(recall_rows, precision_rows):
            buckets = sentences_per_label.setdefault(score["label"], {"correct_aux": [], "incorrect_aux": []})

            if score["f1_score"] > 0:
                bucket = buckets["correct_aux"]
            elif score["f1_score"] <= 0:
                bucket = buckets["incorrect_aux"]
            else:
                continue

            # Register every sentence only once per bucket
            if text_id not in bucket:
                bucket.append(text_id)

    # Replace the sentence lists by their sizes
    counts_per_label = {}
    for label, buckets in sentences_per_label.items():
        counts_per_label[label] = {
            'correct': len(buckets['correct_aux']),
            'incorrect': len(buckets['incorrect_aux']),
        }

    return pd.DataFrame.from_dict(counts_per_label, orient="index").sort_index()

def f_scores_per_label(disaggregated_annotations_aux, gold_annotator_id_aux, predictions_annotator_id_aux):
    """
    Build the per-label agreement table between two annotators.

    Parameters:
    disaggregated_annotations_aux (pandas.DataFrame): Annotations of both annotators.
    gold_annotator_id_aux: Id of the annotator used as reference.
    predictions_annotator_id_aux: Id of the annotator compared against the reference.

    Returns:
    pandas.DataFrame: One row per label, sorted by label, with the columns
    ``label``, ``f1_score``, ``recall``, ``precision`` and ``n_annotations``
    (annotations of that label made by the two annotators together).
    """
    recall_rows, _ = recall(disaggregated_annotations_aux, gold_annotator_id_aux, predictions_annotator_id_aux)
    precision_rows, _ = precision(disaggregated_annotations_aux, gold_annotator_id_aux, predictions_annotator_id_aux)
    label_scores = calculate_f1(recall_rows, precision_rows)

    annotations = disaggregated_annotations_aux
    for score in label_scores:
        has_label = annotations["label"] == score["label"]
        from_gold = has_label & (annotations["annotator_id"] == gold_annotator_id_aux)
        from_predictions = has_label & (annotations["annotator_id"] == predictions_annotator_id_aux)
        score["n_annotations"] = len(annotations[from_gold]) + len(annotations[from_predictions])

    scores_table = pd.DataFrame(label_scores)
    return scores_table.sort_values('label', ascending=True, ignore_index=True)

def extract_annotations(path): # Method to get formatted annotations from a json path
    """
    Read an annotation export and flatten it into one record per annotation.

    Parameters:
    path (str): Path of the JSON file. It is read as UTF-8 so that accented
        characters do not depend on the platform default encoding.

    Returns:
    list: One dict per annotation with the keys text_id, text, annotator_id,
    purpose, label, comment, span, start_idx, end_idx and text_external_id.
    ``label`` is only filled when the purpose is 'tagging' and ``comment`` only
    when it is 'commenting'; otherwise they are None.
    """
    with open(path, encoding="utf-8") as f:
        jsn = json.load(f)

    all_annotations = list()
    for item in jsn['tasks']:
        for el in item['task']['text_lines']:
            if 'annotations' not in el:
                continue # Text lines without annotations produce no record

            text = el['data']['text']
            _id = el['text_id']
            for annotations in el['annotations']:
                for annotation in annotations:
                    value = annotation['value']
                    purpose = value['purpose']
                    interval = value['interval']
                    all_annotations.append({
                        'text_id': _id,
                        'text': text,
                        'annotator_id': annotation['annotator_id'],
                        'purpose': purpose,
                        # The same 'text' field holds the label or the comment depending on the purpose
                        'label': value['text'] if purpose == 'tagging' else None,
                        'comment': value['text'] if purpose == 'commenting' else None,
                        'span': interval['exact_highlight'],
                        'start_idx': interval['start_at'],
                        'end_idx': interval['end_at'],
                        'text_external_id': el['text_external_id'],
                    })
    return all_annotations

def spans_fully_contain(span1, span2):
    """
    Tell whether one of the two spans lies completely inside the other.

    Parameters:
    span1 (tuple): (start, end) of the first span.
    span2 (tuple): (start, end) of the second span.

    Returns:
    bool: True when span1 contains span2 or span2 contains span1 (bounds included), False otherwise.
    """
    (first_start, first_end), (second_start, second_end) = span1, span2
    first_contains_second = first_start <= second_start and first_end >= second_end
    second_contains_first = second_start <= first_start and second_end >= first_end
    return first_contains_second or second_contains_first

def spans_overlap(span1, span2):
    """
    Tell whether two spans share at least one position.

    Both bounds are treated as inclusive, so spans that only touch
    (the end of one equals the start of the other) count as overlapping.

    Parameters:
    span1 (tuple): (start, end) of the first span.
    span2 (tuple): (start, end) of the second span.

    Returns:
    bool: True when the spans overlap, False otherwise.
    """
    (first_start, first_end), (second_start, second_end) = span1, span2
    latest_start = max(first_start, second_start)
    earliest_end = min(first_end, second_end)
    return latest_start <= earliest_end


def recall(df, gold_annotator_id, predictions_annotator_id, generate_file=False):
    """
    Compute, per label, the recall of the predictions annotator against the gold annotator.

    Every gold annotation is compared with the predictions of the same sentence
    (``text_id``) and label. It scores 1 when one span fully contains the other,
    0.5 when the spans only overlap and 0 when no prediction overlaps it. The
    best score found is kept: a full match can no longer be downgraded by a
    later partial overlap.

    Parameters:
    df (pandas.DataFrame): Annotations with the columns label, text_id, annotator_id,
        text_external_id, start_idx, end_idx and span. Rows without label are ignored.
    gold_annotator_id: Id of the annotator used as reference.
    predictions_annotator_id: Id of the annotator being evaluated.
    generate_file (bool): When True, the gold annotations that scored 0 are listed in the report.

    Returns:
    tuple: ``(results, report)``. ``results`` has one dict per label with the keys
    label, annotator_1, annotator_2, recall and support. ``report`` has one dict
    (text_id, label, text_external_id) per unmatched gold annotation, and is
    empty unless ``generate_file`` is True.
    """
    df = df[df.label.notna()]
    a_id = gold_annotator_id
    b_id = predictions_annotator_id
    results = list()
    report = list()

    for label in df.label.drop_duplicates().values:
        match_scores = list()
        tmp = df[df.label==label]
        # One row per distinct (sentence, annotator, span position)
        tmp = tmp.groupby(['text_id','annotator_id', 'text_external_id', 'start_idx', 'end_idx']).span.apply(list).reset_index()

        for _, a_annotation in tmp[tmp.annotator_id==a_id].iterrows():
            mtc = 0 # If mtc stays 0 it is a false negative
            a_span = (a_annotation.start_idx, a_annotation.end_idx)

            # Predictions made for the same sentence
            b_annotations = tmp[(tmp.annotator_id==b_id)&(tmp.text_id==a_annotation.text_id)]
            for _, b_annotation in b_annotations.iterrows():
                b_span = (b_annotation.start_idx, b_annotation.end_idx)
                if spans_fully_contain(a_span, b_span):
                    mtc = 1 # True positive, nothing can improve on it
                    break
                elif spans_overlap(a_span, b_span):
                    mtc = 0.5 # Partial true positive, keep looking for a full match

            if generate_file is True and mtc==0:
                report.append({'text_id':a_annotation.text_id,'label':label, 'text_external_id': a_annotation.text_external_id})

            match_scores.append(mtc)

        match_counts = Counter(match_scores)
        # NOTE(review): partial matches (0.5) enter neither the ratio nor the support — confirm this is intended
        support = match_counts[1] + match_counts[0]
        if match_counts[1] > 0:
            results.append({'label':label,'annotator_1':a_id,'annotator_2':b_id,'recall':match_counts[1]/support,'support':support})
        else:
            results.append({'label':label,'annotator_1':a_id,'annotator_2':b_id,'recall':0,'support':support})

    return results,report

def precision(df, gold_annotator_id, predictions_annotator_id, generate_file=False):
    """
    Compute, per label, the precision of the predictions annotator against the gold annotator.

    Every prediction is compared with the gold annotations of the same sentence
    (``text_id``) and label. It scores 1 when one span fully contains the other,
    0.5 when the spans only overlap and 0 when no gold annotation overlaps it.
    The best score found is kept: a full match can no longer be downgraded by a
    later partial overlap.

    Parameters:
    df (pandas.DataFrame): Annotations with the columns label, text_id, annotator_id,
        text_external_id, start_idx, end_idx and span. Rows without label are ignored.
    gold_annotator_id: Id of the annotator used as reference.
    predictions_annotator_id: Id of the annotator being evaluated.
    generate_file (bool): When True, the predictions that scored 0 are listed in the report.

    Returns:
    tuple: ``(results, report)``. ``results`` has one dict per label with the keys
    label, annotator_1, annotator_2, precision and support. ``report`` has one
    dict (text_id, label, text_external_id) per unmatched prediction, and is
    empty unless ``generate_file`` is True.
    """
    df = df[df.label.notna()]
    a_id = gold_annotator_id
    b_id = predictions_annotator_id
    results = list()
    report = list()

    for label in df.label.drop_duplicates().values:
        match_scores = list()
        tmp = df[df.label==label]
        # One row per distinct (sentence, annotator, span position)
        tmp = tmp.groupby(['text_id','annotator_id','text_external_id', 'start_idx', 'end_idx']).span.apply(list).reset_index()

        for _, b_annotation in tmp[tmp.annotator_id==b_id].iterrows():
            mtc = 0 # If mtc stays 0 it is a false positive
            b_span = (b_annotation.start_idx, b_annotation.end_idx)

            # Gold annotations made for the same sentence
            a_annotations = tmp[(tmp.annotator_id==a_id)&(tmp.text_id==b_annotation.text_id)]
            for _, a_annotation in a_annotations.iterrows():
                a_span = (a_annotation.start_idx, a_annotation.end_idx)
                if spans_fully_contain(a_span, b_span):
                    mtc = 1 # True positive, nothing can improve on it
                    break
                elif spans_overlap(a_span, b_span):
                    mtc = 0.5 # Partial true positive, keep looking for a full match

            if generate_file is True and mtc==0:
                # The unmatched item is the prediction itself; the gold loop variable may
                # not exist here (no gold annotation in the sentence) or belong to another sentence
                report.append({'text_id':b_annotation.text_id,'label':label, 'text_external_id': b_annotation.text_external_id})

            match_scores.append(mtc)

        match_counts = Counter(match_scores)
        # NOTE(review): partial matches (0.5) enter neither the ratio nor the support — confirm this is intended
        support = match_counts[1] + match_counts[0]
        if match_counts[1] > 0:
            results.append({'label':label,'annotator_1':a_id,'annotator_2':b_id,'precision':match_counts[1]/support,'support':support})
        else:
            results.append({'label':label,'annotator_1':a_id,'annotator_2':b_id,'precision':0,'support':support})
    return results, report

def calculate_f1(agreement_recall, agreement_precision):
    """
    Combine the per-label recall and precision entries into F1 entries.

    The two lists are paired by position, so both must list the labels in the
    same order; the label of each result is taken from the recall entry.

    Parameters:
    agreement_recall (list): Dicts with at least the keys ``label`` and ``recall``.
    agreement_precision (list): Dicts with at least the key ``precision``.

    Returns:
    list: One dict per pair with the keys ``label``, ``f1_score``, ``recall`` and ``precision``.
    """
    return [
        {
            'label': recall_entry['label'],
            'f1_score': f1(precision_entry['precision'], recall_entry['recall']),
            'recall': recall_entry['recall'],
            'precision': precision_entry['precision'],
        }
        for recall_entry, precision_entry in zip(agreement_recall, agreement_precision)
    ]

def f1(precision, recall):
    """
    Harmonic mean of precision and recall.

    Returns 0.0 when precision and recall add up to 0, which avoids the division by zero.
    """
    total = precision + recall
    if total == 0:
        return 0.0
    return 2 * (precision * recall) / total


def majority_vote(annotations, final_annotator_id=None):
    """
    Returns the annotations whose span received majority agreement among the annotators.

    Every annotation gets one vote for itself plus one vote for (almost) every
    other annotation with the same label whose span overlaps it. Votes are
    accumulated per (span, label) key, so identical annotations made by
    different annotators share one counter. A key is accepted when its votes
    exceed half the number of annotators. Accepted spans of the same label that
    overlap each other are merged into a single entry, keeping the longest span.

    Parameters:
    - annotations: List of lists, one inner list per annotator. Each inner list contains
                   the annotation dicts of that annotator; the keys "start_idx", "end_idx"
                   and "label" are read.

    - final_annotator_id: Id of the resulting annotation annotator. Default None.
                   When given, it is written into the "annotator_id" key of the selected
                   annotation dicts, which are the same objects received in `annotations`
                   (the input is modified in place).
    Returns:
    - List of dicts: The annotation dicts that received majority agreement. Empty if none did.
    """
    # Create a dictionary to hold the counts of votes for each (span, label) key
    span_votes = {}

    # Maps each (span, label) key to the last annotation dict seen with that key
    all_data= {}
    strict_majority = True # To only take into account labels with majority vote
    on_overlap_mantain = max # Max for the biggest span, min for the smallest

    # Iterate over each annotator's spans
    for annotator_index_a, annotations_a in enumerate(annotations): # For each annotator
        for annotation_index_a, annotation_a in enumerate(annotations_a): # For each annotation

            idx_with_label_a = (annotation_a["start_idx"], annotation_a["end_idx"]), annotation_a["label"]
            all_data[idx_with_label_a]= annotation_a

            # Add the span itself to the votes
            if idx_with_label_a not in span_votes:
                span_votes[idx_with_label_a] = 0
            span_votes[idx_with_label_a] += 1

            # Holds one copy of the current key per overlapping annotation found below
            overlapping_spans = []

            for annotator_index_b, annotator_spans_b in enumerate(annotations): # For each annotator
                for annotation_index_b, annotation_b in enumerate(annotator_spans_b): # For each annotation
                    if annotator_index_a == annotator_index_b and \
                        annotation_index_a == annotation_index_b: continue # omit the current annotation when overlap check

                    idx_with_label_b= (annotation_b["start_idx"], annotation_b["end_idx"]), annotation_b["label"]

                    # If same label annotated
                    # if spans overlap
                    # if not previously counted
                    # NOTE(review): the membership test uses the key of annotation b while the list
                    # stores the key of annotation a, so it only filters annotations identical to a
                    # once a is already in the list. Other annotations of the same annotator are
                    # also compared. Confirm this is the intended vote count.
                    if  annotation_a["label"] == annotation_b["label"] and \
                        spans_overlap((annotation_a["start_idx"], annotation_a["end_idx"]), (annotation_b["start_idx"], annotation_b["end_idx"])) \
                        and idx_with_label_b not in overlapping_spans:

                        # overlapping_spans.append(idx_with_label_b)
                        overlapping_spans.append(idx_with_label_a)

            if overlapping_spans: # If overlaps exist
                for existing_span in overlapping_spans: # For each overlap vote
                    if existing_span not in span_votes:
                        span_votes[existing_span] = 0
                    span_votes[existing_span] += 1

    # del counted_overlaps # Delete aux array
    # Determine majority agreement spans. Maps each label to a list of (span, label) keys
    majority_spans = {}
    total_annotators = len(annotations)

    for ((start_idx, end_idx), label), votes in span_votes.items():
        span_a= (start_idx, end_idx)
        if votes > (total_annotators / 2):  # More than half for majority
            if label not in majority_spans:
                majority_spans[label]= [(span_a, label)]
            else: # if label already exist can be two cases. Multiple different spans or overlapped span.
                is_overlapped= False
                for idx_majority, (span_b, label_b) in enumerate(majority_spans[label]):
                    if spans_overlap(span_a, span_b):
                        is_overlapped= True
                        # Keep a single entry for the overlap: the longest of the two spans
                        majority_spans[label][idx_majority]= (on_overlap_mantain([span_a, span_b], key=lambda span: span[1] - span[0]), label_b)  # Sort by length of the span)

                if not is_overlapped: # If is a new label
                    majority_spans[label].append((span_a, label))
        else:
            # This branch has no effect while strict_majority is True
            if not strict_majority:
                # If not majority vote we can also include it.
                if label not in majority_spans:
                    majority_spans[label]= [(span_a, label)]

    # Translate the accepted (span, label) keys back into their annotation dicts
    definitive= []
    for _label, _annotations in majority_spans.items():
        for _annotation in _annotations:
            if final_annotator_id is not None: all_data[_annotation]["annotator_id"]= final_annotator_id
            definitive.append(all_data[_annotation])
    return definitive # Return an empty list if no majority found

def create_gold_standard(df, final_annotator_id= "gold_standard"):
    """
    Create a gold standard corpus using majority voting.

    The annotations are grouped by ``(text_id, label)``. For every group the
    annotations of each annotator found in ``df`` are collected (an annotator
    without annotations in the group contributes an empty list) and passed to
    ``majority_vote``.

    Parameters:
    df (pandas.DataFrame): Annotations with at least the columns text_id, label and
        annotator_id, plus the columns read by ``majority_vote``.
    final_annotator_id: Annotator id assigned to the resulting annotations.

    Returns:
    pandas.DataFrame: The annotations selected by majority vote (empty when there are none).
    """
    gold_standard = []
    annotator_ids= df.annotator_id.unique()

    # Each group holds the annotations of one label in one sentence
    for _group_key, group in df.groupby(['text_id', 'label']):
        annotation_spans = []
        for annotator_id in annotator_ids:
            # Filter once per annotator instead of once per row
            annotator_rows = group[group.annotator_id == annotator_id]
            annotation_spans.append([annotator_rows.iloc[row_idx].to_dict() for row_idx in range(len(annotator_rows))])

        # Keep only the annotations with majority agreement in this group
        gold_standard.extend(majority_vote(annotation_spans, final_annotator_id=final_annotator_id))

    return pd.DataFrame(gold_standard)

## Load the annotations

- **Disaggregated dataset**: disaggregated_annotations_ita and disaggregated_annotations_spa dataframes store the disaggregated annotations. 
- **Aggregated dataset**: ita_gold and spa_gold dataframes store the aggregated annotations. 

The dataset structure is as follows:
| text_id | text | annotator_id | purpose | label | comment | span | start_idx | end_idx | text_external_id |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |

### Disaggregated Dataset Load

In [None]:
# Annotation exports of the two Italian subsets
ita_annotations_files = [
    "Data/annotations_italian_1.json",
    "Data/annotations_italian_2.json",
]

# Annotation exports of the two Spanish subsets
spa_annotations_files = [
    "Data/annotations_spanish_1.json",
    "Data/annotations_spanish_2.json",
]

# Extra Spanish annotations made on the sentences with disagreement
disaggreement_annotations_files_spa = [
    "Data/annotations_spanish_disagreement.json",
]


In [None]:
# Gather the annotation records of every Italian subset and turn them into a dataframe
disaggregated_annotations_ita = list()
for p in ita_annotations_files:
    disaggregated_annotations_ita += extract_annotations(p)

disaggregated_annotations_ita = pd.DataFrame(disaggregated_annotations_ita)
disaggregated_annotations_ita.head(3)

In [None]:
# Gather the annotation records of every Spanish subset and turn them into a dataframe
disaggregated_annotations_spa = list()
for p in spa_annotations_files:
    disaggregated_annotations_spa += extract_annotations(p)

disaggregated_annotations_spa = pd.DataFrame(disaggregated_annotations_spa)
disaggregated_annotations_spa.head(3)

### Inter-Annotator Agreement

#### Italian

In [None]:
# Italian annotators are 7 and 9: the first one acts as reference, the second one as predictions
gold_annotator_id_ita, predictions_annotator_id_ita = 7, 9

In [None]:
# Per-label agreement between the two Italian annotators
agreement_f1_scores_ita = f_scores_per_label(
    disaggregated_annotations_ita,
    gold_annotator_id_ita,
    predictions_annotator_id_ita,
)
print("Italian f1 scores per label")
agreement_f1_scores_ita

In [None]:
# Sentence-level agreement between the two Italian annotators
agreement_in_n_sentences_ita = full_agreement_per_sentence(
    disaggregated_annotations_ita,
    gold_annotator_id_ita,
    predictions_annotator_id_ita,
)
print(f"From the {agreement_in_n_sentences_ita['correct']+agreement_in_n_sentences_ita['incorrect']} sentences in the Italian dataset, only {agreement_in_n_sentences_ita['correct']} have a high agreement (above 0.5 agreement). This means that {agreement_in_n_sentences_ita['incorrect']} have no full agreement.")

In [None]:
# Label-level agreement between the two Italian annotators
agreement_in_annotations_ita = full_agreement_per_annotation(
    disaggregated_annotations_ita,
    gold_annotator_id_ita,
    predictions_annotator_id_ita,
)
print("The agreement between annotator for each label. This shows the number of annotations that have a high annotation (above 0.5 agreement) for each label in each sentence")
agreement_in_annotations_ita

#### Spanish

In [None]:
# Spanish annotators are 7 and 27: annotator 27 acts as reference, annotator 7 as predictions
gold_annotator_id_spa, predictions_annotator_id_spa = 27, 7

In [None]:
# Per-label agreement between the two Spanish annotators
agreement_f1_scores_spa = f_scores_per_label(
    disaggregated_annotations_spa,
    gold_annotator_id_spa,
    predictions_annotator_id_spa,
)
print("Spanish f1 scores per label")
agreement_f1_scores_spa

In [None]:
# Sentence-level agreement between the two Spanish annotators
agreement_in_n_sentences_spa = full_agreement_per_sentence(
    disaggregated_annotations_spa,
    gold_annotator_id_spa,
    predictions_annotator_id_spa,
)
print(f"From the {agreement_in_n_sentences_spa['correct']+agreement_in_n_sentences_spa['incorrect']} sentences in the Spanish dataset, only {agreement_in_n_sentences_spa['correct']} have a high agreement (above 0.5 agreement). This means that {agreement_in_n_sentences_spa['incorrect']} have no full agreement.")

In [None]:
# Label-level agreement between the two Spanish annotators
agreement_in_annotations_spa = full_agreement_per_annotation(
    disaggregated_annotations_spa,
    gold_annotator_id_spa,
    predictions_annotator_id_spa,
)
print("The agreement between annotator for each label. This shows the number of annotations that have a high annotation (above 0.5 agreement) for each label in each sentence")
agreement_in_annotations_spa

#### Stats and Label Distribution

In [None]:
# F1 scores per label in both datasets and the number of annotations

def _value_or_dash(scores, label, column, ndigits=None):
    """Return the `column` value of `label` in `scores`, or "-" when the label is not in that table.

    When `ndigits` is given the value is rounded to that number of decimals.
    """
    values = scores.loc[scores["label"] == label, column]
    if values.empty:
        return "-"
    value = values.iloc[0]
    return value if ndigits is None else round(value, ndigits)

# Union of the labels found in either language, in alphabetical order
_labels= sorted(set(agreement_f1_scores_ita["label"].unique().tolist() + agreement_f1_scores_spa["label"].unique().tolist()))
_f1_spanish= [_value_or_dash(agreement_f1_scores_spa, label, "f1_score", ndigits=2) for label in _labels]
_n_ann_spanish= [_value_or_dash(agreement_f1_scores_spa, label, "n_annotations") for label in _labels]
_f1_italian= [_value_or_dash(agreement_f1_scores_ita, label, "f1_score", ndigits=2) for label in _labels]
_n_ann_italian= [_value_or_dash(agreement_f1_scores_ita, label, "n_annotations") for label in _labels]

agreement_f1_scores_aggregated= pd.DataFrame({
    "labels": _labels,
    "Spanish F1": _f1_spanish,
    "Italian F1": _f1_italian,
    "Spanish n_ann": _n_ann_spanish,
    "Italian n_ann": _n_ann_italian
})

# print(agreement_f1_scores_aggregated.drop(columns=["Italian n_ann", "Spanish n_ann"]).to_latex(index=False)) # For replicating the paper table
agreement_f1_scores_aggregated

In [None]:
# Per-label count of headlines with and without agreement, Spanish and Italian side by side.
# The outer join on the label index keeps the labels that appear in only one language.
merged_df = agreement_in_annotations_spa.merge(
    agreement_in_annotations_ita,
    how='outer',
    left_index=True,
    right_index=True,
    suffixes=('_spa', '_ita'),
)
merged_df

In [None]:
# Number of annotations per label category for both Italian and Spanish datasets

# Convert 'Spanish n_ann' and 'Italian n_ann' to numeric, coercing errors to NaN (for the '-' entry)
agreement_f1_scores_aggregated['Spanish n_ann'] = pd.to_numeric(agreement_f1_scores_aggregated['Spanish n_ann'], errors='coerce')
agreement_f1_scores_aggregated['Italian n_ann'] = pd.to_numeric(agreement_f1_scores_aggregated['Italian n_ann'], errors='coerce')

# Target labels
_labels_grouped= ["dangerous", "derogatory", "entities", "vulnerable identities" ]

# One row mask per target category, in the same order as _labels_grouped
_label_column= agreement_f1_scores_aggregated['labels']
_category_masks= [
    _label_column == "dangerous",
    _label_column == "derogatory",
    _label_column.str.startswith('entity'),
    # Sub-categories plus the bare label. Summing over the mask skips NaN and does not
    # fail when "vulnerable identity" is missing, unlike indexing its first value.
    _label_column.str.startswith('vulnerable group') | (_label_column == "vulnerable identity"),
]

# Spanish
_n_ann_spanish_grouped= [agreement_f1_scores_aggregated.loc[mask, 'Spanish n_ann'].sum() for mask in _category_masks]
_spa_grp__vg= _n_ann_spanish_grouped[-1]

# Italian
_n_ann_italian_grouped= [agreement_f1_scores_aggregated.loc[mask, 'Italian n_ann'].sum() for mask in _category_masks]
_ita_grp__vg= _n_ann_italian_grouped[-1]

n_annotations_grouped= pd.DataFrame({"labels": _labels_grouped, "Spanish n_ann": _n_ann_spanish_grouped,"Italian n_ann": _n_ann_italian_grouped})

print("Label distribution of the disaggregated dataset grouped in categories")
print("---"*10)
# print(n_annotations_grouped.to_latex(index=False)) # For replicating the paper table
n_annotations_grouped

### Gold-Standard Dataset

In [None]:
# CSV files where the gold-standard datasets are stored
gold_file_path_ita, gold_file_path_spa = "Data/ita_gold.csv", "Data/spa_gold.csv"

#### Generation

In [None]:
# Aggregate the Italian annotations by majority vote and store the result as CSV
ita_gold = create_gold_standard(disaggregated_annotations_ita)
ita_gold.to_csv(path_or_buf=gold_file_path_ita, index=False)
ita_gold.head(3)

In [None]:
# Add the disagreement annotations to the annotations of the two Spanish subsets.
# pd.concat returns a new dataframe, so its result has to be kept.
spa_annotations_all= pd.concat(
    [disaggregated_annotations_spa]
    + [pd.DataFrame(extract_annotations(ann)) for ann in disaggreement_annotations_files_spa],
    ignore_index=True,
)

# Generate gold-standard dataset
spa_gold= create_gold_standard(spa_annotations_all)
spa_gold.to_csv(gold_file_path_spa, index=False)
spa_gold.head(n=3)

#### Load

In [None]:
# Read back the stored gold-standard datasets (Italian first, then Spanish)
ita_gold = pd.read_csv(filepath_or_buffer=gold_file_path_ita)
spa_gold = pd.read_csv(filepath_or_buffer=gold_file_path_spa)

#### Stats and Label distribution

In [None]:
# Number of annotations per label category for both Italian and Spanish datasets

_labels= sorted(set(spa_gold["label"].unique().tolist() + ita_gold["label"].unique().tolist()))

# Count the annotations of every label once per dataset instead of filtering the dataframe per label
_spa_label_counts= spa_gold["label"].value_counts()
_ita_label_counts= ita_gold["label"].value_counts()

spa_gold_n_ann_aggregated= [int(_spa_label_counts[label]) if label in _spa_label_counts.index else "-" for label in _labels]
ita_gold_n_ann_aggregated= [int(_ita_label_counts[label]) if label in _ita_label_counts.index else "-" for label in _labels]

gold_aggregated= pd.DataFrame({"labels": _labels,
                              "Spanish n_ann": spa_gold_n_ann_aggregated,
                              "Italian n_ann": ita_gold_n_ann_aggregated})

# Convert 'Spanish n_ann' and 'Italian n_ann' to numeric, coercing errors to NaN (for the '-' entry)
gold_aggregated['Spanish n_ann'] = pd.to_numeric(gold_aggregated['Spanish n_ann'], errors='coerce')
gold_aggregated['Italian n_ann'] = pd.to_numeric(gold_aggregated['Italian n_ann'], errors='coerce')

# Target labels
_labels_grouped= ["dangerous", "derogatory", "entities", "vulnerable identities" ]

# One row mask per target category, in the same order as _labels_grouped
_label_column= gold_aggregated['labels']
_category_masks= [
    _label_column == "dangerous",
    _label_column == "derogatory",
    _label_column.str.startswith('entity'),
    # The bare "vulnerable identity" label is counted with the sub-categories, as in the
    # disaggregated table; it adds 0 when the gold standard does not contain it.
    _label_column.str.startswith('vulnerable group') | (_label_column == "vulnerable identity"),
]

# Spanish
_n_ann_spanish_grouped= [gold_aggregated.loc[mask, 'Spanish n_ann'].sum() for mask in _category_masks]
_spa_grp__vg= _n_ann_spanish_grouped[-1]

# Italian
_n_ann_italian_grouped= [gold_aggregated.loc[mask, 'Italian n_ann'].sum() for mask in _category_masks]
_ita_grp__vg= _n_ann_italian_grouped[-1]

gold_gruped= pd.DataFrame({"labels": _labels_grouped, "Spanish n_ann": _n_ann_spanish_grouped,"Italian n_ann": _n_ann_italian_grouped})

print("Label distribution of the aggregated dataset")
print("---"*10)
# print(gold_gruped.to_latex(index=False)) # For replicating the paper table

gold_gruped

## Test LLM Performance

In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Labels that are going to be shown (to reduce noise)
improtant_labels = [
    "dangerous",
    "derogatory",
    "entity",
    "vulnerable identity",
]

### Experiments definition

In [None]:
# Dataset adaptation. In the paper we reduce the work to four label categories: dangerous, derogatory, vulnerable identities and named entities
category_mapper= {
                'dangerous': "dangerous",
                'derogatory': "derogatory",

                'entity': "entity",
                'entity - person': "entity",
                'entity - group': "entity",
                'entity - organization': "entity",
                'entity - location': "entity",
                'entity - other': "entity",

                'vulnerable group - migrant': 'vulnerable identity',
                'vulnerable group - ethnic minority': 'vulnerable identity',
                'vulnerable group - religious minority': 'vulnerable identity',
                'vulnerable group - women': 'vulnerable identity',
                'vulnerable group - lgbtq+ community': 'vulnerable identity',
                'vulnerable group - other': 'vulnerable identity',
                'vulnerable identity': 'vulnerable identity'
}

In [None]:
def experiment_restrictive_zero_shot(df, model, model_name, prompts, task_name= "restrictive-zero-shot"):
    """
    Approach 1: restrictive-zero-shot.

    The model knows which and how many tags are present in the sentence. It has to
    find the span corresponding to each one: given a sentence and the tag, it finds the span.

    Parameters:
    df (pandas.DataFrame): Reference annotations; the columns text_id, text and label are read.
    model: Callable invoked as ``model(question=..., context=..., top_k=...)`` that returns
        answers with the keys "start", "end" and "answer" — presumably a Hugging Face
        question-answering pipeline; verify against the caller.
    model_name (str): Name used to build the annotator id of the generated annotations.
    prompts (dict): Question to ask for each label.
    task_name (str): Prefix of the annotator id of the generated annotations.

    Returns:
    pandas.DataFrame: At most one generated annotation per reference annotation, with
    annotator_id ``"{task_name}/{model_name}"``. A reference annotation produces no row
    when every candidate answer was already used in its sentence.
    """
    model_generated= []
    annotation_options= 5 # Number of candidate answers requested for every annotation

    for _, annotations in tqdm(df.groupby(["text_id"])):
        # Answers already assigned in this sentence, so that two annotations never get the same span
        used_answers= set()
        for _, annotation in annotations.iterrows():
            answers= model(question=prompts[annotation.label], context=annotation.text, top_k=annotation_options)
            # The model may return fewer candidates than requested, or a single dict
            # instead of a list, so the candidates are not indexed blindly.
            if isinstance(answers, dict):
                answers= [answers]

            for answer in answers[:annotation_options]:
                answer_key= ((answer["start"], answer["end"]), answer["answer"])
                if answer_key in used_answers:
                    continue
                used_answers.add(answer_key)

                _annotation= annotation.copy()
                _annotation['annotator_id']= f"{task_name}/{model_name}"
                _annotation['span']= answer["answer"]
                _annotation['start_idx']= answer["start"]
                _annotation['end_idx']= answer["end"]
                _annotation['comment']= None

                model_generated.append(_annotation)
                break # Only the first unused candidate is kept
    return pd.DataFrame(model_generated)

def experiment_non_restrictive_zero_shot(df, model, model_name, prompts, task_name= "non-restrictive-zero-shot", score_treshold=0.002):
    """Approach 2 (non-restrictive zero-shot): find both the tags and their spans.

    The model does not know how many or which tags are present in a text. Every
    question in ``prompts`` is asked once per distinct (``text_id``, ``text``)
    pair, and each sufficiently confident answer becomes a candidate annotation
    attributed to a pseudo-annotator named after its rank in the answer list.
    The candidates are then passed to ``create_gold_standard``.

    Args:
        df: annotations with at least ``text_id`` and ``text`` columns; the first
            row of each group is used as the template for generated rows.
        model: callable invoked as ``model(question=..., context=..., top_k=...)``
            returning answer dicts with ``answer``, ``start``, ``end`` and
            ``score`` keys (in this notebook, a ``transformers`` QA pipeline).
        model_name: model identifier, used to build the ``annotator_id`` values.
        prompts: mapping from label to the question asked for that label.
        task_name: prefix of the generated ``annotator_id`` values.
        score_treshold: answers whose score is less than or equal to this value
            are discarded.

    Returns:
        Whatever ``create_gold_standard`` returns for the candidate annotations
        (per the original comment, their aggregation by majority vote).
    """
    annotation_max= 3 # Number of annotations extracted per query to the model. Each one is assumed to come from a different annotator so a majority vote can be created later.
    model_generated= []

    for _, annotations in tqdm(df.groupby(["text_id", "text"])):
        template= annotations.iloc[0] # Rows of a group share text_id and text, so the first one is the template
        text= template["text"]

        for label, question in prompts.items():
            answers= model(question=question, context=text, top_k=annotation_max)
            # The pipeline returns a bare dict when only one answer is available;
            # enumerating it would walk its keys instead of the answers.
            if isinstance(answers, dict):
                answers= [answers]

            for idx_answer, answer in enumerate(answers):
                if answer["score"] <= score_treshold:
                    continue
                if answer["start"] == 0 and answer["end"] == len(text):
                    continue # The full sentence is not a valid annotation

                to_add= template.copy()
                # Item assignment (rather than attribute assignment) always writes the field,
                # even when the template row does not have it yet.
                to_add["annotator_id"]= f"{task_name}/{model_name}/{idx_answer}"
                to_add["label"]= label
                to_add["span"]= answer["answer"]
                to_add["start_idx"]= answer["start"]
                to_add["end_idx"]= answer["end"]
                model_generated.append(to_add)

    model_generated= pd.DataFrame(model_generated)
    model_generated= create_gold_standard(model_generated, final_annotator_id=f"{task_name}/{model_name}") # Aggregate all the annotations by majority vote

    return model_generated

def eval_experiments(df, model_name, task_name, gold_annotator_id= "gold_standard"):
    """Score the predictions of one model/task against the gold annotator.

    The predictions are the rows of ``df`` whose ``annotator_id`` equals
    ``f"{task_name}/{model_name}"``. Recall and precision are computed with the
    notebook helpers and combined by ``calculate_f1``; every resulting per-label
    entry is extended with the number of gold and predicted rows for that label.

    Returns:
        DataFrame built from the per-label entries, sorted by ``label``
        (ascending) with a fresh index.
    """
    predictions_annotator_id= f"{task_name}/{model_name}"

    recall_scores, _ = recall(df, gold_annotator_id, predictions_annotator_id)
    precision_scores, _ = precision(df, gold_annotator_id, predictions_annotator_id)
    per_label_scores= calculate_f1(recall_scores, precision_scores)

    # Row masks that do not depend on the label are computed once
    from_gold= df["annotator_id"] == gold_annotator_id
    from_predictions= df["annotator_id"] == predictions_annotator_id

    for entry in per_label_scores:
        has_label= df["label"] == entry["label"]
        entry.update(
            n_in_gold_standard= int((has_label & from_gold).sum()),
            n_in_predictions= int((has_label & from_predictions).sum()),
        )

    scores_table= pd.DataFrame(per_label_scores)
    return scores_table.sort_values('label', ascending=True, ignore_index=True)

### Italian

In [None]:
# These are the prompts written for every label. The gold data is reduced to four label categories,
# so the restrictive experiment only looks up those prompts; the non-restrictive experiment iterates over every entry.
ita_prompts= {
                'dangerous': "Quale parte del testo è pericolosa (criminalizza, ridicolizza, incita alla violenza, ...) nei confronti di identità vulnerabili (donne, migranti, minoranze etniche, ...)?",
                'derogatory': "Quale parte del testo è dispregiativa (commenti negativi o denigratori sulla vittima senza un esplicito discorso d'odio, ma in cui la menzione stessa è discriminatoria o offensiva e spesso usa un tono volto a sminuire o screditare la vittima)?",
            
                'entity': "Quale entità nominata è menzionata nella frase?",
                # Italian questions take no opening "¿" (it had been carried over from the Spanish prompts)
                'entity - person': "Quale persona è menzionata nella frase?",
                'entity - group': "Quale gruppo è menzionato nella frase?",
                'entity - organization': "Quale organizzazione è menzionata nella frase?",
                'entity - location': "Quale luogo è menzionato nella frase?",
                'entity - other': "Quale entità che non sia una persona, un gruppo, un'organizzazione o un luogo è menzionata nella frase?",
                
                'vulnerable entity': "Quale persona, gruppo, organizzazione, luogo, ecc. vulnerabile ai discorsi d'odio è menzionato nella frase?",
                'vulnerable group - migrant': "Quale parte del testo parla di migranti?",
                'vulnerable group - ethnic minority': "Quale parte del testo parla di minoranze etniche?",
                'vulnerable group - religious minority': "Quale parte del testo parla di minoranze religiose?",
                'vulnerable group - women': "Quale parte del testo parla di donne?",
                # Closing question mark added so this prompt is a question like all the others
                'vulnerable group - lgbtq+ community': "Quale parte del testo parla di persone LGBTQ+?",
                'vulnerable group - other': "Quale parte del testo parla di gruppi vulnerabili che non siano migranti, minoranze etniche o religiose, donne, comunità LGBTQ+?",
                'vulnerable identity': "Quale identità vulnerabile ai discorsi d'odio è menzionata nella frase?"
}

In [None]:
# Dataset adaptation. In the paper we reduce the work to four label categories: dangerous, derogatory, vulnerable identities and named entities
# The filtered frame is copied explicitly so the label assignment below writes to an independent
# frame instead of a slice of the original one (which raises SettingWithCopyWarning).
ita_gold= ita_gold[ita_gold["label"]!='vulnerable entity'].copy() # Vulnerable entities are out of scope
ita_gold["label"]= ita_gold["label"].map(category_mapper)

In [None]:
# Italian checkpoints that will be loaded into the question-answering pipeline
models_ita= [
    "gsarti/it5-base",
    "morenolq/bart-it",
]
# Evaluation tables, keyed by model name
models_results_ita= dict()

In [None]:
for qa_italian_model in models_ita:
    model= pipeline("question-answering", qa_italian_model, device=device)

    # Predictions of both zero-shot approaches, stacked below the gold annotations
    r_z_s_ita_df= experiment_restrictive_zero_shot(ita_gold, model=model, model_name=qa_italian_model, prompts=ita_prompts)
    nr_z_s_ita_df= experiment_non_restrictive_zero_shot(ita_gold, model=model, model_name=qa_italian_model, prompts=ita_prompts)
    ita_df= pd.concat([ita_gold, r_z_s_ita_df, nr_z_s_ita_df])

    # Score each approach against the gold standard, keeping only the labels listed in improtant_labels
    results_nr_z_s= (
        eval_experiments(ita_df, qa_italian_model, task_name="non-restrictive-zero-shot", gold_annotator_id="gold_standard")
        .loc[lambda scores: scores["label"].isin(improtant_labels)]
    )
    results_r_z_s= (
        eval_experiments(ita_df, qa_italian_model, task_name="restrictive-zero-shot", gold_annotator_id="gold_standard")
        .loc[lambda scores: scores["label"].isin(improtant_labels)]
    )

    models_results_ita[qa_italian_model]= {
        "non-restrictive-zero-shot": results_nr_z_s,
        "restrictive-zero-shot": results_r_z_s,
    }


In [None]:
# Optional: uncomment to save the annotations held in ita_df to a csv
# ita_df.to_csv("Data/annotations_ita.csv", index=False)
ita_df.head(3)

### Spanish

In [None]:
# These are all the prompts created for each label (one question per label name).
spa_prompts= {
    # Hate-speech related labels
    "dangerous": "¿Qué parte del texto es peligroso (criminaliza, ridiculiza, incita a la violencia, ...) contra identidades vulnerables (mujeres, migrantes, minorías étnicas, ...)?",
    "derogatory": "¿Qué parte del texto es derogativo (comentarios negativos o despectivos sobre la víctima sin incitación explícita al odio, pero la mención en sí es discriminatoria u ofensiva, y a menudo emplea un tono destinado a menospreciar o desacreditar a la víctima)?",

    # Named entities
    "entity": "¿Qué entidad nombrada se menciona en la frase?",
    "entity - person": "¿Qué persona se menciona en la frase?",
    "entity - group": "¿Qué grupo se menciona en la frase?",
    "entity - organization": "¿Qué organización se menciona en la frase?",
    "entity - location": "¿Qué lugar se menciona en la frase?",
    "entity - other": "¿Qué entidad que no es una persona, grupo, organización ni lugar se menciona en la frase?",
    "vulnerable entity": "¿Qué persona, grupo, organización, lugar, etc. vulnerable al discurso de odio se menciona en la frase?",

    # Vulnerable groups and identities
    "vulnerable group - migrant": "¿Qué parte del texto habla sobre los migrantes?",
    "vulnerable group - ethnic minority": "¿Qué parte del texto habla sobre minorías étnicas?",
    "vulnerable group - religious minority": "¿Qué parte del texto habla sobre minorías religiosas?",
    "vulnerable group - women": "¿Qué parte del texto habla sobre las mujeres?",
    "vulnerable group - lgbtq+ community": "¿Qué parte del texto habla sobre el colectivo LGBTQ+?",
    "vulnerable group - other": "¿Qué parte del texto habla sobre algún grupo vulnerable que no sean migrantes, minorías étnicas o religiosas, mujeres, colectivo LGBTQ+?",
    "vulnerable identity": "¿Qué identidad vulnerable al discurso de odio se menciona en la frase?",
}

In [None]:
# Dataset adaptation. In the paper we reduce the work to four label categories: dangerous, derogatory, vulnerable identities and named entities
# The filtered frame is copied explicitly so the label assignment below writes to an independent
# frame instead of a slice of the original one (which raises SettingWithCopyWarning).
spa_gold= spa_gold[spa_gold["label"]!='vulnerable entity'].copy() # Vulnerable entities are out of scope
spa_gold["label"]= spa_gold["label"].map(category_mapper)

In [None]:
# Spanish checkpoints that will be loaded into the question-answering pipeline
models_spa= [
    "vgaraujov/t5-base-spanish",
    "vgaraujov/bart-base-spanish",
]
# Evaluation tables, keyed by model name
models_results_spa= dict()

In [None]:
for qa_spanish_model in models_spa:
    model= pipeline("question-answering", qa_spanish_model, device=device)

    # Predictions of both zero-shot approaches, stacked below the gold annotations
    r_z_s_spa_df= experiment_restrictive_zero_shot(spa_gold, model=model, model_name=qa_spanish_model, prompts=spa_prompts)
    nr_z_s_spa_df= experiment_non_restrictive_zero_shot(spa_gold, model=model, model_name=qa_spanish_model, prompts=spa_prompts)
    spa_df= pd.concat([spa_gold, r_z_s_spa_df, nr_z_s_spa_df])

    # Score each approach against the gold standard, keeping only the labels listed in improtant_labels
    results_nr_z_s= (
        eval_experiments(spa_df, qa_spanish_model, task_name="non-restrictive-zero-shot", gold_annotator_id="gold_standard")
        .loc[lambda scores: scores["label"].isin(improtant_labels)]
    )
    results_r_z_s= (
        eval_experiments(spa_df, qa_spanish_model, task_name="restrictive-zero-shot", gold_annotator_id="gold_standard")
        .loc[lambda scores: scores["label"].isin(improtant_labels)]
    )

    models_results_spa[qa_spanish_model]= {
        "non-restrictive-zero-shot": results_nr_z_s,
        "restrictive-zero-shot": results_r_z_s,
    }
        

In [None]:
# Optional: uncomment to save the annotations held in spa_df to a csv
# spa_df.to_csv("Data/annotations_spa.csv", index=False)
spa_df.head(3)

## Evaluate

In [None]:
# Non Restrictive Zero-shot T5
# Both score series are indexed by label before being combined, so each Spanish and Italian F1
# is paired with its own label rather than with whatever row position survived the label filtering
# of each results table. (Assumes each label appears once per results table.)
ita_f1_nr_t5= models_results_ita["gsarti/it5-base"]["non-restrictive-zero-shot"].set_index("label")["f1_score"].round(2)
spa_f1_nr_t5= models_results_spa["vgaraujov/t5-base-spanish"]["non-restrictive-zero-shot"].set_index("label")["f1_score"].round(2)
nr_z_s_t5= (
    pd.DataFrame({"spanish": spa_f1_nr_t5, "italian": ita_f1_nr_t5})
    .rename_axis("labels")
    .reset_index()
)
print("Non-Restrictive Zero-Shot Experiments With T5 Model")
print(nr_z_s_t5.to_latex(index=False)) # Paper table values

In [None]:
# Non Restrictive Zero-shot BART
# Both score series are indexed by label before being combined, so each Spanish and Italian F1
# is paired with its own label rather than with whatever row position survived the label filtering
# of each results table. (Assumes each label appears once per results table.)
ita_f1_nr_bart= models_results_ita["morenolq/bart-it"]["non-restrictive-zero-shot"].set_index("label")["f1_score"].round(2)
spa_f1_nr_bart= models_results_spa["vgaraujov/bart-base-spanish"]["non-restrictive-zero-shot"].set_index("label")["f1_score"].round(2)
nr_z_s_bart= (
    pd.DataFrame({"spanish": spa_f1_nr_bart, "italian": ita_f1_nr_bart})
    .rename_axis("labels")
    .reset_index()
)
print("Non-Restrictive Zero-Shot Experiments With BART Model")
print(nr_z_s_bart.to_latex(index=False)) # Paper table values

In [None]:
# Restrictive Zero-shot T5
# Both score series are indexed by label before being combined, so each Spanish and Italian F1
# is paired with its own label rather than with whatever row position survived the label filtering
# of each results table. (Assumes each label appears once per results table.)
ita_f1_r_t5= models_results_ita["gsarti/it5-base"]["restrictive-zero-shot"].set_index("label")["f1_score"].round(2)
spa_f1_r_t5= models_results_spa["vgaraujov/t5-base-spanish"]["restrictive-zero-shot"].set_index("label")["f1_score"].round(2)
r_z_s_t5= (
    pd.DataFrame({"spanish": spa_f1_r_t5, "italian": ita_f1_r_t5})
    .rename_axis("labels")
    .reset_index()
)
print("Restrictive Zero-Shot Experiments With T5 Model")
print(r_z_s_t5.to_latex(index=False)) # Paper table values

In [None]:
# Restrictive Zero-shot BART
# Both score series are indexed by label before being combined, so each Spanish and Italian F1
# is paired with its own label rather than with whatever row position survived the label filtering
# of each results table. (Assumes each label appears once per results table.)
ita_f1_r_bart= models_results_ita["morenolq/bart-it"]["restrictive-zero-shot"].set_index("label")["f1_score"].round(2)
spa_f1_r_bart= models_results_spa["vgaraujov/bart-base-spanish"]["restrictive-zero-shot"].set_index("label")["f1_score"].round(2)
r_z_s_bart= (
    pd.DataFrame({"spanish": spa_f1_r_bart, "italian": ita_f1_r_bart})
    .rename_axis("labels")
    .reset_index()
)
print("Restrictive Zero-Shot Experiments With BART Model")
print(r_z_s_bart.to_latex(index=False)) # Paper table values