<a href="https://colab.research.google.com/github/nicolaCirillo/ate-it/blob/main/evaluation/subtask_a_evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import pandas as pd
import codecs
import json
from collections import defaultdict

def load_data(file_path):
  """
  Loads data from a CSV or JSON file and returns a dictionary
  where keys are (doc_id, par_id, sent_id) tuples and values are
  lists of terms.

  Args:
    file_path: The path to the input file (CSV or JSON).

  Returns:
    A dictionary containing the loaded data.

  Raises:
    ValueError: If the file format is not supported.
  """
  if file_path.endswith('.csv'):
    # Load data from CSV file
    df = pd.read_csv(file_path)
    df.fillna('', inplace=True) # Fill NaN values with empty strings
    data = defaultdict(list)
    # Iterate over rows and extract terms
    for doc_id, par_id, sent_id, _, term in df.itertuples(index=False):
      if term.strip() != '':
        data[(doc_id, par_id, sent_id)].append(term.strip())
      else:
         data[(doc_id, par_id, sent_id)]
  elif file_path.endswith('.json'):
    # Load data from JSON file
    with codecs.open(file_path, 'r', 'utf-8') as f:
      json_data = json.load(f)
    # Extract terms from JSON data
    data = {(row["document_id"], int(row["paragraph_id"]), int(row["sentence_id"])): row["term_list"]
            for row in json_data["data"]}
  else:
    # Raise error for unsupported file formats
    raise ValueError("Unsupported file format. Only CSV and JSON files are supported.")
  return data

In [7]:
def micro_f1_score(gold_standard, system_output):
  """
  Evaluates a term extraction system's performance using Precision, Recall,
  and F1 score based on individual term matching (micro-average).

  Args:
    gold_standard: A list of lists, where each inner list contains the
                   gold standard terms for an item.
    system_output: A list of lists, where each inner list contains the
                   terms extracted by the system for the corresponding item.

  Returns:
    A tuple containing the Precision, Recall, and F1 score.
  """
  total_true_positives = 0
  total_false_positives = 0
  total_false_negatives = 0

  # Iterate through each item's gold standard and system output terms
  for gold, system in zip(gold_standard, system_output):
    # Convert to sets for efficient comparison
    gold_set = set(gold)
    system_set = set(system)

    # Calculate True Positives, False Positives, and False Negatives for the current item
    true_positives = len(gold_set.intersection(system_set))
    false_positives = len(system_set - gold_set)
    false_negatives = len(gold_set - system_set)

    # Accumulate totals across all items
    total_true_positives += true_positives
    total_false_positives += false_positives
    total_false_negatives += false_negatives

  # Calculate Precision, Recall, and F1 score (micro-average)
  precision = total_true_positives / (total_true_positives + total_false_positives) if (total_true_positives + total_false_positives) > 0 else 0
  recall = total_true_positives / (total_true_positives + total_false_negatives) if (total_true_positives + total_false_negatives) > 0 else 0
  f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

  return precision, recall, f1

In [8]:
def type_f1_score(gold_standard, system_output):
  """
  Evaluates a term extraction system's performance using Type Precision,
  Type Recall, and Type F1 score based on the set of unique terms extracted
  at least once across the entire dataset.

  Args:
    gold_standard: A list of lists, where each inner list contains the
                   gold standard terms for an item.
    system_output: A list of lists, where each inner list contains the
                   terms extracted by the system for the corresponding item.

  Returns:
    A tuple containing the Type Precision, Type Recall, and Type F1 score.
  """

  # Get the set of all unique gold standard terms across the dataset
  all_gold_terms = set()
  for item_terms in gold_standard:
    all_gold_terms.update(item_terms)

  # Get the set of all unique system extracted terms across the dataset
  all_system_terms = set()
  for item_terms in system_output:
    all_system_terms.update(item_terms)

  # Calculate True Positives (terms present in both sets)
  type_true_positives = len(all_gold_terms.intersection(all_system_terms))

  # Calculate False Positives (terms in system output but not in gold standard)
  type_false_positives = len(all_system_terms - all_gold_terms)

  # Calculate False Negatives (terms in gold standard but not in system output)
  type_false_negatives = len(all_gold_terms - all_system_terms)

  # Calculate Type Precision, Type Recall, and Type F1 score
  type_precision = type_true_positives / (type_true_positives + type_false_positives) if (type_true_positives + type_false_positives) > 0 else 0
  type_recall = type_true_positives / (type_true_positives + type_false_negatives) if (type_true_positives + type_false_negatives) > 0 else 0
  type_f1 = 2 * (type_precision * type_recall) / (type_precision + type_recall) if (type_precision + type_recall) > 0 else 0

  return type_precision, type_recall, type_f1

In [None]:
# Get file paths for system output and gold standard from user input
SYSTEM_OUTPUT_PATH = input("Enter the path to the system output file: ")
GOLD_STANDARD_PATH = input("Enter the path to the gold standard file: ")

In [None]:
# Load data from the specified file paths
system_output_dict = load_data(SYSTEM_OUTPUT_PATH)
gold_standard_dict = load_data(GOLD_STANDARD_PATH)

# Extract the term lists from the loaded data dictionaries
gold_standard = []
system_output = []
for k in gold_standard_dict:
    gold_standard.append(gold_standard_dict[k])
    system_output.append(system_output_dict[k])

# Calculate and print micro-averaged precision, recall, and F1 score
precision, recall, f1 = micro_f1_score(gold_standard, system_output)
print("micro-averaged Precision:", round(precision, 3))
print("micro-averaged Recall:", round(recall, 3))
print("micro-averaged F1 score:", round(f1, 3))

# Calculate and print Type Precision, Type Recall, and Type F1 score
type_precision, type_recall, type_f1 = type_f1_score(gold_standard, system_output)
print("\nType Precision:", round(type_precision, 3))
print("Type Recall:", round(type_recall, 3))
print("Type F1 score:", round(type_f1, 3))

In [11]:
# Evaluate every submission

import os
# Load gold standard data
gold_standard_dict = load_data('subtask_a_test_labeled.json')


# Test models from the 'submission' folder

for filename in os.listdir('submission'):
    if filename.endswith('.json'):
        system_output_dict = load_data('submission/' + filename)

        print(f'\n\n{filename}')

        # Extract the term lists from the loaded data dictionaries
        gold_standard = []
        system_output = []
        for k in gold_standard_dict:
            gold_standard.append(gold_standard_dict[k])
            system_output.append(system_output_dict.get(k, []))

        # Calculate and print micro-averaged precision, recall, and F1 score
        precision, recall, f1 = micro_f1_score(gold_standard, system_output)
        print("micro-averaged Precision:", round(precision, 3))
        print("micro-averaged Recall:", round(recall, 3))
        print("micro-averaged F1 score:", round(f1, 3))

        # Calculate and print Type Precision, Type Recall, and Type F1 score
        type_precision, type_recall, type_f1 = type_f1_score(gold_standard, system_output)
        print("\nType Precision:", round(type_precision, 3))
        print("Type Recall:", round(type_recall, 3))
        print("Type F1 score:", round(type_f1, 3))




baseline_a_test.json
micro-averaged Precision: 0.497
micro-averaged Recall: 0.559
micro-averaged F1 score: 0.526

Type Precision: 0.435
Type Recall: 0.508
Type F1 score: 0.469


Peacemaker_subtask_A_run1.json
micro-averaged Precision: 0.497
micro-averaged Recall: 0.476
micro-averaged F1 score: 0.486

Type Precision: 0.43
Type Recall: 0.455
Type F1 score: 0.442


TrietNLP_A_1.json
micro-averaged Precision: 0.634
micro-averaged Recall: 0.568
micro-averaged F1 score: 0.599

Type Precision: 0.599
Type Recall: 0.545
Type F1 score: 0.571


OA-TE_A_2.json
micro-averaged Precision: 0.581
micro-averaged Recall: 0.522
micro-averaged F1 score: 0.55

Type Precision: 0.569
Type Recall: 0.492
Type F1 score: 0.528


OA-TE_A_1.json
micro-averaged Precision: 0.56
micro-averaged Recall: 0.447
micro-averaged F1 score: 0.497

Type Precision: 0.595
Type Recall: 0.415
Type F1 score: 0.489


msk_a_1.json
micro-averaged Precision: 0.569
micro-averaged Recall: 0.476
micro-averaged F1 score: 0.519

Type Preci