In [109]:
import json

#input and output file names
JSON_FILE_NAME = "FLanT5_predictions_k_3_processed"
OUTPUT_FILE_NAME = "test3_benchmarks"

# Load the predictions file. The context manager closes the handle even if
# parsing fails, and the explicit UTF-8 encoding keeps non-ASCII characters
# readable regardless of the platform's default encoding.
with open(f'{JSON_FILE_NAME}.json', encoding="utf-8") as f:
    output = json.load(f)

#print(json.dumps(output, indent=4))

In [110]:
import re
import copy
from collections import defaultdict

def preprocessing(json_data):
  """Parse the answer strings of every record into ``{entity: [years]}`` dicts.

  Each record is expected to carry a ``"generated_answer"`` and a
  ``"ground_truth"`` list of strings shaped like ``"Entity (2010, 2012–2014)"``.
  Exact duplicate strings in ``"generated_answer"`` are dropped (first
  occurrence wins) before parsing. The input is deep-copied and left untouched.

  Args:
    json_data: iterable of record dicts as described above.

  Returns:
    A deep copy of ``json_data`` in which both answer lists are replaced by
    lists of single-key dicts mapping the entity name to its list of years.
  """

  # Entity name, then an opening parenthesis followed by digits, commas,
  # whitespace, en dashes or hyphens; the closing parenthesis is optional.
  entry_pattern = re.compile(r"(.+?)\s*\(([\d,\s\u2013\-]*)\)?")

  def expand_years(part):
    """Turn one comma-separated timeline token into a list of years.

    Raises ValueError when the token is neither an integer nor a two-bound
    range; the caller treats that as "skip this token".
    """
    if "\u2013" in part:
      # En-dash range, e.g. "2010–2017" (inclusive on both ends).
      start, end = map(int, part.split("\u2013"))
    else:
      try:
        # Plain year; this also keeps a leading minus sign meaningful.
        return [int(part)]
      except ValueError:
        # The pattern above also admits ASCII hyphens, so "2010-2017" is a
        # range as well rather than an unparseable token.
        start, end = map(int, part.split("-"))
    return list(range(start, end + 1))

  def parse_entry(entry):
    """Split one answer string into ``{entity: [years]}``."""
    match = entry_pattern.match(entry)
    if not match:
      # No parenthesised part at all: the whole string is the entity.
      return {entry.strip(): []}

    entity = match.group(1).strip()
    timeline = (match.group(2) or "").strip()
    if not timeline:
      return {entity: []}

    years = []
    for part in timeline.split(","):
      try:
        years.extend(expand_years(part))
      except ValueError:
        pass  # Deliberately best-effort: unparseable tokens are ignored.
    return {entity: years}

  json_data_copy = copy.deepcopy(json_data)

  for record in json_data_copy:
    # dict.fromkeys removes duplicates while preserving first-seen order.
    unique_answers = list(dict.fromkeys(record["generated_answer"]))
    record["generated_answer"] = [parse_entry(answer) for answer in unique_answers]
    record["ground_truth"] = [parse_entry(answer) for answer in record["ground_truth"]]

  return json_data_copy

In [111]:
#Entity-level evaluation

def entity_precision_recall_f1_EM(data):
  """Score the generated entity names against the ground-truth entity names.

  Only the first key of every dict in ``"generated_answer"`` and
  ``"ground_truth"`` is read; the associated year lists are ignored. Counts
  are pooled over all records before precision and recall are computed.

  Args:
    data: iterable of records whose ``"generated_answer"`` and
      ``"ground_truth"`` values are lists of non-empty dicts.

  Returns:
    Tuple ``(precision, recall, f1, EM)`` of floats. ``EM`` is the fraction of
    records whose generated entity set equals the ground-truth entity set.
    Any ratio whose denominator is zero (no records, nothing generated,
    nothing expected) is reported as ``0.0`` instead of raising.
  """
  total_correct_generated = 0
  total_generated = 0
  total_ground_truth = 0
  exact_match = 0
  total = 0

  for entry in data:
    generated_entities = {list(d)[0] for d in entry["generated_answer"]}
    ground_truth_entities = {list(d)[0] for d in entry["ground_truth"]}

    total_correct_generated += len(generated_entities & ground_truth_entities)
    total_generated += len(generated_entities)
    total_ground_truth += len(ground_truth_entities)

    if generated_entities == ground_truth_entities:
      exact_match += 1
    total += 1

  # Guard every division: an empty run or a model that produced no entities
  # must yield zero scores rather than a ZeroDivisionError.
  precision = total_correct_generated / total_generated if total_generated else 0.0
  recall = total_correct_generated / total_ground_truth if total_ground_truth else 0.0
  f1 = 0.0
  if precision + recall > 0.0:
    f1 = 2 * precision * recall / (precision + recall)
  EM = exact_match / total if total else 0.0

  return precision, recall, f1, EM

In [112]:
from collections import Counter

#Timeline-level evaluation

def timeline_precision_recall_f1(data):
  """Score the generated years against the ground-truth years.

  For every record, the year lists of all dicts are flattened into one bag
  per side (entity names are ignored; only the first value of each dict is
  read). A generated year counts as correct once for every matching
  occurrence still unmatched in the ground-truth bag. Counts are pooled over
  all records before precision and recall are computed.

  Args:
    data: iterable of records whose ``"generated_answer"`` and
      ``"ground_truth"`` values are lists of non-empty dicts mapping to
      lists of hashable years.

  Returns:
    Tuple ``(precision, recall, f1)`` of floats. A ratio whose denominator is
    zero (no records, no generated years, no expected years) is reported as
    ``0.0`` instead of raising.
  """
  total_correct_generated = 0
  total_generated = 0
  total_ground_truth = 0

  for entry in data:
    generated_years = [
        year for d in entry["generated_answer"] for year in list(d.values())[0]
    ]
    ground_truth_years = [
        year for d in entry["ground_truth"] for year in list(d.values())[0]
    ]

    # Multiset intersection: each year matches min(count_generated,
    # count_ground_truth) times, which is what pairwise removal would give.
    overlap = Counter(generated_years) & Counter(ground_truth_years)

    total_correct_generated += sum(overlap.values())
    total_generated += len(generated_years)
    total_ground_truth += len(ground_truth_years)

  # Guard every division so that empty timelines give zero scores, not a crash.
  precision = total_correct_generated / total_generated if total_generated else 0.0
  recall = total_correct_generated / total_ground_truth if total_ground_truth else 0.0
  f1 = 0.0
  if precision + recall > 0.0:
    f1 = 2 * precision * recall / (precision + recall)

  return precision, recall, f1

def timeline_EM(data):
  """Fraction of records whose generated timelines equal the ground truth.

  Entity names are ignored: for each record the year list of every dict
  (first value only) is collected, each list is sorted, and the two
  collections of lists are compared after sorting them as well, so neither
  the order of years nor the order of entities matters.

  Args:
    data: iterable of records whose ``"generated_answer"`` and
      ``"ground_truth"`` values are lists of non-empty dicts mapping to
      lists of mutually comparable years.

  Returns:
    The exact-match ratio as a float; ``0.0`` when ``data`` is empty.
  """

  def normalise(timelines):
    # Sort inside each timeline first, then sort the timelines themselves.
    return sorted(sorted(years) for years in timelines)

  exact_match = 0
  total = 0

  for entry in data:
    generated_years = [list(d.values())[0] for d in entry["generated_answer"]]
    ground_truth_years = [list(d.values())[0] for d in entry["ground_truth"]]

    if normalise(generated_years) == normalise(ground_truth_years):
      exact_match += 1
    total += 1

  # An empty evaluation set has no matches; avoid dividing by zero.
  return exact_match / total if total else 0.0

In [113]:
# change list of dictionaries into one dictionary for comparisons
def combine_generated_answers(data):
    """Return a copy of ``data`` with repeated entities in the generated answer merged.

    ``data["generated_answer"]`` is a list of dicts. Entries sharing a key are
    collapsed into one: their value lists are concatenated, de-duplicated and
    sorted. A key that occurs only once keeps its list exactly as it was. The
    input record is deep-copied and never modified.
    """
    result = copy.deepcopy(data)

    merged = {}
    for answer in result["generated_answer"]:
        for entity, years in answer.items():
            if entity not in merged:
                # First sighting: keep the list untouched.
                merged[entity] = years
                continue
            # Repeat sighting: union with what is already stored, sorted.
            merged[entity] = sorted(set(merged[entity] + years))

    # Back to the list-of-single-key-dicts layout used elsewhere.
    result["generated_answer"] = [{entity: years} for entity, years in merged.items()]
    return result

In [114]:
# Combined evaluation

def combined_precision_recall_f1_EM(data):
  """Score generated (entity, timeline) pairs against the ground truth.

  For each record, the dicts of each side are folded into a single dict
  (later dicts overwrite earlier ones on a key clash). A generated entity is
  correct only when the ground truth has the same entity with an equal year
  list. Counts are pooled over all records before precision and recall are
  computed.

  Args:
    data: iterable of records whose ``"generated_answer"`` and
      ``"ground_truth"`` values are lists of dicts.

  Returns:
    Tuple ``(precision, recall, f1, EM)`` of floats. ``EM`` is the fraction of
    records whose two folded dicts are equal. Any ratio whose denominator is
    zero is reported as ``0.0`` instead of raising.
  """
  total_correct_generated = 0
  total_generated = 0
  total_ground_truth = 0
  exact_match = 0
  total = 0

  for entry in data:
    # Flatten the list of dicts on each side into one lookup table.
    generated_answer_dict = {
        k: v for d in entry['generated_answer'] for k, v in d.items()
    }
    ground_truth_dict = {
        k: v for d in entry['ground_truth'] for k, v in d.items()
    }

    # .get() yields None for unknown entities, which never equals a year list.
    total_correct_generated += sum(
        1 for k, v in generated_answer_dict.items() if ground_truth_dict.get(k) == v
    )
    total_generated += len(generated_answer_dict)
    total_ground_truth += len(ground_truth_dict)

    if generated_answer_dict == ground_truth_dict:
      exact_match += 1
    total += 1

  # Guard every division so empty inputs give zero scores, not a crash.
  precision = total_correct_generated / total_generated if total_generated else 0.0
  recall = total_correct_generated / total_ground_truth if total_ground_truth else 0.0
  f1 = 0.0
  if precision + recall > 0.0:
    f1 = 2 * precision * recall / (precision + recall)
  EM = exact_match / total if total else 0.0

  return precision, recall, f1, EM

In [115]:
# Parse the raw answer strings of every loaded record with preprocessing();
# the loaded `output` itself is left as-is because preprocessing() works on a deep copy.
preprocessed_data = preprocessing(output)
#print(json.dumps(preprocessed_data, indent=4))

In [116]:
# print all fully correct answers

#for entry in preprocessed_data:
#  if entry['generated_answer'] == entry['ground_truth']:
#    print(entry)

In [117]:
# Entity-level and timeline-level scores are computed on the parsed records as they are.
entity_results = entity_precision_recall_f1_EM(preprocessed_data)
timeline_results_pre_rec_f1 = timeline_precision_recall_f1(preprocessed_data)
timeline_results_EM = timeline_EM(preprocessed_data)

# For the combined score, repeated entities inside each record's generated
# answer are merged first, so every entity is compared once.
new_preprocessed_data = [combine_generated_answers(entry) for entry in preprocessed_data]
combined_results = combined_precision_recall_f1_EM(new_preprocessed_data)

In [118]:
def _format_section(title, precision, recall, f1, em):
    """Render one report section: a banner-framed title followed by four metric lines."""
    banner = "########################\n"
    return (
        f"{banner}{title}\n{banner}"
        f"Precision:\t\t {precision}\n"
        f"Recall (completeness):\t {recall}\n"
        f"f1:\t\t\t {f1}\n"
        f"EM:\t\t\t {em}\n\n"
    )


# The timeline scores come from two separate calls (precision/recall/f1 and EM),
# so they are stitched together here; the other two results are 4-tuples.
result_string = "".join([
    _format_section("Entity evaluation", *entity_results),
    _format_section("Timeline evaluation", *timeline_results_pre_rec_f1, timeline_results_EM),
    _format_section("Combined evaluation", *combined_results),
])

print(result_string)

# The context manager closes (and flushes) the report file even if writing fails.
with open(f"{OUTPUT_FILE_NAME}.txt", "w", encoding="utf-8") as f:
    f.write(result_string)

print(f'Results saved to {OUTPUT_FILE_NAME}.txt')

########################
Entity evaluation
########################
Precision:		 0.05070842654735272
Recall (completeness):	 0.021518987341772152
f1:			 0.03021550766496334
EM:			 0.0009337068160597573

########################
Timeline evaluation
########################
Precision:		 0.7835351089588378
Recall (completeness):	 0.8061783756851021
f1:			 0.7946954813359528
EM:			 0.022408963585434174

########################
Combined evaluation
########################
Precision:		 0.005965697240865026
Recall (completeness):	 0.002531645569620253
f1:			 0.0035547656076427457
EM:			 0.0


Results saved to test3_benchmarks.txt
