In [10]:
import json
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support

In [219]:
# File stem of the annotation run under evaluation.
main_file = "ZeroShot-Guideline"

# Corpus the documents come from: "men" (MEN-Dataset) or "docred" (DocRED-Dataset).
dataset = "men"

# Relation label inventory: "ace05" (ACE05 labels) or "docred" (DocRED labels).
label_dataset = "ace05"

# Folder name derived from the dataset / label-set combination chosen above.
folder = f"notebook-{dataset}-rel-{label_dataset}"

In [220]:
# Path of the JSON file holding the ChatGPT relation annotations for the
# selected folder / run.
OUTPUT_CHATGPT = f"../{folder}/rel_annotation_by_chatgpt/{main_file}.json"

# Read with an explicit UTF-8 encoding: without it open() falls back to the
# platform locale, which can mis-decode or reject non-ASCII text in the file.
with open(OUTPUT_CHATGPT, "r", encoding="utf-8") as f:
    data = json.load(f)

In [221]:
# Guideline CSV for each supported label set; the "Relation Label" column of
# the selected file becomes the list of labels accepted during evaluation.
label_csv_by_dataset = {
    "ace05": "./guideline/Relation Labels - ACE05.csv",
    "docred": "./guideline/Relation Labels - DocRED.csv",
}

# Fail loudly on an unknown label set instead of leaving `list_labels`
# undefined (or silently reusing a value left over in the kernel).
if label_dataset not in label_csv_by_dataset:
    raise ValueError(
        f"Unknown label_dataset {label_dataset!r}; "
        f"expected one of {sorted(label_csv_by_dataset)}"
    )

list_labels = pd.read_csv(label_csv_by_dataset[label_dataset])["Relation Label"].to_list()

In [222]:
def _entity_pair_scores(predicted_rels, gold_rels):
    """Score one document's predicted relations against its gold relations.

    Every relation is reduced to the sorted tuple of its ``entity_pair``
    values, and the two collections are compared as sets. Matching therefore
    ignores the order of the two entities and the relation label itself, and
    repeated pairs count once.

    Args:
        predicted_rels: iterable of relation dicts, each with an
            ``entity_pair`` mapping.
        gold_rels: iterable of gold relation dicts with the same layout.

    Returns:
        dict with float entries "Precision", "Recall" and "F1-Score"; a value
        is 0.0 whenever its denominator would be zero.
    """
    predicted_pairs = {tuple(sorted(rel["entity_pair"].values())) for rel in predicted_rels}
    gold_pairs = {tuple(sorted(rel["entity_pair"].values())) for rel in gold_rels}

    # Confusion counts over unique entity pairs.
    TP = len(predicted_pairs & gold_pairs)
    FP = len(predicted_pairs - gold_pairs)
    FN = len(gold_pairs - predicted_pairs)

    precision = TP / (TP + FP) if (TP + FP) > 0 else 0.0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0.0

    if precision + recall == 0:
        f1_score = 0.0
    else:
        f1_score = 2 * (precision * recall) / (precision + recall)

    return {
        "Precision": precision,
        "Recall": recall,
        "F1-Score": f1_score,
    }


# One {"Precision", "Recall", "F1-Score"} dict per document in `data`.
precision_recall_f1 = []

for doc in data:
    # A failed extraction is stored as a sentinel string; replace it in `data`
    # with an empty prediction list so it is scored like any other document.
    if doc["chatgpt"] == "Failed to extract relation from JSON":
        doc["chatgpt"] = []

    # Only relations whose label belongs to the selected label set are scored.
    gold = [rel for rel in doc["gold"] if rel["relation"] in list_labels]
    chatgpt = [rel for rel in doc["chatgpt"] if rel["relation"] in list_labels]

    precision_recall_f1.append(_entity_pair_scores(chatgpt, gold))

In [223]:
# Average the per-document scores collected in `precision_recall_f1`.
# NOTE: although the variable name and printed labels say "Weighted", every
# document contributes equally here, i.e. this is an unweighted per-document
# mean. The labels are kept unchanged so the printed report stays the same.
total_entries = len(precision_recall_f1)
if total_entries == 0:
    # Without this guard the divisions below fail with a bare ZeroDivisionError.
    raise ValueError("precision_recall_f1 is empty; no documents were scored, nothing to average")

# Running sums of each metric across all documents.
total_precision = 0.0
total_recall = 0.0
total_f1 = 0.0
for entry in precision_recall_f1:
    total_precision += entry['Precision']
    total_recall += entry['Recall']
    total_f1 += entry['F1-Score']

# Per-document mean F1 (name kept for compatibility).
weighted_average_f1 = total_f1 / total_entries

print("Weighted Average Precision:", round(total_precision / total_entries,3))
print("Weighted Average Recall:", round(total_recall / total_entries,3))
print("Weighted Average F1-Score:", round(weighted_average_f1,3))

Weighted Average Precision: 0.264
Weighted Average Recall: 0.665
Weighted Average F1-Score: 0.356
