Loads the libraries necessary for the script.

In [None]:
import json

import numpy as np
import pandas as pd

from pathlib import Path

from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score

Loads the dataset and adds to it the collected outputs of every model for each prompt engineering technique.

In [None]:
# Load the annotated dataset: a JSON sequence of messages. Each message is
# expected to carry a "tools" mapping that the loop below extends.
with open("data/dataset.json") as file:
    dataset = json.load(file)

MODELS = ["gpt-4o-2024-05-13", "gpt-4o-mini-2024-07-18", "mistral-nemo:12b", "gemma2:9b", "llama3.1:8b", "mistral-small:22b", "gemma2:27b", "llama3.1:70b"]

# Attach the prediction of every (prompt technique, model) run to each message
# under message["tools"][run_name]. Runs without an output file are skipped.
for number in [0, 1, 3, "cot"]:
    for model in MODELS:
        run_name = f"{model}_{number}shot"
        output_path = Path(f"output/{run_name}.json")
        if not output_path.exists():
            continue

        # The output file does not depend on the message, so parse it once per
        # run instead of once per dataset entry.
        with open(output_path) as file:
            data = json.load(file)

        for index, message in enumerate(dataset):
            # A missing entry, a malformed entry or an entry without a
            # "sentiment" field is all recorded as "invalid".
            try:
                entry = data[index]
                sentiment = entry["sentiment"] if "sentiment" in entry else "invalid"
            except (IndexError, KeyError, TypeError):
                sentiment = "invalid"
            message["tools"][run_name] = sentiment

Creates the data structures required by sklearn to generate a confusion matrix.

In [None]:
# Ground-truth polarity per message: the part-2 aggregate polarity, falling
# back to the discussion polarity when the aggregate is "undefined".
expected = [message["part2_aggregate"]["polarity"] if message["part2_aggregate"]["polarity"] != "undefined" else message["discussion_polarity"] for message in dataset]

# Predictions per tool, aligned index-by-index with `expected`.
actual = {}

# The tool names are taken from the first message; every other message is
# assumed to carry the same keys (a missing key raises KeyError).
for tool in dataset[0]["tools"].keys():
    actual[tool] = [x["tools"][tool] for x in dataset]

    if tool == "SentiCR":
        # Keep "negative" and count every other SentiCR output as "neutral".
        # The previous condition was the bare string "negative", which is
        # always truthy, so the mapping never changed anything.
        # NOTE(review): presumably SentiCR only distinguishes negative from
        # non-negative - confirm against the stored values.
        actual[tool] = [x if x == "negative" else "neutral" for x in actual[tool]]

Generates and prints the confusion matrix for each model and prompt engineering technique.

In [None]:
# Class order used for the rows and columns of every matrix.
labels = ["positive", "negative", "neutral"]

# One matrix per tool. `expected` is passed as the true labels, so rows are the
# expected polarity and columns the predicted one.
for tool, predictions in actual.items():
    cm = confusion_matrix(expected, predictions, labels=labels)
    cm_df = pd.DataFrame(cm, index=labels, columns=labels)

    # Heading, matrix and a trailing blank line, each on its own line.
    print(f"Confusion Matrix for {tool}:", cm_df, "", sep="\n")

Prints the precision, recall, and f1-score for each of the models and prompt engineering techniques, utilizing sklearn.

In [None]:
for tool, predictions in actual.items():
    # Keyword arguments shared by every scorer call for this tool.
    shared = {"labels": labels, "zero_division": np.nan}

    # Per-class scores, ordered like `labels`.
    precision = precision_score(expected, predictions, average=None, **shared)
    recall = recall_score(expected, predictions, average=None, **shared)
    f1_scores = f1_score(expected, predictions, average=None, **shared)

    # Tabulate the per-class scores, one row per label.
    metrics_df = pd.DataFrame(
        {"Precision": precision, "Recall": recall, "F1 Score": f1_scores},
        index=labels,
    ).round(2)

    print(f"\nMetrics per Category for {tool}:")
    print(metrics_df)

    # Aggregate scores over the three labels.
    macro_f1 = f1_score(expected, predictions, average="macro", **shared)
    micro_f1 = f1_score(expected, predictions, average="micro", **shared)
    macro_precision = precision_score(expected, predictions, average="macro", **shared)
    micro_precision = precision_score(expected, predictions, average="micro", **shared)
    macro_recall = recall_score(expected, predictions, average="macro", **shared)
    micro_recall = recall_score(expected, predictions, average="micro", **shared)

    # Predictions that are one of the known labels versus everything else
    # (for example "invalid").
    included_count = sum(item in labels for item in predictions)
    discarded_count = len(predictions) - included_count

    summary = (
        ("Macro Precision:", macro_precision),
        ("Micro Precision:", micro_precision),
        ("Macro Recall:   ", macro_recall),
        ("Micro Recall:   ", micro_recall),
        ("Macro F1 Score: ", macro_f1),
        ("Micro F1 Score: ", micro_f1),
    )
    for caption, value in summary:
        print(caption, round(value, 2))
    print("N:", included_count)