In [127]:
import json

import numpy as np
import pandas as pd

from pathlib import Path

from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score

In [128]:
# Load the annotated dataset; every entry is expected to carry a "tools" dict
# that the loop below extends with one prediction per model run.
with open("data/dataset.json") as file:
    dataset = json.load(file)

MODELS = ["gpt-4o-2024-05-13", "gpt-4o-mini-2024-07-18", "mistral-nemo:12b", "gemma2:9b", "llama3.1:8b", "mistral-small:22b", "gemma2:27b", "llama3.1:70b"]

# Merge the predictions of every (model, prompting setup) run into the dataset.
# Runs without an output file are skipped entirely.
for number in [0, 1, 3, "cot"]:
    for model in MODELS:
        run_name = f"{model}_{number}shot"
        output_path = Path(f"output/{run_name}.json")
        if not output_path.exists():
            continue

        # Parse the run's output once per run instead of once per message.
        with open(output_path) as file:
            data = json.load(file)

        for index, message in enumerate(dataset):
            # A missing entry, an entry without a "sentiment" field, or an
            # entry of an unexpected type is recorded as "invalid".
            try:
                entry = data[index]
                sentiment = entry["sentiment"] if "sentiment" in entry else "invalid"
            except (IndexError, KeyError, TypeError):
                # Only lookup failures are tolerated; anything else
                # (e.g. KeyboardInterrupt) must propagate.
                sentiment = "invalid"
            message["tools"][run_name] = sentiment

In [129]:
# Ground-truth label per message: the "part2_aggregate" polarity, falling back
# to "discussion_polarity" when the aggregate is "undefined".
expected = [
    message["part2_aggregate"]["polarity"]
    if message["part2_aggregate"]["polarity"] != "undefined"
    else message["discussion_polarity"]
    for message in dataset
]

# Predictions per tool, aligned index-by-index with `expected`.
# The set of tools is taken from the first message.
actual = {}

for tool in dataset[0]["tools"].keys():
    actual[tool] = [x["tools"][tool] for x in dataset]

    if tool == "SentiCR":
        # Map every SentiCR label other than "negative" to "neutral"
        # (presumably because SentiCR only separates negative from
        # non-negative -- TODO confirm). The condition must compare `x`;
        # testing the bare string "negative" is always truthy and leaves
        # the labels unchanged.
        actual[tool] = [x if x == "negative" else "neutral" for x in actual[tool]]

In [130]:
# Label order used for the rows/columns of every matrix.
labels = ["positive", "negative", "neutral"]

# One confusion matrix per tool: rows are the expected labels, columns the
# predicted ones (`expected` is passed as y_true, the tool output as y_pred).
for tool, predictions in actual.items():
    cm = confusion_matrix(expected, predictions, labels=labels)
    cm_df = pd.DataFrame(data=cm, columns=labels, index=labels)

    # Header, matrix, then a blank separator line.
    print(f"Confusion Matrix for {tool}:", cm_df, "", sep="\n")

Confusion Matrix for SentiStrength:
          positive  negative  neutral
positive       411        80       30
negative        87       301       44
neutral        245       287      306

Confusion Matrix for SentiStrengthSE:
          positive  negative  neutral
positive       328       143       50
negative        16       345       71
neutral         70       304      464

Confusion Matrix for SentiCR:
          positive  negative  neutral
positive         0        62      459
negative         0       172      260
neutral          0       168      670

Confusion Matrix for DEVA:
          positive  negative  neutral
positive       377       103       41
negative        49       304       79
neutral        141       221      476

Confusion Matrix for Senti4SD:
          positive  negative  neutral
positive       371        34      116
negative        86       184      162
neutral        200       103      535

Confusion Matrix for gpt-4o-2024-05-13_0shot:
          positive  negativ

In [131]:
for tool, predicted in actual.items():
    # Arguments shared by every scorer call: restrict scoring to `labels`
    # and report NaN where a score would require dividing by zero.
    shared = {"labels": labels, "zero_division": np.nan}

    # Per-class precision, recall and F1 (one value per entry of `labels`).
    precision = precision_score(expected, predicted, average=None, **shared)
    recall = recall_score(expected, predicted, average=None, **shared)
    f1_scores = f1_score(expected, predicted, average=None, **shared)

    # Tabulate the per-class scores, rounded to two decimals.
    metrics_df = pd.DataFrame(
        {"Precision": precision, "Recall": recall, "F1 Score": f1_scores},
        index=labels,
    ).round(2)

    print(f"\nMetrics per Category for {tool}:")
    print(metrics_df)

    # Macro- and micro-averaged variants of the same three scores.
    macro_f1 = f1_score(expected, predicted, average="macro", **shared)
    micro_f1 = f1_score(expected, predicted, average="micro", **shared)
    macro_precision = precision_score(expected, predicted, average="macro", **shared)
    micro_precision = precision_score(expected, predicted, average="micro", **shared)
    macro_recall = recall_score(expected, predicted, average="macro", **shared)
    micro_recall = recall_score(expected, predicted, average="micro", **shared)

    # Number of predictions that are one of the known labels, and the rest.
    included_count = len([item for item in predicted if item in labels])
    discarded_count = len(predicted) - included_count

    print("Macro Precision:", round(macro_precision, 2))
    print("Micro Precision:", round(micro_precision, 2))
    print("Macro Recall:   ", round(macro_recall, 2))
    print("Micro Recall:   ", round(micro_recall, 2))
    print("Macro F1 Score: ", round(macro_f1, 2))
    print("Micro F1 Score: ", round(micro_f1, 2))
    print("N:", included_count)


Metrics per Category for SentiStrength:
          Precision  Recall  F1 Score
positive       0.55    0.79      0.65
negative       0.45    0.70      0.55
neutral        0.81    0.37      0.50
Macro Precision: 0.6
Micro Precision: 0.57
Macro Recall:    0.62
Micro Recall:    0.57
Macro F1 Score:  0.57
Micro F1 Score:  0.57
N: 1791

Metrics per Category for SentiStrengthSE:
          Precision  Recall  F1 Score
positive       0.79    0.63      0.70
negative       0.44    0.80      0.56
neutral        0.79    0.55      0.65
Macro Precision: 0.67
Micro Precision: 0.63
Macro Recall:    0.66
Micro Recall:    0.63
Macro F1 Score:  0.64
Micro F1 Score:  0.63
N: 1791

Metrics per Category for SentiCR:
          Precision  Recall  F1 Score
positive        NaN     0.0      0.00
negative       0.43     0.4      0.41
neutral        0.48     0.8      0.60
Macro Precision: 0.46
Micro Precision: 0.47
Macro Recall:    0.4
Micro Recall:    0.47
Macro F1 Score:  0.34
Micro F1 Score:  0.47
N: 1791

Metric