Loads the libraries necessary for the script.

In [None]:
import json

import numpy as np
import pandas as pd

from pathlib import Path

from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score

Loads the dataset and attaches to each message the predictions of every model and prompt engineering technique for which output was collected.

In [None]:
# Load the annotated dataset, then attach every available run's prediction to
# each message under message["tools"][<run name>].
with open("data/dataset.json", encoding="utf-8") as file:
    dataset = json.load(file)

MODELS = ["gpt-4o-2024-05-13", "gpt-4o-mini-2024-07-18", "mistral-nemo-12b", "gemma2-9b", "llama3.1-8b", "mistral-small-22b", "gemma2-27b", "llama3.1-70b", "deepseek-r1-8b", "deepseek-r1-32b"]

# Run names follow "<model>_<n>shot"; "cot" yields "<model>_cotshot".
for number in [0, 1, 3, "cot"]:
    for model in MODELS:
        run_name = f"{model}_{number}shot"
        run_path = Path(f"output_premo/{run_name}.json")
        # Runs without an output file are skipped rather than treated as errors.
        if not run_path.exists():
            continue
        with open(run_path, encoding="utf-8") as file:
            data = json.load(file)
        for index, message in enumerate(dataset):
            # A missing entry, an entry without a "sentiment" field, or an entry
            # that is not a mapping is recorded as "invalid". Only those lookup
            # failures are caught, so unrelated errors (including Ctrl-C) still
            # surface instead of being silently converted to "invalid".
            try:
                sentiment = data[index]["sentiment"]
            except (LookupError, TypeError):
                sentiment = "invalid"
            message["tools"][run_name] = sentiment

Creates the data structures required by sklearn to generate a confusion matrix.

In [None]:
# Gold label per message: the part-2 aggregate polarity, falling back to the
# discussion polarity when the aggregate is "undefined".
expected = [
    message["discussion_polarity"]
    if message["part2_aggregate"]["polarity"] == "undefined"
    else message["part2_aggregate"]["polarity"]
    for message in dataset
]

# Predicted labels per tool/run, aligned index-by-index with `expected`.
actual = {}

# The first message's "tools" keys define which tools/runs are evaluated.
tool_names = list(dataset[0]["tools"]) if dataset else []

for tool in tool_names:
    predictions = [message["tools"][tool] for message in dataset]

    if tool == "SentiCR":
        # Collapse every non-"negative" SentiCR output to "neutral".
        # The previous condition tested the literal string "negative", which is
        # always truthy, so the remapping never happened.
        # NOTE(review): assumes SentiCR only distinguishes negative from
        # non-negative - confirm against the tool's output format.
        predictions = [
            label if label == "negative" else "neutral" for label in predictions
        ]

    actual[tool] = predictions

Generates and prints the confusion matrix for each model and prompt engineering technique.

In [None]:
# Class order shared by every confusion matrix and metric table below.
labels = ["positive", "negative", "neutral"]

# One labelled 3x3 matrix per tool/run. Following sklearn's convention, rows are
# the expected labels and columns the predicted ones; predictions outside
# `labels` are not counted.
for tool, predictions in actual.items():
    cm = confusion_matrix(expected, predictions, labels=labels)
    cm_df = pd.DataFrame(data=cm, columns=labels, index=labels)

    # Title, matrix, then a blank separator line.
    print(f"Confusion Matrix for {tool}:", cm_df, "", sep="\n")

Prints the precision, recall, and f1-score for each of the models and prompt engineering techniques, utilizing sklearn.

In [None]:
# For every tool/run: print a per-class precision/recall/F1 table followed by
# the macro- and micro-averaged scores and the number of usable predictions.
for tool, predictions in actual.items():
    # Arguments common to every scorer call; undefined scores become NaN.
    shared = dict(labels=labels, zero_division=np.nan)

    # Per-class scores, one row per entry of `labels`, rounded for display.
    metrics_df = pd.DataFrame(
        {
            'Precision': precision_score(expected, predictions, average=None, **shared),
            'Recall': recall_score(expected, predictions, average=None, **shared),
            'F1 Score': f1_score(expected, predictions, average=None, **shared),
        },
        index=labels,
    ).round(2)

    print(f"\nMetrics per Category for {tool}:")
    print(metrics_df)

    # Averaged scores, printed in this fixed order with aligned captions.
    summary_lines = (
        ("Macro Precision:", precision_score, 'macro'),
        ("Micro Precision:", precision_score, 'micro'),
        ("Macro Recall:   ", recall_score, 'macro'),
        ("Micro Recall:   ", recall_score, 'micro'),
        ("Macro F1 Score: ", f1_score, 'macro'),
        ("Micro F1 Score: ", f1_score, 'micro'),
    )
    for caption, scorer, averaging in summary_lines:
        print(caption, round(scorer(expected, predictions, average=averaging, **shared), 2))

    # Predictions that are one of the three known labels; everything else
    # (e.g. "invalid") counts as discarded. Only the included count is printed.
    included_count = len([item for item in predictions if item in labels])
    discarded_count = len(predictions) - included_count

    print("N:", included_count)

Generate LaTeX Table (for RQ1)

In [None]:
# RQ1 table: zero-shot (baseline) precision/recall/F1 for every model, rendered
# as a LaTeX table with one five-row group per model (the three sentiment
# classes plus the macro and micro averages). The best score of every
# (row kind, metric) cell across models is set in bold.

# Run names are built as "<model>_<n>shot", so baseline runs are exactly the
# keys ending in "_0shot" (a suffix match cannot pick up unrelated keys).
BASELINE_SUFFIX = '_0shot'
baseline_tools = {k: v for k, v in actual.items() if k.endswith(BASELINE_SUFFIX)}

# Model name -> run name, and the alphabetical order used for the table.
model_data = {tool[:-len(BASELINE_SUFFIX)]: tool for tool in baseline_tools}
sorted_models = sorted(model_data)

SCORERS = (('precision', precision_score), ('recall', recall_score), ('f1', f1_score))

# Per model: per-class arrays under 'precision'/'recall'/'f1', the averaged
# scores under '<macro|micro>_<metric>', and 'n' = number of predictions that
# are one of `labels`. The RQ2 cell reads this dict as its baseline.
all_metrics = {}
for model in sorted_models:
    predictions = actual[model_data[model]]
    model_metrics = {}
    for metric_name, scorer in SCORERS:
        model_metrics[metric_name] = scorer(
            expected, predictions, labels=labels, average=None, zero_division=np.nan
        )
        for average in ('macro', 'micro'):
            model_metrics[f'{average}_{metric_name}'] = scorer(
                expected, predictions, labels=labels, average=average, zero_division=np.nan
            )
    model_metrics['n'] = sum(1 for item in predictions if item in labels)
    all_metrics[model] = model_metrics

# Row layout of one model group; the class rows follow the order of `labels`.
CLASS_ROWS = ("Positive", "Negative", "Neutral")
AVERAGE_ROWS = (("Macro Avg.", 'macro'), ("Micro Avg.", 'micro'))
ROWS_PER_MODEL = len(CLASS_ROWS) + len(AVERAGE_ROWS)

# model -> list of (row label, (precision, recall, f1)) in display order.
rows_by_model = {}
for model in sorted_models:
    metrics = all_metrics[model]
    rows = [
        (row_label, (metrics['precision'][i], metrics['recall'][i], metrics['f1'][i]))
        for i, row_label in enumerate(CLASS_ROWS)
    ]
    rows += [
        (row_label, (metrics[f'{average}_precision'],
                     metrics[f'{average}_recall'],
                     metrics[f'{average}_f1']))
        for row_label, average in AVERAGE_ROWS
    ]
    rows_by_model[model] = rows

# Best score per (row index, metric column) across models. NaN scores are left
# out: the built-in max() gives order-dependent results when NaN is present.
best_values = {}
for row_index in range(ROWS_PER_MODEL):
    for column in range(len(SCORERS)):
        finite = [
            rows[row_index][1][column]
            for rows in rows_by_model.values()
            if not np.isnan(rows[row_index][1][column])
        ]
        best_values[(row_index, column)] = max(finite) if finite else np.nan


def fmt(value, best):
    """Format a score in [0, 1] as a LaTeX percentage, bold if it equals `best`.

    The percentage is rounded to the nearest integer; truncating with int()
    would turn e.g. 0.58 (57.999... in floating point) into 57%. An undefined
    score (NaN, produced by zero_division=np.nan) is rendered as "--".
    """
    if np.isnan(value):
        return "--"
    val_str = f"{round(float(value) * 100)}\\%"
    if value == best:
        return f"\\textbf{{{val_str}}}"
    return val_str


table_data = []
for model in sorted_models:
    # Drop the release-date suffix of the OpenAI model names for display.
    model_display = model.replace('-2024-05-13', '').replace('-2024-07-18', '')
    group_label = f"{model_display} (n={all_metrics[model]['n']})"
    for row_index, (row_label, scores) in enumerate(rows_by_model[model]):
        table_data.append(
            [group_label, row_label]
            + [fmt(score, best_values[(row_index, column)])
               for column, score in enumerate(scores)]
        )

df = pd.DataFrame(table_data, columns=['Model', 'Class', 'Precision', 'Recall', 'F1-score'])

latex_table = df.to_latex(
    index=False,
    escape=False,
    column_format='|l|l|r|r|r|',
    caption="Zero-shot performance for all models tested, for the premo dataset",
    label="tab:results"
)

# Post-process the pandas output: bold the header, merge the repeated model
# cell of each group into one \multirow, separate rows inside a group with
# \cline{2-5} and groups with \hline, and replace the booktabs rules by \hline.
RULE_MARKERS = ('toprule', 'midrule', 'bottomrule')
HEADER_CELLS = ('Model', 'Class', 'Precision', 'Recall', 'F1-score')

processed_lines = []
current_model = None
row_count = 0

for line in latex_table.split('\n'):
    if 'Model &' in line and 'Class &' in line:
        for header in HEADER_CELLS:
            line = line.replace(header, f'\\textbf{{{header}}}')
        processed_lines.append(line)
        continue

    parts = line.split('&')
    is_data_row = len(parts) >= 5 and not any(marker in line for marker in RULE_MARKERS)
    model_name = parts[0].strip()

    if is_data_row and model_name:
        if model_name != current_model:
            # A new model group starts; close the previous one, if any.
            if current_model is not None:
                processed_lines.append('\\hline')
            current_model = model_name
            row_count = 1
            parts[0] = f" \\multirow{{{ROWS_PER_MODEL}}}{{*}}{{{model_name}}}"
        else:
            row_count += 1
            parts[0] = " "

        line = ' & '.join(parts)

        # Partial rule under every row of the group except the last one.
        if row_count < ROWS_PER_MODEL and line.strip().endswith('\\\\'):
            line = line.rstrip() + ' \\cline{2-5}'

    processed_lines.append(line)

latex_table = '\n'.join(processed_lines)

latex_table = latex_table.replace('\\toprule', '\\hline')
latex_table = latex_table.replace('\\midrule', '\\hline')
latex_table = latex_table.replace('\\bottomrule', '\\hline')

print(latex_table)

Generate LaTeX Tables for Prompt Engineering Techniques (for RQ2)

In [None]:
# RQ2 tables: one LaTeX table per prompt engineering technique. Each cell shows
# the score under that technique and, in parentheses, its change in percentage
# points against the same model's zero-shot baseline held in `all_metrics`.

SCORERS = (('precision', precision_score), ('recall', recall_score), ('f1', f1_score))

# Row layout of one model group; the class rows follow the order of `labels`.
CLASS_ROWS = ("Positive", "Negative", "Neutral")
AVERAGE_ROWS = (("Macro Avg.", 'macro'), ("Micro Avg.", 'micro'))
ROWS_PER_MODEL = len(CLASS_ROWS) + len(AVERAGE_ROWS)

RULE_MARKERS = ('toprule', 'midrule', 'bottomrule')
HEADER_CELLS = ('Model', 'Class', 'Precision', 'Recall', 'F1-score')


def _score_rows(metrics):
    """Return [(row label, (precision, recall, f1)), ...] in display order."""
    rows = [
        (row_label, (metrics['precision'][i], metrics['recall'][i], metrics['f1'][i]))
        for i, row_label in enumerate(CLASS_ROWS)
    ]
    rows += [
        (row_label, (metrics[f'{average}_precision'],
                     metrics[f'{average}_recall'],
                     metrics[f'{average}_f1']))
        for row_label, average in AVERAGE_ROWS
    ]
    return rows


def fmt_with_diff(value, baseline_value, best):
    """Format a score as a LaTeX percentage followed by its change vs. baseline.

    The score is rounded to the nearest integer percent (truncating with int()
    would turn e.g. 0.58 into 57%) and set in bold when it equals `best`. The
    change is the rounded difference in percentage points, with an explicit "+"
    for improvements. Undefined values (NaN, produced by zero_division=np.nan)
    are rendered as "--" instead of raising.
    """
    if np.isnan(value):
        return "--"
    val_str = f"{round(float(value) * 100)}\\%"
    if value == best:
        val_str = f"\\textbf{{{val_str}}}"
    if np.isnan(baseline_value):
        return val_str + " (--)"
    rounded_diff = round(float(value - baseline_value) * 100)
    sign = "+" if rounded_diff > 0 else ""
    return val_str + f" ({sign}{rounded_diff}\\%)"


for shot_type in ['1shot', '3shot', 'cotshot']:
    # Run names are built as "<model>_<shot_type>", so match on the suffix.
    suffix = f'_{shot_type}'
    shot_model_data = {
        tool[:-len(suffix)]: tool for tool in actual if tool.endswith(suffix)
    }

    # Only models that also have a zero-shot baseline can be shown. Restricting
    # to them up front keeps the bolded "best" values within the displayed rows.
    sorted_shot_models = sorted(m for m in shot_model_data if m in all_metrics)

    # Output files may be missing for a technique; skip instead of crashing.
    if not sorted_shot_models:
        print(f"\nNo {shot_type} results with a zero-shot baseline found; skipping table.")
        continue

    # Same structure as `all_metrics`, for this technique.
    shot_metrics = {}
    for model in sorted_shot_models:
        predictions = actual[shot_model_data[model]]
        model_metrics = {}
        for metric_name, scorer in SCORERS:
            model_metrics[metric_name] = scorer(
                expected, predictions, labels=labels, average=None, zero_division=np.nan
            )
            for average in ('macro', 'micro'):
                model_metrics[f'{average}_{metric_name}'] = scorer(
                    expected, predictions, labels=labels, average=average, zero_division=np.nan
                )
        model_metrics['n'] = sum(1 for item in predictions if item in labels)
        shot_metrics[model] = model_metrics

    shot_rows = {model: _score_rows(shot_metrics[model]) for model in sorted_shot_models}

    # Best score per (row index, metric column) across the displayed models.
    # NaN is excluded: max() is order-dependent when NaN is present.
    best_shot_values = {}
    for row_index in range(ROWS_PER_MODEL):
        for column in range(len(SCORERS)):
            finite = [
                rows[row_index][1][column]
                for rows in shot_rows.values()
                if not np.isnan(rows[row_index][1][column])
            ]
            best_shot_values[(row_index, column)] = max(finite) if finite else np.nan

    table_data = []
    for model in sorted_shot_models:
        baseline_rows = _score_rows(all_metrics[model])
        # Drop the release-date suffix of the OpenAI model names for display.
        model_display = model.replace('-2024-05-13', '').replace('-2024-07-18', '')
        group_label = f"{model_display} (n={shot_metrics[model]['n']})"
        for row_index, (row_label, scores) in enumerate(shot_rows[model]):
            baseline_scores = baseline_rows[row_index][1]
            table_data.append(
                [group_label, row_label]
                + [
                    fmt_with_diff(score, baseline_scores[column],
                                  best_shot_values[(row_index, column)])
                    for column, score in enumerate(scores)
                ]
            )

    df = pd.DataFrame(table_data, columns=['Model', 'Class', 'Precision', 'Recall', 'F1-score'])

    shot_label = shot_type.replace('shot', '-shot') if 'cot' not in shot_type else 'CoT'
    latex_table = df.to_latex(
        index=False,
        escape=False,
        column_format='|l|l|r|r|r|',
        caption=f"Performance with {shot_label} prompting for the PRemo dataset (improvement vs baseline shown in parentheses)",
        label=f"tab:results_premo_{shot_type}"
    )

    # Post-process the pandas output: bold the header, merge the repeated model
    # cell of each group into one \multirow, separate rows inside a group with
    # \cline{2-5} and groups with \hline, and replace booktabs rules by \hline.
    processed_lines = []
    current_model = None
    row_count = 0

    for line in latex_table.split('\n'):
        if 'Model &' in line and 'Class &' in line:
            for header in HEADER_CELLS:
                line = line.replace(header, f'\\textbf{{{header}}}')
            processed_lines.append(line)
            continue

        parts = line.split('&')
        is_data_row = len(parts) >= 5 and not any(marker in line for marker in RULE_MARKERS)
        model_name = parts[0].strip()

        if is_data_row and model_name:
            if model_name != current_model:
                # A new model group starts; close the previous one, if any.
                if current_model is not None:
                    processed_lines.append('\\hline')
                current_model = model_name
                row_count = 1
                parts[0] = f" \\multirow{{{ROWS_PER_MODEL}}}{{*}}{{{model_name}}}"
            else:
                row_count += 1
                parts[0] = " "

            line = ' & '.join(parts)

            # Partial rule under every row of the group except the last one.
            if row_count < ROWS_PER_MODEL and line.strip().endswith('\\\\'):
                line = line.rstrip() + ' \\cline{2-5}'

        processed_lines.append(line)

    latex_table = '\n'.join(processed_lines)

    latex_table = latex_table.replace('\\toprule', '\\hline')
    latex_table = latex_table.replace('\\midrule', '\\hline')
    latex_table = latex_table.replace('\\bottomrule', '\\hline')

    print(f"\n{'='*80}")
    print(f"Table for {shot_type}")
    print('='*80)
    print(latex_table)
    print()