In [36]:
import sys, os

# Project root that every relative path in this notebook is resolved against.
# Set the FACTUE_WD environment variable to run from another checkout (e.g. a
# local clone) without editing this cell; the default is the shared server path.
wd = os.environ.get('FACTUE_WD', '/mnt/openfact/users/msawinski/factue-task2')
# wd = '/Users/marcinsawinski/Documents/GitHub/factue-task2'
os.chdir(wd)

In [37]:
from factue.methods.llm_calls import load_template_list
# Print, one per line and wrapped in double quotes, each key returned by
# load_template_list for the persuasion / detect / v001 prompt set
# (presumably the prompt template names -- confirm against load_template_list).
template_names = load_template_list(job='persuasion', step="detect", prompt_version='v001').keys()
for template_name in template_names:
    print(f'"{template_name}"')

In [38]:
import pandas as pd
from pathlib import Path
import os
# Root folder holding the persuasion LLM outputs (relative to the working dir).
root = Path("data/llm_output/persuasion")
# rglob searches recursively, so this yields every *.parquet file that sits at
# least one directory below `root`.
files = root.rglob("*/*.parquet")

# Load each file and tag its rows with the path they were read from.
df_list = []
for f in files:
    df_part = pd.read_parquet(f).assign(source_file=f)
    df_list.append(df_part)

# Stack all parts into a single frame with a fresh 0..n-1 index.
df = pd.concat(df_list, ignore_index=True)

# The split label is taken from the path component at index 7 of the
# '/'-separated source path.
# NOTE(review): this position depends on the directory depth -- confirm it still
# points at the split folder if the output layout changes.
path_parts = df['source_file'].astype(str).str.split('/', expand=True)
df['split'] = path_parts[7]

In [39]:
# Quick look at the first three rows of the combined frame.
df.head(n=3)

In [44]:
# Frequency of each distinct value in the `pred` column.
df['pred'].value_counts()

In [40]:
# Number of non-null `filename` entries per prompt / language / model among the
# rows whose split is 'train'.
train_rows = df[df['split'] == 'train']
(
    train_rows
    .groupby(["prompt_name", "text_lang", "model_name", "split"])['filename']
    .count()
    .reset_index()
)

In [45]:
def normalize_binary(x):
    """Map a loosely-typed label to the integer 0 or 1.

    Args:
        x: Any value; it is converted with ``str`` before comparison.

    Returns:
        1 when the stripped, lower-cased text is ``'1'`` or ``'true'``, or when
        it parses as a number equal to 1 (e.g. ``1.0`` or ``'1.0'``);
        0 for everything else, including ``None``, NaN and free text.
    """
    text = str(x).strip().lower()
    if text in {'1', 'true'}:
        return 1
    # Float-typed labels (e.g. an integer column that was upcast to float)
    # stringify as '1.0', which the literal check above would miss.
    try:
        return 1 if float(text) == 1.0 else 0
    except ValueError:
        return 0

In [47]:
# Coerce gold labels and model predictions to integer 0/1 so the sklearn
# metrics below compare like with like. Re-running this cell is harmless:
# normalize_binary maps 0 -> 0 and 1 -> 1.
# (`split` is already derived from `source_file` when the files are loaded, so
# it is not recomputed here.)
df['gold'] = df['gold'].apply(normalize_binary)
df['pred'] = df['pred'].apply(normalize_binary)

In [48]:
# Number of rows per prompt name.
df['prompt_name'].value_counts()

In [49]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
# Classification metrics for every (prompt_name, text_lang, model_name) group.
# Assumes df has columns: prompt_name, text_lang, model_name, gold, pred,
# with gold/pred holding binary 0/1 labels.

def is_valid_label(x):
    """Return True when ``x`` equals one of the binary labels 0 or 1."""
    return x in (0, 1)

group_cols = ['prompt_name', 'text_lang', 'model_name']
metric_cols = ['accuracy', 'precision', 'recall', 'f1']

rows = []

# A single groupby pass replaces re-filtering the whole frame for each
# combination. sort=False keeps groups in order of first appearance; rows whose
# key contains a missing value are skipped by groupby's default behaviour.
for (prompt_name, text_lang, model_name), group in df.groupby(group_cols, sort=False):
    if group.empty:
        continue
    rows.append({
        'prompt_name': prompt_name,
        'text_lang': text_lang,
        'model_name': model_name,
        'accuracy': accuracy_score(group['gold'], group['pred']),
        # zero_division=0: report 0 instead of warning when a group has no
        # positive predictions / no positive gold labels.
        'precision': precision_score(group['gold'], group['pred'], zero_division=0),
        'recall': recall_score(group['gold'], group['pred'], zero_division=0),
        'f1': f1_score(group['gold'], group['pred'], zero_division=0),
        'support': len(group)
    })

# Explicit columns keep the frame well-formed (and the rounding below valid)
# even when df produced no groups.
results = pd.DataFrame(rows, columns=group_cols + metric_cols + ['support'])
results[metric_cols] = results[metric_cols].round(2)
results

In [8]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Metric columns of `results` to chart, one subplot per metric.
metrics = ['accuracy', 'precision', 'recall', 'f1']

# squeeze=False keeps `axes` two-dimensional even when only one metric is
# listed, so the column slice below always works.
fig, axes = plt.subplots(len(metrics), 1, figsize=(10, 18), sharex=True, squeeze=False)

for ax, metric in zip(axes[:, 0], metrics):
    # `results` is keyed by prompt_name / text_lang / model_name; when several
    # models share a prompt and language, barplot shows their mean score.
    sns.barplot(
        data=results,
        y='prompt_name',
        x=metric,
        hue='text_lang',
        ax=ax,
        palette='muted',
        errorbar=None  # draw plain bars without error bars
    )
    ax.set_title(metric.capitalize())
    ax.set_xlim(0, 1.05)
    ax.set_ylabel('Prompt name')
    ax.set_xlabel('Score')
    ax.legend(title='Text Language', loc='lower right')

plt.tight_layout()
plt.show()

In [None]:
# # Create a melt of metrics for faceted visualization
# metrics_df = results.melt(
#     id_vars=['technique_id', 'text_lang'],
#     value_vars=['accuracy', 'precision', 'recall', 'f1'],
#     var_name='metric',
#     value_name='score'
# )

# # Plot
# plt.figure(figsize=(12, 6))
# sns.barplot(data=metrics_df, x='technique_id', y='score', hue='text_lang', palette='muted', errorbar=None)
# plt.title('Metrics by Technique and Language')
# plt.ylim(0, 1.05)
# plt.ylabel('Score')
# plt.xlabel('Technique ID')
# plt.legend(title='Text Language')
# plt.grid(axis='y', linestyle='--', alpha=0.7)
# plt.tight_layout()
# plt.show()

# Total: metrics over all rows

In [None]:
# Overall metrics computed over every row currently in df (no filtering by
# prompt, language, model or split).
# zero_division=0 is passed to all three ratio metrics so that a missing
# positive class yields 0 without a warning, matching the per-group table.
acc = accuracy_score(df['gold'], df['pred'])
precision = precision_score(df['gold'], df['pred'], zero_division=0)
recall = recall_score(df['gold'], df['pred'], average='binary', zero_division=0)
f1 = f1_score(df['gold'], df['pred'], average='binary', zero_division=0)

print(f"Accuracy: {acc:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1-score: {f1:.3f}")