# **First Step**
## **Zero-Shot prompt engineering for Naive and JSON output for items**

In [None]:
import pandas as pd
import re

# Load the step-one results CSV into the working DataFrame.
df = pd.read_csv('data/res_step_one_clean.csv')


In [None]:
def _find_json_string_field(key, text):
    """Return the regex match for a ``"key": "value"`` pair in *text*, or None.

    The value pattern accepts backslash-escaped characters, so an escaped
    double quote inside the value does not end the capture early.
    """
    pattern = r'"' + re.escape(key) + r'":\s*"((?:[^"\\]|\\.)*)"'
    return re.search(pattern, text, flags=re.DOTALL)


def extract_info(output):
    """Extract the structured fields from one raw model output string.

    The fields are located with regular expressions instead of a JSON parser,
    so they are found even when the surrounding text is not valid JSON.

    Args:
        output: Raw model output. Anything that is not a ``str`` (for example
            the NaN pandas yields for an empty CSV cell) is treated as
            "nothing to extract".

    Returns:
        A 6-tuple ``(cot, answer, alternative_answer, justification,
        extra_text, cot_steps)``. Missing fields are ``None``. Captured values
        are returned raw (JSON escape sequences are not decoded).
        ``extra_text`` is whatever remains of ``output`` once the matched
        fields, any ``{...}`` span and an emptied JSON code fence have been
        removed. ``cot_steps`` is the number of "<digit>." markers in the CoT
        text, or ``None`` when the CoT is missing or empty.
    """
    if not isinstance(output, str):
        return None, None, None, None, "", None

    cot = _find_json_string_field("CoT", output)
    answer = _find_json_string_field("answer", output)
    justification = _find_json_string_field("justification", output)
    alternative_answer = _find_json_string_field("alternative_answer", output)

    cot_value = cot.group(1) if cot else None
    answer_value = answer.group(1) if answer else None
    justification_value = justification.group(1) if justification else None
    alternative_answer_value = alternative_answer.group(1) if alternative_answer else None

    # Count CoT steps as the number of "<digit>." markers (e.g. "1.", "2.").
    # NOTE(review): a digit followed by a period inside a decimal number is
    # counted as well - confirm this is acceptable for the data at hand.
    cot_steps = None
    if cot_value:
        cot_steps = len(re.findall(r'[0-9]\.', cot_value))

    # Strip every matched '"key": "value"' span, in a fixed order, so that
    # only the text surrounding the structured fields is left.
    extra_text = output
    for match in (cot, answer, justification, alternative_answer):
        if match:
            extra_text = extra_text.replace(match.group(0), "")

    # Drop the now (nearly) empty JSON object and the code fence left around it.
    extra_text_value = re.sub(r'\{.*?\}', '', extra_text, flags=re.DOTALL).strip()
    extra_text_value = extra_text_value.replace('```json\n\n```', '')

    return cot_value, answer_value, alternative_answer_value, justification_value, extra_text_value, cot_steps

# Parse every raw model output into structured columns and derive the hit flags.
df_cp = df.copy()
df_cp[['CoT', 'model_answer', 'alternative_response', 'justification', 'extra_text', 'cot_steps']] = df_cp['output'].apply(lambda x: pd.Series(extract_info(x)))
df_cp = df_cp.drop('output', axis=1)

# When the model supplied an alternative answer, score the response as option 'E';
# otherwise fall back to its regular answer. pd.notna is used instead of
# `is not None` so that a missing value stored as NaN is not mistaken for a
# present alternative answer.
df_cp['model_alternative_answer'] = df_cp.apply(
    lambda row: 'E' if pd.notna(row['alternative_response']) else row['model_answer'],
    axis=1,
)

# 1 when the (alternative-aware) model answer equals the reference answer, else 0.
df_cp['hit'] = (df_cp['answer'] == df_cp['model_answer']).astype(int)
df_cp['hit_alternative'] = (df_cp['answer'] == df_cp['model_alternative_answer']).astype(int)

# Keep a fixed column order for the exported tidy table.
df_cp = df_cp[['source', 'item', 'answer', 'r', 'model', 'prompt_type', 'model_answer', 'hit', 'model_alternative_answer', 'hit_alternative', 'alternative_response', 'justification', 'extra_text', 'CoT', 'cot_steps']]
df_cp.to_csv('data/tidydata2cmr.csv', index=False)

display(df_cp.head())
display(df_cp.tail())


# **Classical Metrics Report - Traditional Approach**

In [None]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import scikit_posthocs as sp
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from IPython.display import display

def calculate_overall_metrics(y_true, y_pred):
    """Print and display accuracy plus weighted precision/recall/F1.

    Pairs in which either the true or the predicted value is missing are
    dropped first. Returns the filtered ``(y_true, y_pred)`` pair.
    """
    keep = ~(pd.isna(y_true) | pd.isna(y_pred))
    y_true, y_pred = y_true[keep], y_pred[keep]

    weighted = {"average": "weighted", "zero_division": 0}
    scorers = (
        ("Accuracy", accuracy_score, {}),
        ("Precision", precision_score, weighted),
        ("Recall", recall_score, weighted),
        ("F1-Score", f1_score, weighted),
    )
    # One single-element list per metric so the frame renders as a one-row table.
    overall_metrics = {
        metric_name: [round(scorer(y_true, y_pred, **options), 4)]
        for metric_name, scorer, options in scorers
    }

    print("ðŸ”¹ Overall Metrics:")
    display(pd.DataFrame(overall_metrics))
    return y_true, y_pred

def calculate_confusion_matrix(y_true, y_pred):
    """Print and display the confusion matrix over every observed label.

    Returns the sorted list of labels found in either ``y_true`` or ``y_pred``;
    the same list labels the rows and columns of the displayed matrix.
    """
    labels_all = sorted(set(y_true).union(y_pred))
    counts = confusion_matrix(y_true, y_pred, labels=labels_all)
    cm = pd.DataFrame(counts, index=labels_all, columns=labels_all)
    print("ðŸ”¹ Overall Confusion Matrix:")
    display(cm)
    return labels_all

def calculate_group_metrics(df, group_column, answer_col='answer', model_answer_col='model_answer'):
    """Print and display accuracy and macro/weighted averages for each group.

    Args:
        df (pd.DataFrame): The input DataFrame.
        group_column (str): The name of the column to group by.
        answer_col (str): The column containing the true answers.
        model_answer_col (str): The column containing the model's answers.

    Returns:
        The unique values of ``group_column`` in order of first appearance,
        including groups that were skipped because they had no valid pairs.
    """
    print(f"ðŸ”¹ Aggregated Metrics by {group_column}:")

    group_names = df[group_column].unique()
    group_label = group_column.capitalize()
    summary_rows = []

    for current_group in group_names:
        subset = df.loc[df[group_column] == current_group]
        truth = subset[answer_col].to_numpy()
        predicted = subset[model_answer_col].to_numpy()

        # Keep only pairs where both the reference and the prediction exist.
        keep = ~(pd.isna(truth) | pd.isna(predicted))
        truth, predicted = truth[keep], predicted[keep]

        if len(truth) > 0:
            report = classification_report(truth, predicted, output_dict=True, zero_division=0)

            entry = {
                group_label: current_group,
                "Accuracy": round(report.get("accuracy", np.nan), 4),
            }
            for prefix, average_key in (("Macro", "macro avg"), ("Weighted", "weighted avg")):
                averaged = report[average_key]
                entry[f"{prefix}_Precision"] = round(averaged["precision"], 4)
                entry[f"{prefix}_Recall"] = round(averaged["recall"], 4)
                entry[f"{prefix}_F1"] = round(averaged["f1-score"], 4)

            summary_rows.append(entry)

    display(pd.DataFrame(summary_rows))
    return group_names

def calculate_f1_matrix(df, labels_all, group_names, group_column, answer_col='answer', model_answer_col='model_answer'):
    """Calculates and displays the per-class F1 matrix (classes x groups).

    Args:
        df (pd.DataFrame): The input DataFrame.
        labels_all (list): List of all possible labels.
        group_names (list): List of group names.
        group_column (str): The name of the column to group by.
        answer_col (str): The column containing the true answers.
        model_answer_col (str): The column containing the model's answers.

    Returns:
        tuple: ``(f1_scores, group_names)`` where ``f1_scores`` is an array
        with one row per label and one column per group. A group without any
        valid (answer, prediction) pair yields a column of NaN; a label that
        does not occur in a group's report scores 0.0.
    """
    print(f"ðŸ”¹ Class-level F1-Score Matrix by {group_column}:")

    # The classification report depends only on the group, so build it once
    # per group instead of once per (label, group) pair.
    reports = []
    for group_name in group_names:
        data = df[df[group_column] == group_name]
        yt = data[answer_col].to_numpy()
        yp = data[model_answer_col].to_numpy()
        valid = ~pd.isna(yt) & ~pd.isna(yp)
        yt = yt[valid]
        yp = yp[valid]

        if len(yt) == 0:
            reports.append(None)  # no usable data for this group
            continue

        reports.append(classification_report(yt, yp, output_dict=True, zero_division=0))

    # Report keys are the string form of each label, hence str(label): without
    # it, non-string labels would never be found and would always score 0.0.
    f1_matrix = [
        [
            np.nan if report is None
            else round(report.get(str(label), {}).get('f1-score', 0.0), 4)
            for report in reports
        ]
        for label in labels_all
    ]

    f1_scores = np.array(f1_matrix)
    df_f1_matrix = pd.DataFrame(f1_scores, index=labels_all, columns=group_names)
    display(df_f1_matrix)
    return f1_scores, group_names

def perform_friedman_nemenyi(f1_scores, group_names):
    """Performs the Friedman test across groups and, if significant, Nemenyi.

    Args:
        f1_scores: 2-D array-like with one row per class (the blocks) and one
            column per group (the treatments being compared).
        group_names: Names of the groups, aligned with the columns.

    Groups whose column contains NaN are excluded. The Friedman test needs at
    least three groups; with fewer the test is skipped and a notice printed.
    Nothing is returned; results are printed/displayed.
    """
    # Filter valid groups
    f1_scores_clean = np.asarray(f1_scores, dtype=float)
    valid_columns = ~np.isnan(f1_scores_clean).any(axis=0)
    f1_scores_clean = f1_scores_clean[:, valid_columns]
    valid_group_names = np.array(group_names)[valid_columns]

    # Friedman test
    print("ðŸ”¹ Friedman Test on Class-level F1-Scores:")
    if f1_scores_clean.shape[1] < 3:
        print("  Skipped: the Friedman test requires at least 3 groups without missing values.")
        return

    # friedmanchisquare expects one argument per treatment. The treatments are
    # the groups (columns), so unpack the transposed matrix; this also matches
    # the Nemenyi post-hoc below, which compares columns.
    stat, p = stats.friedmanchisquare(*f1_scores_clean.T)
    print(f"  Ï‡Â² = {stat:.3f}, p = {p:.4f}")

    # Nemenyi post-hoc test
    if p < 0.05:
        print("ðŸ”¹ Nemenyi Post-hoc Test (p < 0.05):")
        # Block-design input: rows are blocks (classes), columns are groups.
        nemenyi_result = sp.posthoc_nemenyi_friedman(f1_scores_clean)
        display(nemenyi_result)
        # The displayed result is indexed by column position; print the names
        # in the same order so positions can be mapped back to groups.
        print(valid_group_names)


In [None]:
# Load the dataset from the tidy CSV file
df = pd.read_csv('data/tidydata2cmr.csv')
# Combined "<model>_<prompt_type>" key so metrics can also be grouped by the pair
df['model_prompt_type'] = df['model'] + '_' + df['prompt_type']

In [None]:
answer_col = 'answer'
model_answer_col = 'model_answer'

# Run the identical report once per grouping column. The overall metrics and
# the confusion matrix do not depend on the grouping, so they are repeated
# unchanged ahead of each group-level breakdown.
for group_type in ('prompt_type', 'model', 'model_prompt_type'):
    # --- Overall metrics ---
    y_true = df[answer_col].to_numpy()
    y_pred = df[model_answer_col].to_numpy()

    # Keep only rows where both the reference and the prediction are present
    valid = ~pd.isna(y_true) & ~pd.isna(y_pred)
    y_true, y_pred = y_true[valid], y_pred[valid]

    calculate_overall_metrics(y_true, y_pred)

    # --- Confusion matrix ---
    labels_all = calculate_confusion_matrix(y_true, y_pred)

    # --- Aggregated metrics by group ---
    group_names = calculate_group_metrics(df, group_type, answer_col, model_answer_col)
    f1_scores, group_names = calculate_f1_matrix(
        df, labels_all, group_names, group_type, answer_col, model_answer_col
    )

    # --- Significance tests on the class-level F1 matrix ---
    perform_friedman_nemenyi(f1_scores, group_names)

In [None]:
answer_col = 'answer'
model_answer_col = 'model_alternative_answer'

# Run the identical report once per grouping column, this time scoring the
# 'model_alternative_answer' column. The overall metrics and the confusion
# matrix do not depend on the grouping, so they are repeated unchanged ahead
# of each group-level breakdown.
for group_type in ('prompt_type', 'model', 'model_prompt_type'):
    # --- Overall metrics ---
    y_true = df[answer_col].to_numpy()
    y_pred = df[model_answer_col].to_numpy()

    # Keep only rows where both the reference and the prediction are present
    valid = ~pd.isna(y_true) & ~pd.isna(y_pred)
    y_true, y_pred = y_true[valid], y_pred[valid]

    calculate_overall_metrics(y_true, y_pred)

    # --- Confusion matrix ---
    labels_all = calculate_confusion_matrix(y_true, y_pred)

    # --- Aggregated metrics by group ---
    group_names = calculate_group_metrics(df, group_type, answer_col, model_answer_col)
    f1_scores, group_names = calculate_f1_matrix(
        df, labels_all, group_names, group_type, answer_col, model_answer_col
    )

    # --- Significance tests on the class-level F1 matrix ---
    perform_friedman_nemenyi(f1_scores, group_names)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.utils import resample

def calculate_weighted_f1_by_model_and_prompt(df, model_col, prompt_col, answer_col, model_answer_col, n_iterations=100, random_state=None):
    """
    Calculates weighted F1-score for each (model, prompt_type) combination using bootstrap sampling.

    Args:
        df (pd.DataFrame): The input DataFrame.
        model_col (str): Column identifying the model.
        prompt_col (str): Column identifying the prompt type.
        answer_col (str): The column containing the true answers.
        model_answer_col (str): The column containing the model's answers.
        n_iterations (int): Number of bootstrap resamples per combination.
        random_state (int | None): Seed for reproducible resampling. With the
            default ``None`` the resampling is left unseeded, as before.

    Returns:
        pd.DataFrame: Columns ``model``, ``prompt``, ``weighted_f1`` with one
        row per usable bootstrap sample. A sample is skipped when it has no
        valid pair or only a single distinct true label. The columns exist
        even when no sample was usable.
    """
    # A single generator shared by every resample: successive draws differ,
    # yet the whole sequence is reproducible for a given seed.
    rng = None if random_state is None else np.random.RandomState(random_state)

    results = []
    for (model, prompt), group in df.groupby([model_col, prompt_col]):
        for _ in range(n_iterations):
            sample = resample(group, replace=True, random_state=rng)
            y_true = sample[answer_col].to_numpy()
            y_pred = sample[model_answer_col].to_numpy()
            valid = ~pd.isna(y_true) & ~pd.isna(y_pred)
            y_true = y_true[valid]
            y_pred = y_pred[valid]
            if len(y_true) > 0 and len(np.unique(y_true)) > 1:
                # zero_division=0 matches the other metric calls in this file
                # and keeps the same score without emitting warnings.
                score = f1_score(y_true, y_pred, average='weighted', zero_division=0)
                results.append({'model': model, 'prompt': prompt, 'weighted_f1': score})
    return pd.DataFrame(results, columns=['model', 'prompt', 'weighted_f1'])

def plot_weighted_f1_by_model_and_prompt(f1_df, title="Weighted F1-score by model and prompt type (CMR)"):
    """
    Plots a grouped barplot with error bars (std deviation) by model and prompt.

    Args:
        f1_df (pd.DataFrame): Bootstrap scores with columns ``model``,
            ``prompt`` and ``weighted_f1``. The caller's frame is not modified.
        title (str): Title of the figure.

    The figure is created on the current pyplot state; saving or showing it is
    left to the caller. Models without an entry in the display-name map are
    left out of the summary.
    """
    model_name_map = {
        "model_gpt_4o_mini": "4o-mini",
        "model_gpt_41_nano": "4.1-nano",
        "model_claude_35_haiku": "3-haiku",
        "model_grok_3_mini_beta": "grok-3-mini",
        "model_ds_v3": "ds-v3"
    }

    # assign() builds a new frame, so neither the caller's data nor an
    # intermediate result is modified in place.
    f1_df = f1_df.dropna().assign(model=lambda frame: frame['model'].map(model_name_map))

    summary_df = (
        f1_df.groupby(['model', 'prompt'])
        .agg(mean_f1=('weighted_f1', 'mean'), std_f1=('weighted_f1', 'std'))
        .reset_index()
    )

    model_order = ["4o-mini", "4.1-nano", "3-haiku", "grok-3-mini", "ds-v3"]
    prompt_order = ['adversarial', 'cot', 'naive']

    plt.figure(figsize=(10, 6))
    ax = sns.barplot(
        data=summary_df,
        x='model',
        y='mean_f1',
        hue='prompt',
        order=model_order,
        hue_order=prompt_order,
        palette=['#FFC107', '#FF5722', '#F44336'],
        errorbar=None  # Disable seaborn's built-in error bars
    )

    # Add the error bars manually, centred on each dodged bar. seaborn spreads
    # the hue levels evenly over a total group width of 0.8 (its default).
    group_width = 0.8
    bar_width = group_width / len(prompt_order)
    centre = (len(prompt_order) - 1) / 2
    for row in summary_df.itertuples(index=False):
        if row.model not in model_order or row.prompt not in prompt_order:
            continue  # not drawn by the barplot, so no error bar either
        x_pos = model_order.index(row.model) + (prompt_order.index(row.prompt) - centre) * bar_width
        ax.errorbar(
            x=x_pos,
            y=row.mean_f1,
            yerr=row.std_f1,
            fmt='none',
            c='black',
            capsize=5,
            linewidth=1
        )

    plt.title(title)
    plt.xlabel("Model")
    plt.ylabel("Weighted F1-score")
    plt.ylim(0, 1)
    plt.legend(title='Prompt')
    plt.tight_layout()

# Execution: column names used by the bootstrap
model_col = 'model'
prompt_col = 'prompt_type'
answer_col = 'answer'
model_answer_col = 'model_answer'

# 1000 bootstrap resamples per (model, prompt_type) combination
f1_df = calculate_weighted_f1_by_model_and_prompt(
    df, model_col, prompt_col, answer_col, model_answer_col, n_iterations=1000
)

# Draw the figure, write it to disk, then show it
plot_weighted_f1_by_model_and_prompt(f1_df)
plt.savefig('figures/cmr_weighted_f1_scores.png', dpi=360)
plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.utils import resample

def calculate_weighted_f1_by_model_and_prompt(df, model_col, prompt_col, answer_col, model_answer_col, n_iterations=100, random_state=None):
    """
    Calculates weighted F1-score for each (model, prompt_type) combination using bootstrap sampling.

    Args:
        df (pd.DataFrame): The input DataFrame.
        model_col (str): Column identifying the model.
        prompt_col (str): Column identifying the prompt type.
        answer_col (str): The column containing the true answers.
        model_answer_col (str): The column containing the model's answers.
        n_iterations (int): Number of bootstrap resamples per combination.
        random_state (int | None): Seed for reproducible resampling. With the
            default ``None`` the resampling is left unseeded, as before.

    Returns:
        pd.DataFrame: Columns ``model``, ``prompt``, ``weighted_f1`` with one
        row per usable bootstrap sample. A sample is skipped when it has no
        valid pair or only a single distinct true label. The columns exist
        even when no sample was usable.
    """
    # A single generator shared by every resample: successive draws differ,
    # yet the whole sequence is reproducible for a given seed.
    rng = None if random_state is None else np.random.RandomState(random_state)

    results = []
    for (model, prompt), group in df.groupby([model_col, prompt_col]):
        for _ in range(n_iterations):
            sample = resample(group, replace=True, random_state=rng)
            y_true = sample[answer_col].to_numpy()
            y_pred = sample[model_answer_col].to_numpy()
            valid = ~pd.isna(y_true) & ~pd.isna(y_pred)
            y_true = y_true[valid]
            y_pred = y_pred[valid]
            if len(y_true) > 0 and len(np.unique(y_true)) > 1:
                # zero_division=0 matches the other metric calls in this file
                # and keeps the same score without emitting warnings.
                score = f1_score(y_true, y_pred, average='weighted', zero_division=0)
                results.append({'model': model, 'prompt': prompt, 'weighted_f1': score})
    return pd.DataFrame(results, columns=['model', 'prompt', 'weighted_f1'])

def plot_weighted_f1_by_model_and_prompt(f1_df, title="Weighted F1-score by model and prompt type (a)"):
    """
    Plots two grouped barplots with error bars (std deviation): models split
    by prompt (left) and prompts split by model (right).
    Also prints bootstrap mean and std estimates as tables.

    Args:
        f1_df (pd.DataFrame): Bootstrap scores with columns ``model``,
            ``prompt`` and ``weighted_f1``. The caller's frame is not modified.
        title (str): Title of the left panel.

    The figure is created on the current pyplot state; saving or showing it is
    left to the caller. Models without an entry in the display-name map are
    left out of the summary.
    """
    model_name_map = {
        "model_gpt_4o_mini": "4o-mini",
        "model_gpt_41_nano": "4.1-nano",
        "model_claude_35_haiku": "3-haiku",
        "model_grok_3_mini_beta": "grok-3-mini",
        "model_ds_v3": "ds-v3"
    }

    # assign() builds a new frame, so neither the caller's data nor an
    # intermediate result is modified in place.
    f1_df = f1_df.dropna().assign(model=lambda frame: frame['model'].map(model_name_map))

    summary_df = (
        f1_df.groupby(['model', 'prompt'])
        .agg(mean_f1=('weighted_f1', 'mean'), std_f1=('weighted_f1', 'std'))
        .reset_index()
    )

    model_order = ["4o-mini", "4.1-nano", "3-haiku", "grok-3-mini", "ds-v3"]
    prompt_order = ['adversarial', 'cot', 'naive']

    # Hue levels of the right panel, in order of first appearance. This is the
    # order seaborn would pick by itself; fixing it explicitly lets the error
    # bars be positioned from the same list.
    model_hue_order = list(summary_df['model'].unique())

    group_width = 0.8  # seaborn's default total width of one dodged bar group

    def _draw_error_bars(ax, x_col, x_levels, hue_col, hue_levels):
        """Draw one std-dev error bar centred on every dodged bar of *ax*."""
        if not hue_levels:
            return
        bar_width = group_width / len(hue_levels)
        centre = (len(hue_levels) - 1) / 2
        for _, row in summary_df.iterrows():
            if row[x_col] not in x_levels or row[hue_col] not in hue_levels:
                continue  # not drawn by the barplot, so no error bar either
            x_pos = x_levels.index(row[x_col]) + (hue_levels.index(row[hue_col]) - centre) * bar_width
            ax.errorbar(
                x=x_pos,
                y=row['mean_f1'],
                yerr=row['std_f1'],
                fmt='none',
                c='black',
                capsize=5,
                linewidth=1
            )

    plt.figure(figsize=(10, 4))

    # First plot
    ax1 = plt.subplot(1, 2, 1)
    sns.barplot(
        data=summary_df,
        x='model',
        y='mean_f1',
        hue='prompt',
        order=model_order,
        hue_order=prompt_order,
        palette=['#FFC107', '#FF5722', '#F44336'],
        ax=ax1,
        errorbar=None  # Disable seaborn's built-in error bars
    )

    # Add error bars manually
    _draw_error_bars(ax1, 'model', model_order, 'prompt', prompt_order)

    ax1.set_title(title)
    ax1.set_xlabel("Model")
    ax1.set_ylabel("Weighted F1-score")
    ax1.set_ylim(0, 1)
    ax1.legend(title='Prompt')

    # Print bootstrap estimates table for first plot
    print("Bootstrap mean and std estimates for Model x Prompt (First Plot):")
    for model in model_order:
        for prompt in prompt_order:
            subset = f1_df[(f1_df['model'] == model) & (f1_df['prompt'] == prompt)]
            mean_estimate = subset['weighted_f1'].mean()
            std_estimate = subset['weighted_f1'].std()
            print(f"Model: {model}, Prompt: {prompt} -> Mean: {mean_estimate:.4f}, SD: {std_estimate:.4f}")

    # Second plot
    ax2 = plt.subplot(1, 2, 2)
    sns.barplot(
        data=summary_df,
        x='prompt',
        y='mean_f1',
        hue='model',
        order=prompt_order,
        hue_order=model_hue_order,
        palette='Blues',
        ax=ax2,
        errorbar=None
    )

    # Add error bars at positions computed from the category orders, instead
    # of pairing summary rows with ax.patches, which misaligns as soon as a
    # (prompt, model) combination is missing or extra patches are present.
    _draw_error_bars(ax2, 'prompt', prompt_order, 'model', model_hue_order)

    ax2.set_title("Weighted F1-score by Prompt (b)")
    ax2.set_xlabel("Prompt")
    ax2.set_ylabel("Weighted F1-score")
    ax2.set_ylim(0, 1)
    ax2.legend(title='Model')

    # Print bootstrap estimates table for second plot
    print("Bootstrap mean and std estimates for Prompt x Model (Second Plot):")
    for prompt in prompt_order:
        for model in model_order:
            subset = f1_df[(f1_df['model'] == model) & (f1_df['prompt'] == prompt)]
            mean_estimate = subset['weighted_f1'].mean()
            std_estimate = subset['weighted_f1'].std()
            print(f"Prompt: {prompt}, Model: {model} -> Mean: {mean_estimate:.4f}, SD: {std_estimate:.4f}")

    plt.tight_layout()

# Execution: column names used by the bootstrap
model_col = 'model'
prompt_col = 'prompt_type'
answer_col = 'answer'
model_answer_col = 'model_answer'

# 1000 bootstrap resamples per (model, prompt_type) combination
f1_df = calculate_weighted_f1_by_model_and_prompt(
    df, model_col, prompt_col, answer_col, model_answer_col, n_iterations=1000
)

# Draw the two-panel figure, write it to disk, then show it
plot_weighted_f1_by_model_and_prompt(f1_df)
plt.savefig('figures/cmr_weighted_f1_scores.png', dpi=360)
plt.show()
