In [None]:
import glob
import os
import json
import pandas as pd
from sklearn.metrics import (
    classification_report, 
    confusion_matrix, 
    ConfusionMatrixDisplay, 
    f1_score,
    cohen_kappa_score
)
from sklearn.utils import resample
#from workflow import decision_logic
import matplotlib.pyplot as plt
import numpy as np

# transformers==4.50.0
# attention implementation = flash_attention_2

# microsoft/Phi-3.5-mini-instruct
# -------------------------------
#timestamp = "2025-04-29_19:09" # 1 example 
#timestamp = "2025-04-29_20:32" # 5 examples
#timestamp = "2025-05-05_12:30" # 10 examples

# microsoft/Phi-3-small-8k-instruct
# ---------------------------------
#timestamp = "2025-05-14_13:17" # 1 example
#timestamp = "2025-05-14_13:22" # 5 examples
#timestamp = "2025-05-14_13:24" # 10 examples

# allenai/Llama-3.1-Tulu-3.1-8B
# ------------------------------
#timestamp = "2025-05-14_15:30" # 1 example
#timestamp = "2025-05-15_18:42" # 5 examples
#timestamp = "2025-05-14_16:38" # 10 examples

# CohereForAI/c4ai-command-r7b-12-2024
# ------------------------------------
#timestamp = "2025-05-14_17:30" # 1 example
#timestamp = "2025-05-14_16:30" # 5 examples # average F1 across themes .81
#timestamp = "2025-05-14_19:27" # 10 examples

# google/gemma-2-9b-it
# --------------------
#timestamp = "2025-05-14_21:24" # 1 example
#timestamp = "2025-05-14_20:30" # 5 examples
#timestamp = "2025-05-15_00:07" # 10 examples

# CohereForAI for theme detection, and google gemma for target classification, with 5 examples
# DROP SPORTS!

timestamp = "2025-06-19_15:00" # 5 examples


verbose = True
bootstrap_flag = True

# Every artefact of the selected experiment is read from results/<timestamp>/
results_dir = os.path.join("results", timestamp)

# Experiment configuration (opening fails if the file is missing)
with open(os.path.join(results_dir, "config.json"), "r") as f:
    config = json.load(f)

# Cross-validation folds are optional: cv_folds is None when the file does not exist
cv_folds_path = os.path.join(results_dir, "cv_folds.json")
cv_folds = None
if os.path.exists(cv_folds_path):
    with open(cv_folds_path, "r") as f:
        cv_folds = json.load(f)

if verbose:
    # Echo every configuration entry so the output records what was evaluated
    print(f"Main parameters for experiment {timestamp}:")
    for key, value in config.items():
        print(f"   - {key}: {value}")
    print("\n")

# Ground-truth annotations, indexed by stimulus id (path comes from the config)
groundtruth_df = pd.read_csv(config["groundtruth"], index_col="stimulus_id")

def bootstrap(y_true, y_pred, frac=0.25, iterations=10000, alpha=0.95, random_state=None):
    """Print and return bootstrap confidence intervals for macro-F1 and Cohen's kappa.

    On every iteration ``int(frac * len(y_true))`` paired (true, predicted) labels
    are drawn with ``sklearn.utils.resample`` (its default is sampling with
    replacement) and both metrics are computed on the draw. The interval bounds
    are the ``(1 - alpha) / 2`` and ``alpha + (1 - alpha) / 2`` percentiles of the
    collected scores.

    Parameters
    ----------
    y_true, y_pred : sequences of labels with the same length.
    frac : fraction of the data drawn on each iteration.
    iterations : number of bootstrap draws.
    alpha : confidence level, e.g. 0.95 for a 95% interval.
    random_state : optional int seed. When given, a single RandomState is shared
        by all iterations so the intervals are reproducible. ``None`` (default)
        keeps the previous behaviour of drawing from NumPy's global RNG.

    Returns
    -------
    ((f1_lower, f1_upper), (kappa_lower, kappa_upper))
    """
    # One generator for the whole run: re-seeding on every iteration would
    # produce the same draw each time.
    rng = None if random_state is None else np.random.RandomState(random_state)
    n_samples = int(frac * len(y_true))

    f1s = []
    cohen_kappas = []
    for _ in range(iterations):
        # resample the data (pairs stay aligned because both lists are drawn together)
        y_true_resampled, y_pred_resampled = resample(y_true, y_pred, n_samples=n_samples, random_state=rng)
        f1s.append(f1_score(y_true_resampled, y_pred_resampled, average='macro'))
        cohen_kappas.append(cohen_kappa_score(y_true_resampled, y_pred_resampled))

    # Percentile ranks shared by both intervals
    lower_pct = ((1.0-alpha)/2.0) * 100
    upper_pct = (alpha+((1.0-alpha)/2.0)) * 100

    # F1 confidence intervals (F1 lives in [0, 1])
    lower = max(0.0, np.percentile(f1s, lower_pct))
    upper = min(1.0, np.percentile(f1s, upper_pct))
    print(f" --- {alpha*100}% F1 confidence interval = [{lower:.2f}, {upper:.2f}]")

    # Cohen's kappa confidence intervals. Kappa lives in [-1, 1]: clipping the
    # lower bound at 0 would hide worse-than-chance agreement.
    lower_kappa = max(-1.0, np.percentile(cohen_kappas, lower_pct))
    upper_kappa = min(1.0, np.percentile(cohen_kappas, upper_pct))
    print(f" --- {alpha*100}% Cohen's kappa confidence interval = [{lower_kappa:.2f}, {upper_kappa:.2f}]")

    return (lower, upper), (lower_kappa, upper_kappa)



def final_cleanup(result):
    """Map a raw task answer onto one of the canonical target labels.

    The match is a case-insensitive substring test, checked in this order:
    "boys" -> "Boys/men", "girls" -> "Girls/women", "mixed" -> "Mixed".
    Anything else, including a non-string value such as ``None``, yields the
    sentinel "placeholder", which the evaluation treats as a parsing error.

    Parameters
    ----------
    result : the raw answer of a task; normally a string.

    Returns
    -------
    str : "Boys/men", "Girls/women", "Mixed" or "placeholder".
    """
    # Task results are not guaranteed to be strings (decision_logic filters
    # non-strings out of the vote); treat them as unparseable instead of crashing.
    if not isinstance(result, str):
        return "placeholder"

    text = result.lower()
    if "boys" in text:
        return "Boys/men"
    if "girls" in text:
        return "Girls/women"
    if "mixed" in text:
        return "Mixed"
    return "placeholder"

def decision_logic(tasks_results):
    """Combine the per-task answers into a single target label by majority vote.

    Parameters
    ----------
    tasks_results : dict mapping a task name to a dict with a "result" entry
        and, when a tie has to be resolved, a "classes" entry listing the
        labels that task chooses between.

    Returns
    -------
    str : the winning answer passed through ``final_cleanup``, or "placeholder"
        when no task produced a usable answer.

    Voting rules
    ------------
    * Only string results different from "placeholder" get a vote.
    * A single most-voted answer wins.
    * On a tie, the result of the one task whose "classes" are exactly the
      tied answers decides.
    * If no such task (or more than one) exists, the tied answer that was
      voted first wins. This is deterministic, unlike picking from a set,
      whose iteration order changes between Python processes.
    """
    votes = [
        task["result"]
        for task in tasks_results.values()
        if isinstance(task["result"], str) and task["result"] != "placeholder"
    ]

    if not votes:
        return "placeholder"

    # Dicts keep insertion order, so `winners` lists tied answers in first-vote order.
    counts = {}
    for vote in votes:
        counts[vote] = counts.get(vote, 0) + 1

    max_count = max(counts.values())
    winners = [answer for answer, count in counts.items() if count == max_count]

    if len(winners) == 1:
        result = winners[0]
    else:
        # if there's a tie, return the result of the task containing the winners
        tie_task = [
            name for name, task in tasks_results.items()
            if set(task["classes"]) == set(winners)
        ]
        if len(tie_task) == 1:
            result = tasks_results[tie_task[0]]["result"]
        else:
            # this should never happen, but just in case: first-voted tied answer
            result = winners[0]

    return final_cleanup(result)


In [None]:
# Sorted so that processing and reporting order do not depend on the filesystem.
prediction_files = sorted(glob.glob(f"results/{timestamp}/*_target_classification.json"))

# Labels present in the ground truth. A prediction outside this set is a parsing
# error. Computed once here instead of once per prediction.
valid_targets = set(groundtruth_df["target_of_toy_ad"].unique())


def split_parsing_errors(true_labels, pred_labels):
    """Drop the samples whose prediction is not a valid ground-truth label.

    Returns (kept_true, kept_pred, error_idx), where error_idx holds the
    positions (in the input lists) of the dropped samples.
    """
    error_idx = [i for i, label in enumerate(pred_labels) if label not in valid_targets]
    dropped = set(error_idx)
    kept_true = [label for i, label in enumerate(true_labels) if i not in dropped]
    kept_pred = [label for i, label in enumerate(pred_labels) if i not in dropped]
    return kept_true, kept_pred, error_idx


def report_scores(name, true_labels, pred_labels, prefix=""):
    """Print macro-F1, Cohen's kappa and (if bootstrap_flag) their bootstrap intervals."""
    print(f"{prefix}F1 ({name}): {f1_score(true_labels, pred_labels, average='macro'):.2f}")
    print(f"Cohen's kappa ({name}): {cohen_kappa_score(true_labels, pred_labels):.2f}")
    if bootstrap_flag:
        bootstrap(true_labels, pred_labels, frac=0.25, iterations=10000, alpha=0.95)


y_true = []
y_pred = []
y_pred_without_music_in_subtasks = []
y_pred_gmb = []

for file in prediction_files:
    stimulus_id = os.path.split(file)[1].split("_target_classification")[0]

    y_true.append(groundtruth_df.loc[stimulus_id, "target_of_toy_ad"])

    # The prediction is recomputed from the per-task results with decision_logic
    # rather than read from the "target_class" stored in the prediction file.
    with open(f"results/{timestamp}/{stimulus_id}_tasks_results.json", "r") as f:
        tasks_results = json.load(f)

    y_pred.append(decision_logic(tasks_results))

    # Same vote, leaving the "music" task out
    tasks_without_music = {task: res for task, res in tasks_results.items() if task != "music"}
    y_pred_without_music_in_subtasks.append(decision_logic(tasks_without_music))

    # The "G/M/B" task on its own; a non-string result counts as a parsing error
    gmb_result = tasks_results["G/M/B"]["result"]
    y_pred_gmb.append(final_cleanup(gmb_result) if isinstance(gmb_result, str) else "placeholder")

# remove parsing errors in main logic
y_true_without_parsing_errors, y_pred, parsing_errors_idx = split_parsing_errors(y_true, y_pred)

print(f"There were {len(parsing_errors_idx)} parsing errors during target classification (main logic).")
for i in parsing_errors_idx:
    file_name = os.path.split(prediction_files[i])[1]
    filemsg = f"   - {file_name}"
    stimulus_id = file_name.split("_target_classification")[0]
    # find which fold the stimulus belongs to
    if cv_folds is not None:
        for fold, fold_ids in enumerate(cv_folds["test_original_ids"]):
            if stimulus_id in fold_ids:
                filemsg += f" in fold {fold} (0-ind)"
                break
        else:
            # for/else: reached only when the stimulus is in none of the stored folds
            filemsg += " in fold (under computation)"
    print(filemsg)
print("\n")

# remove parsing errors in secondary logics (each keeps its own index list so the
# main-logic indices above are not overwritten)
(
    y_true_without_music_in_subtasks,
    y_pred_without_music_in_subtasks,
    parsing_errors_idx_without_music,
) = split_parsing_errors(y_true, y_pred_without_music_in_subtasks)
if parsing_errors_idx_without_music:
    print(f"There were {len(parsing_errors_idx_without_music)} parsing errors during target classification (without music in subtasks).")

y_true_gmb, y_pred_gmb, parsing_errors_idx_gmb = split_parsing_errors(y_true, y_pred_gmb)
if parsing_errors_idx_gmb:
    print(f"There were {len(parsing_errors_idx_gmb)} parsing errors during target classification (G/M/B).")


print("Classification report (main logic):")
print(classification_report(y_true_without_parsing_errors, y_pred))


report_scores("main logic", y_true_without_parsing_errors, y_pred)
report_scores("without music in subtasks", y_true_without_music_in_subtasks, y_pred_without_music_in_subtasks, prefix="\n")
report_scores("G/M/B", y_true_gmb, y_pred_gmb, prefix="\n")

In [None]:
# Music-only baseline: macro-F1 on each stored cross-validation fold, then the
# mean and standard deviation across folds.
if cv_folds is None:
    # cv_folds.json is optional (cv_folds is None when it is missing), so there
    # is nothing to score in that case.
    print(f"No cv_folds.json found for experiment {timestamp}: skipping the music-only scores.\n")
else:
    n_folds = len(cv_folds["test_original_ids"])
    music_only_f1s = [
        f1_score(
            cv_folds["actual"][fold],
            cv_folds["music_only_predictions"][fold],
            average="macro"
        )
        for fold in range(n_folds)
    ]

    # Report the actual number of folds instead of assuming 10
    print(f"NB: this is a {n_folds}-fold cross-validation")
    print(f"F1 (music only): {np.mean(music_only_f1s):.2f} (std: {np.std(music_only_f1s):.2f})\n")

In [None]:
# Having "Mixed" in the middle improves readability of the confusion matrix
target_names = ["Girls/women", "Mixed", "Boys/men"]

# Fail loudly on labels the matrix cannot show. This also catches stale kernel
# state, e.g. label lists that an earlier run converted to integer indices.
unexpected_labels = (set(y_true_without_parsing_errors) | set(y_pred)) - set(target_names)
if unexpected_labels:
    raise ValueError(f"Labels not in target_names: {sorted(map(str, unexpected_labels))}")

# labels= fixes the row/column order and keeps the matrix 3x3 even when a class
# is absent from the data. The label lists themselves are left untouched, so
# this cell can be re-run.
cm = confusion_matrix(y_true_without_parsing_errors, y_pred, labels=target_names)

col_totals = cm.sum(axis=0, keepdims=True)
row_totals = cm.sum(axis=1, keepdims=True)

# normalize column-wise (precision estimate in the diagonal);
# a class that was never predicted gets a column of zeros instead of NaN
cm_col = np.divide(cm, col_totals, out=np.zeros(cm.shape, dtype=float), where=col_totals != 0)
cm_col = np.round(cm_col, 2)

# normalize row-wise (recall in the diagonal);
# a class with no true samples gets a row of zeros instead of NaN
cm_row = np.divide(cm, row_totals, out=np.zeros(cm.shape, dtype=float), where=row_totals != 0)
cm_row = np.round(cm_row, 2)

# plot the confusion matrix (one figure per normalization)
for normalized_cm, title in zip([cm_col, cm_row], ["Precision in the diagonal", "Recall in the diagonal"]):
    # Plot confusion matrix with labels
    disp = ConfusionMatrixDisplay(confusion_matrix=normalized_cm, display_labels=target_names)
    disp.plot()
    # add title
    plt.title(title)

In [5]:
from itertools import permutations
from pprint import pprint

print_missclassified = False

if print_missclassified:
    # (true, predicted) label names for every prediction file, recomputed here so
    # that positions line up with prediction_files. The filtered label lists built
    # during evaluation drop parsing errors and therefore cannot be used to index
    # prediction_files. Parsing errors ("placeholder") match no target name and
    # so never show up in any cell below.
    label_pairs = []
    for file in prediction_files:
        stimulus_id = os.path.split(file)[1].split("_target_classification")[0]
        with open(f"results/{timestamp}/{stimulus_id}_tasks_results.json", "r") as f:
            predicted = decision_logic(json.load(f))
        label_pairs.append((groundtruth_df.loc[stimulus_id, "target_of_toy_ad"], predicted))

    for y_true_label, y_pred_label in permutations(range(len(target_names)), 2): # all non-diagonal cells
        true_name = target_names[y_true_label]
        pred_name = target_names[y_pred_label]
        # get the indices of the misclassified samples (compare label names with label names)
        idxs = [i for i, (a, b) in enumerate(label_pairs) if a == true_name and b == pred_name]
        print(f"\033[1m\033[31m| True: {target_names[y_true_label]} | Pred: {target_names[y_pred_label]} |\033[0m")
        for idx in idxs:
            with open(prediction_files[idx], "r") as f:
                state = json.load(f)

            file_name = os.path.split(prediction_files[idx])[1]
            stimulus_id = file_name.split("_target_classification")[0]
            fp = f"results/{timestamp}/{stimulus_id}_tasks_results.json"
            with open(fp, "r") as f:
                tasks_results = json.load(f)

            # keep only the answer of each task for display
            tasks_results = {key: tasks_results[key]["result"] for key in tasks_results}

            # themes for which at least one cue was collected
            collected_themes = []
            for theme in state["collected_cues"].keys():
                if len(state["collected_cues"][theme]["cues"]) > 0:
                    collected_themes.append({
                        theme+"_cues": state["collected_cues"][theme]["cues"]
                        }
                    )

            print(f"- {file_name}")
            disp = {
                "----------": {
                    "transcript": state["current_transcript"],
                    "collected_themes": collected_themes,
                    "tasks_results": tasks_results,
                },
            }

            pprint(disp, indent=1, width=100, depth=None, sort_dicts=True)

        print("\n")

In [6]:
# Per theme: binary ground truth / prediction lists, one entry per prediction file.
themes_preds = {
    theme: {"y_true": [], "y_pred": []}
    for theme in config["themes_definitions"].keys()
}

for file in prediction_files:
    with open(file, "r") as f:
        data = json.load(f)
    stimulus_id = os.path.split(file)[1].split("_target_classification")[0]

    for theme, theme_lists in themes_preds.items():
        collected_cues = data["collected_cues"][theme]
        true_cues = groundtruth_df.loc[stimulus_id, theme+"_cues"]

        # A theme is annotated as present when its "<theme>_cues" cell is not NaN,
        # and predicted as present when at least one cue was collected.
        theme_true = int(not pd.isna(true_cues))
        theme_pred = int(bool(collected_cues["cues"]))

        theme_lists["y_true"].append(theme_true)
        theme_lists["y_pred"].append(theme_pred)




In [None]:
# Per-theme detection scores, followed by their mean and spread across themes.
f1s = []
cohen_kappas = []
for theme, theme_lists in themes_preds.items():
    truth = theme_lists["y_true"]
    detected = theme_lists["y_pred"]

    print(f"Theme {theme}:")

    f1 = f1_score(truth, detected, average="macro")
    f1s.append(f1)
    print(f"F1: {f1:.2f}")

    cohen_kappa = cohen_kappa_score(truth, detected)
    cohen_kappas.append(cohen_kappa)
    print(f"Cohen's kappa: {cohen_kappa:.2f}")

    # Optional bootstrap confidence intervals for this theme
    if bootstrap_flag:
        bootstrap(truth, detected, frac=0.25, iterations=10000, alpha=0.95)

    print("\n")

mean_f1, std_f1 = np.mean(f1s), np.std(f1s)
mean_kappa, std_kappa = np.mean(cohen_kappas), np.std(cohen_kappas)
print(f"Average F1 across themes: {mean_f1:.2f} ± {std_f1:.2f}")
print(f"Average Cohen's kappa across themes: {mean_kappa:.2f} ± {std_kappa:.2f}\n")