## Metrics plots generator

This notebook generates heatmaps that show how each model and prompt combination performs: its coverage and its accuracy against the human annotations.


In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
# Run the notebook from the parent of the directory the kernel started in.
# The starting directory is remembered on the first execution so that re-running
# this cell does not climb one level higher each time (a bare os.chdir("..") is
# not idempotent).
if "NOTEBOOK_START_DIR" not in globals():
    NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_START_DIR, ".."))
# Print numpy arrays with two decimals.
np.set_printoptions(precision=2)

In [None]:
# SETUP
# Location of the parsed annotation JSON file that the next cell loads into `df`.
# The path is relative to the current working directory, which the imports cell
# changes to the parent directory.
parsed_annotation = "final_annotation.json" # relative path to the parsed annotation file


In [None]:
# Read the parsed annotations into the DataFrame used by every cell below.
with open(parsed_annotation, mode="r", encoding="utf8") as annotation_file:
    df = pd.read_json(annotation_file)

In [None]:

# Add the [min_age, max_age] interval to every row of df.
# Rows with prompt != -1 carry a single photo year, so both bounds are the same;
# rows with prompt == -1 (used as ground truth further down) carry a
# left/right photo-year interval, which yields a real age range.
single_year_rows = df["prompt"] != -1
interval_rows = df["prompt"] == -1

point_age = df.loc[single_year_rows, "year_of_photo_int"] - df.loc[single_year_rows, "birth_year"]
df.loc[single_year_rows, "min_age"] = point_age
df.loc[single_year_rows, "max_age"] = point_age

interval_birth_year = df.loc[interval_rows, "birth_year"]
df.loc[interval_rows, "min_age"] = df.loc[interval_rows, "left_photo_year"] - interval_birth_year
df.loc[interval_rows, "max_age"] = df.loc[interval_rows, "right_photo_year"] - interval_birth_year


In [None]:
# Preview the first rows, including the min_age / max_age columns added above.
df.head()

In [None]:
# Month names in calendar order ("September" was previously misspelled).
months = ["January",
          "February",
          "March",
          "April",
          "May",
          "June",
          "July",
          "August",
          "September",
          "October",
          "November",
          "December"]
# Prompt identifiers as strings; used as the heatmap column labels.
prompts = [f"{i}" for i in range(6)]
# Model names; used to filter df["model"] and as the heatmap row labels.
models = ["deepseek-r1",
          "deepseek-r1-llama-8B",
          "deepseek-r1-qwen-7B",
          "deepseek-r1-qwen-14B",
          "deepseek-r1-qwen-32B",
          "llama3-3"]

In [None]:
def make_heatmap(df_matrix, title, show = True):
    """Draw one annotated heatmap on a new 8x6 figure.

    Parameters
    ----------
    df_matrix
        Matrix-like data handed to ``sns.heatmap``; cells are annotated with
        two decimals.
    title : str
        Title placed above the heatmap.
    show : bool, default True
        When true, ``plt.show()`` is called after the figure is laid out.
    """
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(df_matrix, annot=True, fmt=".2f", cmap='viridis', ax=ax)
    ax.set_title(title)
    # Axis names are fixed: rows are labelled "Model", columns "Prompt".
    ax.set_ylabel("Model")
    ax.set_xlabel("Prompt")
    fig.tight_layout()
    if show:
        plt.show()

def make_comparisson_heatmap(data1 : pd.DataFrame, data2 : pd.DataFrame, title1 : str, title2 : str):
    """Show two heatmaps side by side on one shared colour scale.

    Parameters
    ----------
    data1, data2 : pd.DataFrame
        Matrices to compare; ``data1`` is drawn on the left (with row labels),
        ``data2`` on the right (row labels hidden, colour bar shown).
    title1, title2 : str
        Titles of the left and right panel.

    Notes
    -----
    The colour limits ignore NaN cells. A plain ``min``/``max`` turns the limits
    into NaN as soon as either matrix contains one NaN, which breaks the colour
    mapping of both panels.
    """
    all_values = np.concatenate([
        data1.to_numpy(dtype=float).ravel(),
        data2.to_numpy(dtype=float).ravel(),
    ])
    if np.isnan(all_values).all():
        # Nothing finite to derive limits from (also covers empty input):
        # let seaborn pick its defaults.
        vmin = vmax = None
    else:
        vmin = np.nanmin(all_values)
        vmax = np.nanmax(all_values)

    fig, axes = plt.subplots(1, 2, figsize=(12, 5))
    sns.heatmap(data1, ax = axes[0], annot=True, fmt=".2f", cmap='viridis', cbar=False, vmin = vmin, vmax = vmax)
    axes[0].set_title(title1)
    axes[0].set_ylabel("Model")
    axes[0].set_xlabel("Prompt")
    axes[0].set_aspect('equal')

    # Pass the DataFrame itself so the right panel keeps its column labels.
    sns.heatmap(data2, ax = axes[1], annot=True, fmt=".2f", cmap='viridis', vmin = vmin, vmax = vmax)
    axes[1].set_title(title2)
    axes[1].set_xlabel("Prompt")
    # The row labels are shared with the left panel, so hide them here.
    axes[1].set_ylabel("")
    axes[1].set_aspect('equal')
    axes[1].tick_params(left=False, labelleft=False)
    plt.tight_layout()
    plt.show()

In [None]:
def get_prompt_groups(datf):
    """Split ``datf`` by prompt id.

    Returns a list of six DataFrames; element ``k`` holds the rows whose
    ``"prompt"`` column equals ``k`` (for ``k`` in 0..5).
    """
    groups = []
    for prompt_id in range(6):
        belongs_to_prompt = datf["prompt"] == prompt_id
        groups.append(datf.loc[belongs_to_prompt])
    return groups

def get_ground_truth(datf):
    """Return the rows of ``datf`` whose ``"prompt"`` column equals -1."""
    is_ground_truth = datf["prompt"].eq(-1)
    return datf.loc[is_ground_truth]

def get_human_paths(datf):
    """Return the distinct ``"path"`` values of the rows with ``"prompt" == -1``.

    The result is whatever ``Series.unique()`` yields, in order of first
    appearance.
    """
    human_rows = datf["prompt"] == -1
    return datf.loc[human_rows, "path"].unique()

## Model coverage

In [None]:
# Switch matplotlib to the Qt backend, so the figures below open in separate
# windows instead of being rendered inline.
# NOTE(review): presumably needs a Qt binding and a display - confirm before
# running this notebook headless.
%matplotlib qt

In [None]:

# Coverage: for every (model, prompt) pair, the mean of the "can_determine"
# flag over the matching rows of df. Rows are models, columns are prompts.
conf_mat = np.zeros((6, 6))
for model_idx in range(6):
    rows_of_model = df["model"] == models[model_idx]
    for prompt_idx in range(6):
        cell_rows = df[rows_of_model & (df["prompt"] == prompt_idx)]
        conf_mat[model_idx, prompt_idx] = cell_rows["can_determine"].mean()

coverage = pd.DataFrame(conf_mat, index=models, columns=prompts)

make_heatmap(coverage, "coverage")
# Row means aggregate per model, column means per prompt.
print(f"Model-wise confidence: {conf_mat.mean(axis=1)}")
print(f"Prompt-wise confidence: {conf_mat.mean(axis=0)}")


## Accuracy testing

In [None]:
# Birth-year accuracy: for every (model, prompt) pair, the share of ground-truth
# paths for which the predicted birth year equals the ground-truth one.
df_gt = df[df["prompt"] == -1]["path"].unique()
prompt_groups = [df[df["prompt"] == i] for i in range(6)]
prompt_h = df[df["prompt"] == -1]

birth_year_acc_matrix = np.zeros((6,6))

for j, pg in enumerate(prompt_groups):
    model_groups = [pg[pg["model"] == models[m]] for m in range(6)]
    for i, mod in enumerate(model_groups):
        for _, row in mod.iterrows():
            if row["path"] not in df_gt:
                continue
            prompt_row = prompt_h.loc[prompt_h["path"] == row["path"]]
            if prompt_row.empty:
                continue
            # Compare against the first ground-truth row, like the other
            # accuracy cells. Comparing against the whole Series and calling
            # int() on it relies on deprecated single-element coercion and
            # fails for duplicated paths; the former bare `except:` hid both.
            true_birth_year = prompt_row["birth_year"].iloc[0]
            # A missing (NaN) birth year never compares equal, so it is
            # counted as a miss.
            birth_year_acc_matrix[i, j] += int(row["birth_year"] == true_birth_year)
# Normalise by the number of ground-truth paths.
birth_year_accuracy = pd.DataFrame(birth_year_acc_matrix/len(df_gt), index=models, columns=prompts)
make_heatmap(birth_year_accuracy, "Birth year accuracy")

## Photo year accuracy

In [None]:
# Photo-year accuracy: a prediction is a hit when it falls inside the
# ground-truth [left_photo_year, right_photo_year] interval, or when both the
# ground-truth interval and the prediction are missing.
df_gt = df[df["prompt"] == -1]["path"].unique()
prompt_groups = [df[df["prompt"] == i] for i in range(6)]
prompt_h = df[df["prompt"] == -1]
photo_year_acc_matrix = np.zeros((6,6))

for j, pg in enumerate(prompt_groups):
    model_groups = [pg[pg["model"] == models[m]] for m in range(6)]
    for i, mod in enumerate(model_groups):
        for _, row in mod.iterrows():
            if row["path"] not in df_gt:
                continue
            try:
                prompt_row = prompt_h.loc[prompt_h["path"] == row["path"]]

                if prompt_row.empty:
                    continue

                left_photo_year = prompt_row["left_photo_year"].iloc[0]
                right_photo_year = prompt_row["right_photo_year"].iloc[0]
                predicted_year = row["year_of_photo_int"]

                # pd.isna() is already true for None, so no separate None check.
                truth_missing = pd.isna(left_photo_year) or pd.isna(right_photo_year)
                prediction_missing = pd.isna(predicted_year)

                if truth_missing:
                    # Unknown ground truth: only "unknown" counts as correct.
                    if prediction_missing:
                        photo_year_acc_matrix[i, j] += 1
                elif not prediction_missing:
                    # Inclusive on both ends of the ground-truth interval.
                    if predicted_year in range(int(left_photo_year), int(right_photo_year) + 1):
                        photo_year_acc_matrix[i, j] += 1
            except Exception as e:
                # Report the problem and keep going with the next row.
                print(e)
                continue


# Normalise by the number of ground-truth paths.
photo_year_accuracy = pd.DataFrame(photo_year_acc_matrix/len(df_gt), index=models, columns=prompts)

make_heatmap(photo_year_accuracy, "Photo year accuracy")


## Age accuracy - all samples

In [None]:
def is_range_within(inner_start, inner_end, outer_start, outer_end):
    """Score whether [inner_start, inner_end] lies inside [outer_start, outer_end].

    A range counts as missing when either of its bounds is None/NaN.

    Returns
    -------
    int
        1 when both ranges are missing, 0 when exactly one of them is missing,
        otherwise 1 if ``outer_start <= inner_start`` and
        ``inner_end <= outer_end`` (bounds inclusive), else 0.
        Always a plain ``int``, so the value can be summed directly.
    """
    # pd.isna() is true for None as well as NaN.
    inner_missing = pd.isna(inner_start) or pd.isna(inner_end)
    outer_missing = pd.isna(outer_start) or pd.isna(outer_end)

    if inner_missing and outer_missing:
        return 1
    if inner_missing or outer_missing:
        return 0
    return int(outer_start <= inner_start and inner_end <= outer_end)


        
# Age accuracy over all samples: a prediction is scored with is_range_within()
# against the age interval of the first ground-truth row for the same path.
df_gt = df.loc[df["prompt"] == -1, "path"].unique()
prompt_groups = [df[df["prompt"] == i] for i in range(6)]
prompt_h = df[df["prompt"] == -1]
age_acc_matrix = np.zeros((6,6))

for j, pg in enumerate(prompt_groups):
    for i in range(6):
        mod = pg[pg["model"] == models[i]]
        print(j, i)
        for _, row in mod.iterrows():
            # Only paths that have a ground-truth row are scored.
            if row["path"] not in df_gt:
                continue
            try:
                prompt_row = prompt_h[prompt_h["path"] == row["path"]]
                if prompt_row.empty:
                    continue

                first_truth = prompt_row.iloc[0]
                left_age = first_truth["min_age"]
                right_age = first_truth["max_age"]

                score = is_range_within(row["min_age"], row["max_age"], left_age, right_age)
                age_acc_matrix[i, j] += score
            except Exception as e:
                # Report the problem and move on to the next row.
                print(e)
                continue


# Normalise by the number of ground-truth paths.
age_accuracy = pd.DataFrame(age_acc_matrix/len(df_gt), index=models, columns=prompts)

make_heatmap(age_accuracy, "Age accuracy - all samples")

## Age accuracy - only confident

In [None]:
# Age accuracy restricted to rows with can_determine == True. Each
# (model, prompt) cell is normalised by the number of rows scored for it.
conf_df = df[df["can_determine"] == True]
df_gt = get_human_paths(conf_df)
prompt_groups = get_prompt_groups(conf_df)
prompt_h = get_ground_truth(conf_df)
age_acc_conf_matrix = np.zeros((6,6))
count_mat = np.zeros((6,6))
for j, pg in enumerate(prompt_groups):
    model_groups = [pg[pg["model"] == models[m]] for m in range(6)]
    for i, mod in enumerate(model_groups):
        print(j, i)
        for _, row in mod.iterrows():
            if row["path"] in df_gt:
                count_mat[i, j] += 1
                try:
                    prompt_row = prompt_h.loc[prompt_h["path"] == row["path"]]

                    if prompt_row.empty:
                        continue

                    left_age = prompt_row["min_age"].iloc[0]
                    right_age = prompt_row["max_age"].iloc[0]

                    age_acc_conf_matrix[i, j] += is_range_within(row["min_age"], row["max_age"], left_age, right_age)

                except Exception as e:
                    # Report the problem and move on to the next row.
                    print(e)

                    continue

# A cell with no scored rows has no defined accuracy: leave it NaN explicitly
# instead of relying on a 0/0 division, which emits a RuntimeWarning.
conf_mat_acc = np.full_like(age_acc_conf_matrix, np.nan)
np.divide(age_acc_conf_matrix, count_mat, out=conf_mat_acc, where=count_mat > 0)

age_acc_conf = pd.DataFrame(conf_mat_acc, index=models, columns=prompts)

make_heatmap(age_acc_conf, "Age accuracy - only confident")

## Age accuracy - only confident, determinable

In [None]:
# Age accuracy restricted to rows with can_determine == True and a known
# min_age. Each (model, prompt) cell is normalised by the number of rows
# scored for it.
conf_df = df[(df["can_determine"] == True) & (~pd.isna(df["min_age"]))]
df_gt = get_human_paths(conf_df)
prompt_groups = get_prompt_groups(conf_df)
prompt_h = get_ground_truth(conf_df)
age_acc_conf_matrix_no_nan = np.zeros((6,6))
count_mat = np.zeros((6,6))
for j, pg in enumerate(prompt_groups):
    model_groups = [pg[pg["model"] == models[m]] for m in range(6)]
    for i, mod in enumerate(model_groups):
        for _, row in mod.iterrows():
            if row["path"] in df_gt:
                count_mat[i, j] += 1
                try:
                    prompt_row = prompt_h.loc[prompt_h["path"] == row["path"]]

                    if prompt_row.empty:
                        continue

                    left_age = prompt_row["min_age"].iloc[0]
                    right_age = prompt_row["max_age"].iloc[0]
                    right = is_range_within(row["min_age"], row["max_age"], left_age, right_age)
                    age_acc_conf_matrix_no_nan[i, j] += right
                except Exception as e:
                    # Report the problem and move on to the next row.
                    print(e)

                    continue

# A cell with no scored rows has no defined accuracy: leave it NaN explicitly
# instead of relying on a 0/0 division, which emits a RuntimeWarning.
conf_mat_acc = np.full_like(age_acc_conf_matrix_no_nan, np.nan)
np.divide(age_acc_conf_matrix_no_nan, count_mat, out=conf_mat_acc, where=count_mat > 0)

age_acc_conf_no_nan = pd.DataFrame(conf_mat_acc, index=models, columns=prompts)

make_heatmap(age_acc_conf_no_nan, "Age accuracy - only confident, determinable")

## Comparison heatmaps

In [None]:
# Side-by-side view of the two confident-only age accuracy matrices.
make_comparisson_heatmap(
    data1=age_acc_conf,
    data2=age_acc_conf_no_nan,
    title1="Age accuracy - confident",
    title2="Age accuracy - confident, determinable",
)

In [None]:
# "Effective" accuracy: each accuracy matrix multiplied cell by cell with the
# coverage matrix, shown side by side.
make_comparisson_heatmap(
    data1=coverage*age_acc_conf,
    data2=coverage*age_acc_conf_no_nan,
    title1="Effective accuracy - confident",
    title2="Effective accuracy - confident and determinable",
)

