In [None]:
import glob
import math
import numpy as np
import matplotlib.pyplot as plt

In [None]:
RESULTS_GLOB = "../results/huggingface/*.npz"


def load_attention_stats(paths):
    """Summarise the visual-attention arrays stored in a list of ``.npz`` files.

    For every file, the ``visual_attention_weight_sums`` array is reduced along
    axis 0 to a mean and a standard deviation, and the pair is stored under the
    string form of the file's ``model_path`` entry.

    Parameters
    ----------
    paths : iterable of str
        ``.npz`` files to read, processed in the order given.

    Returns
    -------
    stats : dict
        ``{model_name: {"mean": ndarray, "std": ndarray}}``. If two files share
        the same ``model_path``, the later file overwrites the earlier one.
    peak : number
        Largest mean value seen across all files (``0`` when ``paths`` is empty
        or no mean exceeds zero).
    """
    stats = {}
    peak = 0
    for path in paths:
        # NOTE(security): allow_pickle=True lets a crafted file execute arbitrary
        # code on load; only point this at result files you produced yourself.
        # The context manager closes the underlying zip archive, which a bare
        # np.load() call would leave open for every file.
        with np.load(path, allow_pickle=True) as archive:
            model_name = str(archive["model_path"])
            # Indexing materialises the array, so it stays valid after close.
            attn = archive["visual_attention_weight_sums"]

        # assumes axis 0 indexes samples and the remaining axis indexes layers
        # -- TODO confirm against the script that writes these files
        mean_attn = attn.mean(axis=0)
        std_attn = attn.std(axis=0)

        stats[model_name] = {
            "mean": mean_attn,
            "std": std_attn,
        }

        peak = max(peak, mean_attn.max())

    return stats, peak


# glob returns matches in filesystem-dependent order; sorting keeps the order of
# `results` (and therefore of the subplots drawn from it) reproducible.
paths = sorted(glob.glob(RESULTS_GLOB))

results, max_val = load_attention_stats(paths)

In [None]:
# Draw one bar chart per model (mean attention per layer, error bars = std),
# laid out on a grid and sharing a common y-axis so panels are comparable.
if not results:
    # Without this guard the grid would have zero rows and a zero-height figure.
    raise ValueError("`results` is empty: no attention statistics to plot.")

n = 2  # number of subplot columns
m = math.ceil(len(results) / n)  # rows needed to fit every model

# Shared upper limit: the tallest bar *including* its error bar. Using only the
# means would clip any whisker that reaches above the headroom.
y_top = max((stats["mean"] + stats["std"]).max() for stats in results.values())

fig = plt.figure(figsize=(10 * n, 4 * m))

for i, (model_name, attn_dict) in enumerate(results.items()):
    ax = fig.add_subplot(m, n, i + 1)

    mean_attn = attn_dict["mean"]
    std_attn = attn_dict["std"]
    # assumes the statistics are 1-D, one entry per layer -- TODO confirm
    layers = range(len(mean_attn))

    ax.bar(layers, mean_attn, yerr=std_attn, capsize=5)
    ax.set_xlabel("Layer Index")
    ax.set_ylabel("Attention Weight")
    ax.set_title(str(model_name))
    ax.set_xticks(layers)
    if y_top > 0:
        # 10% headroom above the tallest whisker; skipped for all-zero/NaN data
        # so matplotlib can autoscale instead of receiving a degenerate range.
        ax.set_ylim(0, y_top * 1.1)
    ax.grid(axis="y", linestyle="--", alpha=0.7)

fig.suptitle("Visual Attention Overlap with Vision Tokens", fontsize=16)

# Reserve the top 5% of the canvas for the suptitle.
fig.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()
