# In [None]:  (notebook cell marker left over from the Jupyter export)
import os
import glob
import numpy as np
from sklearn.decomposition import PCA
from umap.umap_ import UMAP
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D


# -----------------------------------------------------------------------------
# SETTINGS — adjust these to match your environment
# -----------------------------------------------------------------------------

# Switch between the audio-embedding layout (True) and the text-embedding
# layout (False). It selects the root folder, filename suffix and model list.
AUDIO_EMBEDDINGS = False

if not AUDIO_EMBEDDINGS:
    # Text embeddings.
    # Alternative root when the share is mounted on Windows:
    #   r'W:/Portrait/Embeddings/Portrait Transcripts/embeddings'
    EMB_ROOT, MEAN = (
        '/Volumes/mgialou/Portrait/Embeddings/Portrait Transcripts/embeddings',
        '',
    )
    # Model sub-folder names expected under each user folder.
    MODEL_IDS = [
        "Qwen3-Embedding-0.6B",
        "roberta-large",
        "bertin-roberta-base-spanish",
        "bert-base-spanish-wwm-cased",
        "bert-base-multilingual-cased",
    ]
else:
    # Audio embeddings: files carry a '-mean' suffix before the extension.
    EMB_ROOT, MEAN = (
        '/Volumes/mgialou/Portrait/Embeddings/Portrait Transcripts/audio_embeddings',
        '-mean',
    )
    MODEL_IDS = [
        "mHuBERT-147",
        "whisper-large-v3",
        "mms-1b-all",
    ]

# Number of PCA components each model's embeddings are reduced to before UMAP.
TARGET_PCA_DIMS = 50

# Dimensionality of the UMAP output (2 for a flat scatter plot).
UMAP_DIMS = 2
# -----------------------------------------------------------------------------

# Accumulators declared here but not populated anywhere in this section
# (presumably consumed elsewhere in the notebook — verify before removing).
all_reduced = []
meta        = []

# For every model: load its embeddings, PCA-reduce them, project the result
# with UMAP and draw a scatter plot coloured by questionnaire/question label.
for model_id in MODEL_IDS:
    # 1) Load embeddings and build exactly one label per embedding row.
    pattern = os.path.join(EMB_ROOT, '*', model_id, '*' + MEAN + '.npy')
    embs = []
    questions = []
    # Sorted so the row order does not depend on the filesystem listing order.
    for p in sorted(glob.glob(pattern)):
        arr = np.load(p)
        if arr.size == 0:
            continue
        # A 1-D vector becomes a single row; this is what np.vstack would do
        # anyway, but it lets us count the rows this file contributes.
        arr = np.atleast_2d(arr)
        embs.append(arr)
        # The label is the full file stem (e.g. "GR_Survey_q2"), not just the
        # trailing question id.
        stem = os.path.basename(p).replace('.npy', '')
        if AUDIO_EMBEDDINGS:
            # Audio stems: keep only underscore-separated fields 1..3.
            stem = '_'.join(stem.split("_")[1:4])
        # Repeat the label for every row so labels stay aligned with the
        # stacked matrix even when a file holds more than one embedding.
        questions.extend([stem] * arr.shape[0])
    if not embs:
        print(f"[{model_id}] no embeddings found, skipping.")
        continue

    X = np.vstack(embs)  # shape (n_samples, orig_dim)
    print(f"[{model_id}] {X.shape[0]} samples, dim={X.shape[1]}")

    # 2) PCA → reduce to TARGET_PCA_DIMS.
    # PCA rejects n_components > min(n_samples, n_features), so clamp it
    # instead of crashing on small models or small sample sets.
    n_components = min(TARGET_PCA_DIMS, X.shape[0], X.shape[1])
    if n_components < TARGET_PCA_DIMS:
        print(f"[{model_id}] PCA limited to {n_components} components.")
    pca = PCA(n_components=n_components)
    Xp  = pca.fit_transform(X)

    # 3) UMAP → low-dimensional projection (UMAP_DIMS components).
    umap_proj = UMAP(n_components=UMAP_DIMS)
    X2        = umap_proj.fit_transform(Xp)

    # 4) Plot.
    unique_qs = sorted(set(questions))
    # plt.cm.get_cmap was removed in Matplotlib 3.9; plt.get_cmap is the
    # supported equivalent.
    cmap = plt.get_cmap('tab10')
    # Spread labels over [0, 1]; max(..., 1) avoids a division by zero when
    # there is only one label. NOTE: tab10 offers 10 distinct colours, so
    # more than 10 labels will share colours.
    span = max(len(unique_qs) - 1, 1)
    color_map = {q: cmap(i / span) for i, q in enumerate(unique_qs)}

    plt.figure(figsize=(10, 10))
    # One vectorised call instead of one scatter artist per point.
    plt.scatter(
        X2[:, 0],
        X2[:, 1],
        c=[color_map[q] for q in questions],
        s=20,
        alpha=0.8,
    )

    # Proxy artists so the legend shows one opaque marker per label.
    handles = [
        Line2D([0], [0], marker='o', color=color_map[q], linestyle='', label=q)
        for q in unique_qs
    ]
    plt.legend(handles=handles, title="Questionnaire_Question", bbox_to_anchor=(1, 1))
    plt.title(f"UMAP of `{model_id}` embeddings")
    plt.xlabel("UMAP 1")
    plt.ylabel("UMAP 2")
    plt.tight_layout()
    plt.show()