In [None]:
import os
import numpy as np
import re
from pathlib import Path
from collections import defaultdict

# Pick the transcript root for the current platform: a mapped drive letter
# when running on Windows ('nt'), a mounted volume path otherwise.
WINDOWS = os.name == 'nt'
if WINDOWS:
    INPUT_DIR = Path('W:/Portrait/Embeddings/Portrait Transcripts')
else:
    INPUT_DIR = Path('/Volumes/mgialou/Portrait/Embeddings/Portrait Transcripts')

# The embeddings folder sits directly under the transcript root.
OUTPUT_DIR = INPUT_DIR.joinpath('embeddings')
print(f"Scanning for embeddings in: {OUTPUT_DIR}")

# Two-level lookup tables: outer key -> inner key -> list. Missing keys at
# either level are created on first access.
def _per_question():
    """Build the inner mapping whose missing keys default to an empty list."""
    return defaultdict(list)


all_emb = defaultdict(_per_question)
empty_paths = defaultdict(_per_question)

# Captures the core of a file stem, dropping an optional leading "<digits>_"
# prefix and an optional trailing "_EMPTY" marker.
RX = re.compile(r'^(?:\d+_)?(.+?)(?:_EMPTY)?$')

# Walk every .npy file below OUTPUT_DIR. The parent folder name is used as the
# model key and the question key is parsed out of the file stem.
for npy_file in OUTPUT_DIR.rglob('*.npy'):
    model_name = npy_file.parent.name
    file_stem = npy_file.stem
    question = RX.match(file_stem).group(1)

    # Placeholder files are only recorded here; they are filled in later.
    if file_stem.endswith('_EMPTY'):
        empty_paths[model_name][question].append(npy_file)
        continue

    try:
        loaded = np.load(npy_file)
        if loaded.size > 0:
            # Store each embedding flattened to a 1-D vector.
            all_emb[model_name][question].append(loaded.ravel())
    except Exception as err:
        # Best effort: report the unreadable file and keep scanning.
        print(f"Warning loading {npy_file}: {err}")

# Tally the placeholder files recorded across every model and question.
total_empty = 0
for per_question in empty_paths.values():
    total_empty += sum(map(len, per_question.values()))
print(f"\nFound {total_empty} EMPTY.npy files\n")

# impute: overwrite each recorded *_EMPTY.npy placeholder with the mean of the
# non-empty embeddings collected for the same model and question.
imputed = 0
for model, questions in empty_paths.items():
    # Use .get() rather than indexing: all_emb is a defaultdict, and indexing
    # it would insert an empty entry for every model that only has placeholders.
    model_embeddings = all_emb.get(model, {})
    for qname, paths in questions.items():
        emb_list = model_embeddings.get(qname)
        if not emb_list:
            print(f"No non-empty embeddings for {model}/{qname}, skipping {len(paths)} files")
            continue

        # An element-wise mean only exists when every vector has the same
        # shape. Check this explicitly: otherwise np.mean raises on a ragged
        # list and aborts the whole run after some placeholders have already
        # been overwritten.
        shapes = {np.shape(vec) for vec in emb_list}
        if len(shapes) > 1:
            print(f"Inconsistent embedding shapes for {model}/{qname} "
                  f"({sorted(shapes)}), skipping {len(paths)} files")
            continue

        mean_vec = np.mean(emb_list, axis=0)
        for p in paths:
            # Keep the try body to the single call that can fail on I/O.
            try:
                np.save(p, mean_vec)
            except Exception as e:
                # Best effort: report the failure and continue with the rest.
                print(f"Error saving {p}: {e}")
            else:
                imputed += 1
                print(f"Imputed {p}")

print(f"\nImputation complete: {imputed} files overwritten.")