In [None]:
import os
import time
import pandas as pd
import numpy as np
import torch
import torchaudio
from transformers import AutoFeatureExtractor, AutoModel

# Platform switch: True selects the Windows drive-letter paths below and enables
# the audio-path prefix rewrite; False selects the /Volumes mount instead.
WINDOWS = True

# File-name prefixes to exclude. Kept as a tuple so it can be passed directly to
# str.startswith().
exclude_prefixes = ("12HSC", "1HCB2", "1SNB2", "E1LY2", "I12ME", "IG1V2", "V1PR2", "OU12L", "1H2LJ")


# Base directories
if WINDOWS:
    INPUT_DIR = 'W:/Portrait/Embeddings/Portrait Transcripts'
else:
    # Non-Windows mount point of the same share
    INPUT_DIR = '/Volumes/mgialou/Portrait/Embeddings/Portrait Transcripts'

# Embeddings are written below the transcript folder
OUTPUT_BASE_DIR = os.path.join(INPUT_DIR, 'audio_embeddings')
os.makedirs(OUTPUT_BASE_DIR, exist_ok=True)

# Hugging Face model identifiers; one embedding set is produced per entry
MODEL_IDS       = [
    "openai/whisper-large-v3",
    "utter-project/mHuBERT-147",
    "facebook/mms-1b-all"
]
# try to use the richer "soundfile" backend first
# NOTE(review): set_audio_backend is deprecated in recent torchaudio releases;
# the except branch keeps the notebook running if the call is unavailable.
try:
    torchaudio.set_audio_backend("soundfile")
    print("🔊 using torchaudio backend = soundfile")
except Exception:
    print("⚠️  couldn't switch to soundfile backend; staying with default")

# Original path prefix on Mac/Linux (as stored in the 'audio_filepath' column)
original_prefix = '/Users/miltos/Desktop/ftp_portrait/PORTRAIT/'

# Windows-specific replacement prefix. A raw string cannot end with a single
# backslash, and writing r'...\\' yields TWO literal backslashes, so the single
# trailing separator is appended as a normal (escaped) string instead.
windows_prefix = r'P:\1_ejecutando\IN PORTRAIT\INVESTIGACION\3. Experimental phase\Participantes\Participantes_datos\revisados\buenos' + '\\'



In [None]:
def load_and_resample_audio(audio_file: str,
                            target_sample_rate: int = 16000) -> torch.Tensor:
    """
    Load an audio file as a single-channel waveform at ``target_sample_rate``.

    The file is read with ``torchaudio.load``; if that raises for any reason,
    the function falls back to ``soundfile.read``. Only the first channel of a
    multichannel recording is kept.

    Args:
        audio_file: Path to the audio file. Surrounding whitespace and a
            leading ``file://`` scheme are removed before the path is used.
        target_sample_rate: Sample rate (Hz) of the returned waveform.

    Returns:
        A 1D Tensor [num_samples].

    Raises:
        FileNotFoundError: If the cleaned path does not point to a file.
    """
    # Strip whitespace *before* looking for the scheme, otherwise a padded
    # value such as "  file:///x.wav" keeps its "file://" prefix.
    audio_file = audio_file.strip()
    if audio_file.startswith("file://"):
        audio_file = audio_file[len("file://"):].strip()

    if not os.path.isfile(audio_file):
        raise FileNotFoundError(f"Not found: {audio_file!r}")

    try:
        # try torchaudio first
        waveform, orig_sr = torchaudio.load(audio_file)  # -> Tensor[C,L]
        # if multichannel, pick first channel
        if waveform.ndim > 1:
            waveform = waveform[0:1, :]
    except Exception:
        # fallback to soundfile (optional dependency, hence the local import)
        import soundfile as sf
        # soundfile returns float64 unless told otherwise; request float32 so
        # the tensor matches the resampler kernel and the model weights.
        data, orig_sr = sf.read(audio_file, dtype="float32")  # -> ndarray [L] or [L,C]
        if data.ndim > 1:
            data = data[:, 0]                          # take first channel
        waveform = torch.from_numpy(data).unsqueeze(0) # [1, L]

    # resample if needed
    if orig_sr != target_sample_rate:
        resampler = torchaudio.transforms.Resample(
            orig_freq=orig_sr, new_freq=target_sample_rate
        )
        waveform = resampler(waveform)

    # squeeze to 1D [L]
    return waveform.squeeze(0)

In [None]:
def compute_audio_embeddings(audio_tensor: torch.Tensor,
                             feature_extractor,
                             model,
                             device,
                             sampling_rate: int = 16000) -> tuple[np.ndarray, np.ndarray]:
    """
    Run one waveform through a feature extractor + model and pool the hidden states.

    Args:
        audio_tensor: Waveform to embed. A torch tensor is converted to a CPU
            float32 numpy array before being handed to the extractor; any other
            value is passed through unchanged.
        feature_extractor: Callable invoked as
            ``feature_extractor(audio, sampling_rate=..., return_tensors="pt")``
            whose result contains ``"input_features"`` or ``"input_values"``.
        model: Model whose output (or whose ``encoder`` output) exposes
            ``last_hidden_state``.
        device: Device the extracted inputs are moved to before the forward pass.
        sampling_rate: Sample rate (Hz) reported to the feature extractor; must
            match the rate the waveform was resampled to.

    Returns:
        ``(mean_emb, max_emb)``: the hidden states reduced over axis 1 with mean
        and max respectively, each flattened to 1D.

    Raises:
        ValueError: If the extractor output has neither expected key.
    """
    # Hand the extractor a plain float32 array rather than relying on implicit
    # tensor -> ndarray conversion (which also fails for non-CPU tensors).
    if isinstance(audio_tensor, torch.Tensor):
        audio = audio_tensor.detach().cpu().to(torch.float32).numpy()
    else:
        audio = audio_tensor

    inputs = feature_extractor(audio,
                               sampling_rate=sampling_rate,
                               return_tensors="pt")

    # Remember which key the extractor produced so the same name is used as the
    # model keyword below, instead of guessing it from the tensor rank.
    for key in ("input_features", "input_values"):
        if key in inputs:
            break
    else:
        raise ValueError("No 'input_features' or 'input_values' in extractor output")
    x = inputs[key].to(device)

    with torch.no_grad():
        # Whisper‐style vs Wav2Vec2/HuBERT: 3D features go straight through the
        # encoder when the model has one; everything else uses the full model.
        if hasattr(model, "encoder") and x.ndim == 3:
            h = model.encoder(x).last_hidden_state
        else:
            out = model(**{key: x})
            h = out.last_hidden_state

    h = h.cpu().numpy()  # [B, T, D]
    mean_emb = h.mean(axis=1).reshape(-1)
    max_emb  = h.max(axis=1).reshape(-1)
    return mean_emb, max_emb

In [None]:
# (model_id, device) -> (feature_extractor, model). Keeps checkpoints loaded
# between calls so they are not re-read from disk for every workbook. The
# entries stay resident until the kernel restarts or this cell is re-run.
_LOADED_MODELS = {}


def _get_extractor_and_model(model_id: str, device):
    """Return the (feature_extractor, model) pair for model_id, loading it on first use."""
    cache_key = (model_id, str(device))
    if cache_key not in _LOADED_MODELS:
        print(f"  Loading {model_id} …")
        fe = AutoFeatureExtractor.from_pretrained(model_id)
        m  = AutoModel.from_pretrained(model_id).to(device).eval()
        _LOADED_MODELS[cache_key] = (fe, m)
    return _LOADED_MODELS[cache_key]


def process_user_file(xlsx_path: str,
                      output_dir: str,
                      model_ids: list[str]):
    """
    Compute and save audio embeddings for every row of one transcript workbook.

    For each row in xlsx_path:
      - read 'audio_filepath' (rewriting the Mac/Linux prefix when WINDOWS is set)
      - compute mean/max embeddings with each model
      - save them as '<audio basename>-mean.npy' / '-max.npy' inside
        '<output_dir>/<model name>/'

    Rows whose path is missing, not a string, not an existing file, or whose
    audio cannot be loaded are reported and skipped.

    Args:
        xlsx_path: Workbook to read with pandas.
        output_dir: Directory that receives one sub-folder per model.
        model_ids: Hugging Face model identifiers; the part after the last '/'
            is used as the sub-folder name.
    """
    df = pd.read_excel(xlsx_path)

    # load device once
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # fetch all models+extractors (loaded on first use, cached afterwards)
    models = {}
    for model_id in model_ids:
        name = model_id.split("/")[-1]
        models[name] = _get_extractor_and_model(model_id, device)
        # per‐model folder
        os.makedirs(os.path.join(output_dir, name), exist_ok=True)

    # iterate rows
    for idx, row in df.iterrows():
        path = row.get("audio_filepath")

        # Validate the type before calling any string method: an empty cell is
        # read as NaN (a float) and a missing column yields None.
        if not isinstance(path, str):
            print(f"[row {idx}] skipping invalid path: {path!r}")
            continue

        # On Windows, swap only the leading Mac/Linux prefix for the local one
        if WINDOWS and path.startswith(original_prefix):
            path = windows_prefix + path[len(original_prefix):]

        if not os.path.isfile(path):
            print(f"[row {idx}] skipping invalid path: {path!r}")
            continue

        try:
            wav = load_and_resample_audio(path)
        except Exception as e:
            print(f"[row {idx}] error loading {path}: {e}")
            continue

        # Output files are named after the audio file, without its extension
        base = os.path.splitext(os.path.basename(path))[0]
        for name, (fe, m) in models.items():
            mean_emb, max_emb = compute_audio_embeddings(wav, fe, m, device)
            out_dir = os.path.join(output_dir, name)
            np.save(os.path.join(out_dir, f"{base}-mean.npy"), mean_emb)
            np.save(os.path.join(out_dir, f"{base}-max.npy"), max_emb)
            print(f"[row {idx}] → {name}: saved {base}-{{mean,max}}.npy")


In [None]:
# Driver: process every '<user>_transcripts_Model.xlsx' workbook in INPUT_DIR
# and report the total wall-clock time.
t0 = time.perf_counter()
# sorted() gives a reproducible processing order; os.listdir order is arbitrary
for fn in sorted(os.listdir(INPUT_DIR)):
    # Skip hidden files and Excel's '~$' owner files: the latter carry the same
    # suffix as the real workbook while it is open but are not readable workbooks.
    if fn.startswith((".", "~$")) or not fn.endswith("_transcripts_Model.xlsx"):
        continue
    # Skip files whose name starts with any of the excluded prefixes
    if fn.startswith(exclude_prefixes):
        continue
    # The user id is the part of the file name before the first underscore
    user_id = fn.split("_")[0]
    in_path = os.path.join(INPUT_DIR, fn)
    out_dir = os.path.join(OUTPUT_BASE_DIR, user_id)
    os.makedirs(out_dir, exist_ok=True)
    print(f"\n== User {user_id} ({fn}) ==")
    process_user_file(in_path, out_dir, MODEL_IDS)

print(f"\nAll done in {(time.perf_counter()-t0)/60:.2f} min.")