In [2]:
import os, glob, math, numpy as np, torch
import torchaudio
from tqdm import tqdm
from transformers import AutoFeatureExtractor, HubertModel

In [3]:
# ---- Run configuration -------------------------------------------------------
# AUDIO_DIR is searched recursively by main(); OUT_DIR receives one .npz per
# audio file (written by process_file()).
AUDIO_DIR   = "/mnt/d/Roshidat_Msc_Project/Audio_parkinson/MDVR-KCL_Dataset/26_29_09_2017_KCL/26-29_09_2017_KCL/ReadText/PD"     # folder with .wav/.flac/.mp3...
OUT_DIR     = os.path.join("/mnt/d/Roshidat_Msc_Project/Audio_parkinson/pd&Hc_multi/Hubert embeddings", "PD_ReadText_hubert_features")
# Checkpoint name passed to both AutoFeatureExtractor and HubertModel.
MODEL_NAME  = "facebook/hubert-base-ls960"    # or "facebook/hubert-large-ll60k"
# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE      = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Sample rate (Hz) that load_audio() resamples every file to.
TARGET_SR   = 16000
CHUNK_SEC   = 30.0        # process long files in ~30s chunks to avoid OOM
STRIDE_SEC  = 30.0        # == CHUNK_SEC -> no overlap; keep simple concatenation
SAVE_DTYPE  = np.float32  # set to np.float16 to halve disk space

# File extensions treated as audio; main() compares them case-insensitively.
AUDIO_EXTS  = (".wav", ".flac", ".mp3", ".m4a", ".ogg")

# exist_ok=True keeps this cell safe to re-run.
os.makedirs(OUT_DIR, exist_ok=True)

In [4]:
# Load the preprocessing front-end and the pretrained encoder for MODEL_NAME.
feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_NAME)
model = HubertModel.from_pretrained(MODEL_NAME)
model = model.to(DEVICE)
model.eval()  # inference mode (e.g. disables dropout)

# The encoder is only used for feature extraction, so freeze every weight.
for p in model.parameters():
    p.requires_grad_(False)

def load_audio(path, target_sr=TARGET_SR):
    """Read an audio file as a mono 1-D waveform at ``target_sr``.

    Args:
        path: Audio file to read with ``torchaudio.load``.
        target_sr: Sample rate (Hz) the waveform is resampled to when the
            file's native rate differs.

    Returns:
        ``(waveform, target_sr)`` where ``waveform`` is the channel-averaged
        signal with the channel axis squeezed away.
    """
    waveform, native_sr = torchaudio.load(path)  # [channels, samples]

    # Down-mix multi-channel recordings by averaging the channels.
    n_channels = waveform.size(0)
    if n_channels > 1:
        waveform = waveform.mean(0, keepdim=True)

    if native_sr != target_sr:
        waveform = torchaudio.functional.resample(waveform, native_sr, target_sr)

    mono = waveform.squeeze(0)  # [samples]
    return mono, target_sr

def iter_chunks(wav, sr, chunk_sec=CHUNK_SEC, stride_sec=STRIDE_SEC):
    """Yield successive windows of a 1-D waveform.

    Args:
        wav: 1-D tensor of samples (anything supporting ``numel()`` and slicing).
        sr: Sample rate in Hz, used to convert seconds to sample counts.
        chunk_sec: Window length in seconds. A value that is non-positive, or
            that truncates to zero samples, disables chunking.
        stride_sec: Hop between window starts in seconds.

    Yields:
        ``wav`` itself when chunking is disabled or the signal fits in one
        window; otherwise slices ``wav[start:start + chunk_len]``, with the last
        slice truncated at the end of the signal.

    Raises:
        ValueError: If chunking is needed but ``stride_sec`` truncates to zero
            or fewer samples. Previously this looped forever on the first chunk.
    """
    total = wav.numel()
    chunk_len = int(chunk_sec * sr)

    # A zero-length window would only ever produce empty slices, so treat it
    # the same as "chunking disabled".
    if chunk_sec <= 0 or chunk_len <= 0 or total <= chunk_len:
        yield wav
        return

    stride_len = int(stride_sec * sr)
    if stride_len <= 0:
        raise ValueError(
            f"stride_sec={stride_sec!r} at sr={sr!r} gives a non-positive stride; "
            "chunking would never advance"
        )

    for start in range(0, total, stride_len):
        end = min(start + chunk_len, total)
        yield wav[start:end]
        # Stop once a window reaches the end, so overlapping strides do not
        # emit extra, fully-contained tail windows.
        if end == total:
            break

@torch.no_grad()
def hubert_embed_1d(wav_1d, sr=TARGET_SR):
    """Embed one waveform chunk with the module-level HuBERT ``model``.

    Args:
        wav_1d: 1-D CPU tensor of samples; converted with ``.numpy()``.
        sr: Sample rate (Hz) reported to the feature extractor.

    Returns:
        ``(feats, mask)``: the last hidden state of the single batch item moved
        to the CPU (``[frames, hidden_dim]``), and an all-True boolean mask with
        one entry per frame.
    """
    encoded = feature_extractor(wav_1d.numpy(), sampling_rate=sr, return_tensors="pt")

    # Move every tensor produced by the extractor onto the model's device.
    on_device = {}
    for name, tensor in encoded.items():
        on_device[name] = tensor.to(DEVICE)

    hidden = model(**on_device).last_hidden_state[0]  # [frames, hidden_dim]
    n_frames = hidden.size(0)
    frame_mask = torch.ones(n_frames, dtype=torch.bool)
    return hidden.cpu(), frame_mask

def process_file(path):
    """Extract HuBERT features for one audio file and save them as ``.npz``.

    The output is ``OUT_DIR/<basename>_hubert_feats.npz``. If that file already
    exists the function returns immediately, so interrupted runs can resume.

    The archive is first written to a sibling ``.partial.npz`` file and then
    renamed into place. A crash or interrupt mid-write therefore never leaves a
    truncated ``.npz`` that the "already exists" check would skip forever.

    Saved arrays: ``hubert_embeddings`` ([frames, dim], ``SAVE_DTYPE``),
    ``attention_mask`` ([frames], bool), ``sample_rate``, ``model``,
    ``frame_hz`` (frames divided by audio duration), ``audio_duration_sec``,
    ``chunk_sec``, ``stride_sec`` and ``source_audio``.

    Args:
        path: Path of the audio file to process.

    Raises:
        ValueError: If the decoded audio contains no samples.
    """
    base = os.path.splitext(os.path.basename(path))[0]
    out_path = os.path.join(OUT_DIR, f"{base}_hubert_feats.npz")
    if os.path.isfile(out_path):
        return

    wav, sr = load_audio(path, TARGET_SR)
    if wav.numel() == 0:
        # Fail with a clear message instead of an opaque error from the model.
        raise ValueError("audio file contains no samples")

    feats_all, masks_all = [], []
    for chunk in iter_chunks(wav, sr):
        f, m = hubert_embed_1d(chunk, sr)
        feats_all.append(f)
        masks_all.append(m)

    feats = torch.cat(feats_all, 0).numpy().astype(SAVE_DTYPE)  # [L, D]
    mask  = torch.cat(masks_all, 0).numpy()                     # [L]
    dur   = wav.numel()/sr
    frame_hz = feats.shape[0]/max(dur, 1e-9)

    # The temporary name must end in ".npz": numpy appends that suffix to any
    # filename lacking it, which would make the rename target the wrong file.
    tmp_path = os.path.join(OUT_DIR, f"{base}_hubert_feats.partial.npz")
    try:
        np.savez_compressed(
            tmp_path,
            hubert_embeddings=feats,
            attention_mask=mask,
            sample_rate=np.array(sr),
            model=np.array(MODEL_NAME),
            frame_hz=np.array(frame_hz, dtype=np.float32),
            audio_duration_sec=np.array(dur, dtype=np.float32),
            chunk_sec=np.array(CHUNK_SEC, dtype=np.float32),
            stride_sec=np.array(STRIDE_SEC, dtype=np.float32),
            source_audio=os.path.basename(path),
        )
        os.replace(tmp_path, out_path)
    finally:
        # After a successful rename the temp file is gone; this only cleans up
        # a partial archive left behind by a failed write.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

    print(f"[ok] {os.path.basename(path)} -> {feats.shape}")

def main():
    """Run ``process_file`` on every audio file found under ``AUDIO_DIR``.

    Files are discovered recursively, kept when their lower-cased extension is
    in ``AUDIO_EXTS``, and processed in sorted order. A failure on one file is
    printed as an ``[err]`` line and does not stop the remaining files.
    """
    pattern = os.path.join(AUDIO_DIR, "**", "*")
    files = sorted(
        candidate
        for candidate in glob.glob(pattern, recursive=True)
        if os.path.splitext(candidate)[1].lower() in AUDIO_EXTS
    )
    print(f"Found {len(files)} audio files.")

    for audio_path in tqdm(files, desc="HuBERT"):
        try:
            process_file(audio_path)
        except Exception as e:
            # Best-effort batch: report the failure and continue.
            print(f"[err] {os.path.basename(audio_path)}: {e}")


if __name__ == "__main__":
    main()


Found 16 audio files.


HuBERT:   6%|▋         | 1/16 [00:02<00:31,  2.11s/it]

[ok] ID02_pd_2_0_0.wav -> (7872, 768)


HuBERT:  12%|█▎        | 2/16 [00:03<00:22,  1.61s/it]

[ok] ID04_pd_2_0_1.wav -> (6119, 768)


HuBERT:  19%|█▉        | 3/16 [00:05<00:22,  1.75s/it]

[ok] ID06_pd_3_1_1.wav -> (8940, 768)


HuBERT:  25%|██▌       | 4/16 [00:06<00:19,  1.66s/it]

[ok] ID07_pd_2_0_0.wav -> (7379, 768)


HuBERT:  31%|███▏      | 5/16 [00:07<00:15,  1.42s/it]

[ok] ID13_pd_3_2_2.wav -> (4664, 768)


HuBERT:  38%|███▊      | 6/16 [00:09<00:15,  1.55s/it]

[ok] ID16_pd_2_0_0.wav -> (8357, 768)


HuBERT:  44%|████▍     | 7/16 [00:10<00:12,  1.41s/it]

[ok] ID17_pd_2_1_0.wav -> (5467, 768)


HuBERT:  50%|█████     | 8/16 [00:11<00:09,  1.24s/it]

[ok] ID18_pd_4_3_3.wav -> (4284, 768)


HuBERT:  56%|█████▋    | 9/16 [00:13<00:09,  1.29s/it]

[ok] ID20_pd_3_0_1.wav -> (7049, 768)


HuBERT:  62%|██████▎   | 10/16 [00:14<00:08,  1.34s/it]

[ok] ID24_pd_2_0_0.wav -> (7092, 768)


HuBERT:  69%|██████▉   | 11/16 [00:15<00:05,  1.20s/it]

[ok] ID27_pd_4_1_1.wav -> (4186, 768)


HuBERT:  75%|███████▌  | 12/16 [00:16<00:05,  1.32s/it]

[ok] ID29_pd_3_1_2.wav -> (7758, 768)


HuBERT:  81%|████████▏ | 13/16 [00:18<00:03,  1.32s/it]

[ok] ID30_pd_2_1_1.wav -> (6608, 768)


HuBERT:  88%|████████▊ | 14/16 [00:19<00:02,  1.30s/it]

[ok] ID32_pd_3_1_1.wav -> (3649, 768)


HuBERT:  94%|█████████▍| 15/16 [00:24<00:02,  2.47s/it]

[ok] ID33_pd_3_2_2.wav -> (5915, 768)


HuBERT: 100%|██████████| 16/16 [00:25<00:00,  1.62s/it]

[ok] ID34_pd_2_0_0.wav -> (6391, 768)





In [8]:
# ---- HC / ReadText -----------------------------------------------------------
# main() and process_file() look up AUDIO_DIR / OUT_DIR at call time, so
# re-pointing these two globals is all that is needed to run the same pipeline
# on another folder. load_audio, iter_chunks, hubert_embed_1d, process_file,
# main, feature_extractor and model are already defined by the earlier cell that
# loads HuBERT; they are deliberately not redefined or reloaded here.
AUDIO_DIR   = "/mnt/d/Roshidat_Msc_Project/Audio_parkinson/MDVR-KCL_Dataset/26_29_09_2017_KCL/26-29_09_2017_KCL/ReadText/HC"     # folder with .wav/.flac/.mp3...
OUT_DIR     = os.path.join("/mnt/d/Roshidat_Msc_Project/Audio_parkinson/pd&Hc_multi/Hubert embeddings", "HC_ReadText_hubert_features")

# exist_ok=True: without it, re-running this cell raises FileExistsError.
os.makedirs(OUT_DIR, exist_ok=True)

if __name__ == "__main__":
    main()


Found 21 audio files.


HuBERT:   5%|▍         | 1/21 [00:02<00:44,  2.22s/it]

[ok] ID00_hc_0_0_0.wav -> (7550, 768)


HuBERT:  10%|▉         | 2/21 [00:03<00:36,  1.90s/it]

[ok] ID01_hc_0_0_0.wav -> (8200, 768)


HuBERT:  14%|█▍        | 3/21 [00:05<00:29,  1.63s/it]

[ok] ID03_hc_0_0_0.wav -> (6973, 768)


HuBERT:  19%|█▉        | 4/21 [00:06<00:23,  1.40s/it]

[ok] ID05_hc_0_0_0.wav -> (5543, 768)


HuBERT:  24%|██▍       | 5/21 [00:07<00:22,  1.38s/it]

[ok] ID08_hc_0_0_0.wav -> (7300, 768)


HuBERT:  29%|██▊       | 6/21 [00:08<00:19,  1.31s/it]

[ok] ID09_hc_0_0_0.wav -> (6269, 768)


HuBERT:  33%|███▎      | 7/21 [00:10<00:18,  1.30s/it]

[ok] ID10_hc_0_0_0.wav -> (6834, 768)


HuBERT:  38%|███▊      | 8/21 [00:11<00:17,  1.32s/it]

[ok] ID11_hc_0_0_0.wav -> (7415, 768)


HuBERT:  43%|████▎     | 9/21 [00:12<00:15,  1.29s/it]

[ok] ID12_hc_0_0_0.wav -> (6522, 768)


HuBERT:  48%|████▊     | 10/21 [00:14<00:15,  1.44s/it]

[ok] ID14_hc_0_0_0.wav -> (8415, 768)


HuBERT:  52%|█████▏    | 11/21 [00:16<00:14,  1.49s/it]

[ok] ID15_hc_0_0_0.wav -> (8526, 768)


HuBERT:  57%|█████▋    | 12/21 [00:17<00:12,  1.44s/it]

[ok] ID19_hc_0_0_0.wav -> (7058, 768)


HuBERT:  62%|██████▏   | 13/21 [00:18<00:10,  1.35s/it]

[ok] ID21_hc_0_0_0.wav -> (6106, 768)


HuBERT:  67%|██████▋   | 14/21 [00:20<00:09,  1.41s/it]

[ok] ID22_hc_0_0_0.wav -> (8366, 768)


HuBERT:  71%|███████▏  | 15/21 [00:25<00:15,  2.65s/it]

[ok] ID23_hc_0_0_0.wav -> (10173, 768)


HuBERT:  76%|███████▌  | 16/21 [00:27<00:11,  2.35s/it]

[ok] ID25_hc_0_0_0.wav -> (8712, 768)


HuBERT:  81%|████████  | 17/21 [00:28<00:08,  2.07s/it]

[ok] ID26_hc_0_0_0.wav -> (7681, 768)


HuBERT:  86%|████████▌ | 18/21 [00:29<00:05,  1.86s/it]

[ok] ID28_hc_0_0_0.wav -> (7284, 768)


HuBERT:  90%|█████████ | 19/21 [00:31<00:03,  1.84s/it]

[ok] ID31_hc_0_1_1.wav -> (9642, 768)


HuBERT:  95%|█████████▌| 20/21 [00:32<00:01,  1.63s/it]

[ok] ID35_hc_0_0_0.wav -> (4658, 768)


HuBERT: 100%|██████████| 21/21 [00:34<00:00,  1.64s/it]

[ok] ID36_hc_0_0_0.wav -> (7924, 768)





In [5]:
# ---- HC / SpontaneousDialogue ------------------------------------------------
# main() and process_file() look up AUDIO_DIR / OUT_DIR at call time, so
# re-pointing these two globals is all that is needed to run the same pipeline
# on another folder. load_audio, iter_chunks, hubert_embed_1d, process_file,
# main, feature_extractor and model are already defined by the earlier cell that
# loads HuBERT; they are deliberately not redefined or reloaded here.
AUDIO_DIR   = "/mnt/d/Roshidat_Msc_Project/Audio_parkinson/MDVR-KCL_Dataset/26_29_09_2017_KCL/26-29_09_2017_KCL/SpontaneousDialogue/HC"     # folder with .wav/.flac/.mp3...
OUT_DIR     = os.path.join("/mnt/d/Roshidat_Msc_Project/Audio_parkinson/pd&Hc_multi/HC_Spontaneous", "HC_Spontaneous_hubert_features")

# exist_ok=True: without it, re-running this cell raises FileExistsError.
os.makedirs(OUT_DIR, exist_ok=True)

if __name__ == "__main__":
    main()


Found 21 audio files.


HuBERT:   5%|▍         | 1/21 [00:01<00:26,  1.34s/it]

[ok] ID00_hc_0_0_0s.wav -> (5988, 768)


HuBERT:  10%|▉         | 2/21 [00:02<00:22,  1.21s/it]

[ok] ID01_hc_0_0_0s.wav -> (5850, 768)


HuBERT:  14%|█▍        | 3/21 [00:03<00:23,  1.31s/it]

[ok] ID03_hc_0_0_0s.wav -> (7700, 768)


HuBERT:  19%|█▉        | 4/21 [00:05<00:25,  1.49s/it]

[ok] ID05_hc_0_0_0s.wav -> (9218, 768)


HuBERT:  24%|██▍       | 5/21 [00:06<00:22,  1.40s/it]

[ok] ID08_hc_0_0_0s.wav -> (6414, 768)


HuBERT:  29%|██▊       | 6/21 [00:08<00:20,  1.38s/it]

[ok] ID09_hc_0_0_0s.wav -> (7060, 768)


HuBERT:  33%|███▎      | 7/21 [00:09<00:20,  1.46s/it]

[ok] ID10_hc_0_0_0s.wav -> (8467, 768)


HuBERT:  38%|███▊      | 8/21 [00:11<00:20,  1.59s/it]

[ok] ID11_hc_0_0_0s.wav -> (8305, 768)


HuBERT:  43%|████▎     | 9/21 [00:13<00:19,  1.58s/it]

[ok] ID12_hc_0_0_0s.wav -> (8255, 768)


HuBERT:  48%|████▊     | 10/21 [00:14<00:17,  1.55s/it]

[ok] ID14_hc_0_0_0s.wav -> (6938, 768)


HuBERT:  52%|█████▏    | 11/21 [00:17<00:17,  1.78s/it]

[ok] ID15_hc_0_0_0s.wav -> (11024, 768)


HuBERT:  57%|█████▋    | 12/21 [00:21<00:23,  2.59s/it]

[ok] ID19_hc_0_0_0s.wav -> (4594, 768)


HuBERT:  62%|██████▏   | 13/21 [00:22<00:16,  2.06s/it]

[ok] ID21_hc_0_0_0s.wav -> (4452, 768)


HuBERT:  67%|██████▋   | 14/21 [00:23<00:13,  1.88s/it]

[ok] ID22hc_0_0_0s.wav -> (7506, 768)


HuBERT:  71%|███████▏  | 15/21 [00:25<00:10,  1.73s/it]

[ok] ID23_hc_0_0_0s.wav -> (7189, 768)


HuBERT:  76%|███████▌  | 16/21 [00:26<00:08,  1.62s/it]

[ok] ID25_hc_0_0_0s.wav -> (7083, 768)


HuBERT:  81%|████████  | 17/21 [00:27<00:06,  1.51s/it]

[ok] ID26_hc_0_0_0s.wav -> (6510, 768)


HuBERT:  86%|████████▌ | 18/21 [00:28<00:04,  1.40s/it]

[ok] ID28_hc_0_0_0s.wav -> (5812, 768)


HuBERT:  90%|█████████ | 19/21 [00:29<00:02,  1.21s/it]

[ok] ID31_hc_0_1_1s.wav -> (3815, 768)


HuBERT:  95%|█████████▌| 20/21 [00:31<00:01,  1.28s/it]

[ok] ID35_hc_0_0_0s.wav -> (7418, 768)


HuBERT: 100%|██████████| 21/21 [00:32<00:00,  1.54s/it]

[ok] ID36_hc_0_0_0s.wav -> (5665, 768)





In [6]:
# ---- PD / SpontaneousDialogue ------------------------------------------------
# main() and process_file() look up AUDIO_DIR / OUT_DIR at call time, so
# re-pointing these two globals is all that is needed to run the same pipeline
# on another folder. load_audio, iter_chunks, hubert_embed_1d, process_file,
# main, feature_extractor and model are already defined by the earlier cell that
# loads HuBERT; they are deliberately not redefined or reloaded here.
AUDIO_DIR   = "/mnt/d/Roshidat_Msc_Project/Audio_parkinson/MDVR-KCL_Dataset/26_29_09_2017_KCL/26-29_09_2017_KCL/SpontaneousDialogue/PD"     # folder with .wav/.flac/.mp3...
OUT_DIR     = os.path.join("/mnt/d/Roshidat_Msc_Project/Audio_parkinson/pd&Hc_multi/PD_Spontaneous", "PD_Spontaneous_hubert_features")

# exist_ok=True: without it, re-running this cell raises FileExistsError.
os.makedirs(OUT_DIR, exist_ok=True)

if __name__ == "__main__":
    main()


Found 15 audio files.


HuBERT:   7%|▋         | 1/15 [00:02<00:28,  2.01s/it]

[ok] ID02_pd_2_0_0s.wav -> (9462, 768)


HuBERT:  13%|█▎        | 2/15 [00:03<00:21,  1.65s/it]

[ok] ID04_pd_2_0_1s.wav -> (7340, 768)


HuBERT:  20%|██        | 3/15 [00:04<00:17,  1.45s/it]

[ok] ID06_pd_3_1_1s.wav -> (6411, 768)


HuBERT:  27%|██▋       | 4/15 [00:06<00:18,  1.67s/it]

[ok] ID07_pd_2_0_0s.wav -> (10457, 768)


HuBERT:  33%|███▎      | 5/15 [00:08<00:17,  1.75s/it]

[ok] ID13_pd_3_2_2s.wav -> (9966, 768)


HuBERT:  40%|████      | 6/15 [00:09<00:14,  1.66s/it]

[ok] ID16_pd_2_0_0a.wav -> (7824, 768)


HuBERT:  47%|████▋     | 7/15 [00:11<00:13,  1.68s/it]

[ok] ID17_pd_2_1_0s.wav -> (7712, 768)


HuBERT:  53%|█████▎    | 8/15 [00:12<00:10,  1.48s/it]

[ok] ID20_pd_3_0_1s.wav -> (5511, 768)


HuBERT:  60%|██████    | 9/15 [00:14<00:08,  1.44s/it]

[ok] ID24_pd_2_0_0s.wav -> (6974, 768)


HuBERT:  67%|██████▋   | 10/15 [00:15<00:07,  1.51s/it]

[ok] ID27_pd_4_1_1s.wav -> (8609, 768)


HuBERT:  73%|███████▎  | 11/15 [00:17<00:05,  1.48s/it]

[ok] ID29_pd_3_1_2s.wav -> (7033, 768)


HuBERT:  80%|████████  | 12/15 [00:18<00:04,  1.40s/it]

[ok] ID30_pd_2_1_1s.wav -> (6623, 768)


HuBERT:  87%|████████▋ | 13/15 [00:19<00:02,  1.27s/it]

[ok] ID32_pd_3_1_1s.wav -> (5265, 768)


HuBERT:  93%|█████████▎| 14/15 [00:24<00:02,  2.32s/it]

[ok] ID33_pd_3_2_2s.wav -> (6140, 768)


HuBERT: 100%|██████████| 15/15 [00:25<00:00,  1.69s/it]

[ok] ID34_pd_2_0_0s.wav -> (6092, 768)



