In [2]:
# check_flacs.py
import os
import numpy as np

# Read via soundfile (the most reliable option for FLAC)
import soundfile as sf

# To cross-check against torchaudio as well, uncomment:
# import torchaudio
# torchaudio.set_audio_backend("soundfile")

def _failed_result(path, message):
    """Build the result dict for a file that could not be opened or decoded."""
    # Carry the same keys as a successful result so callers can index every
    # result uniformly; the measurements are None because nothing was measured.
    return {
        "path": path,
        "ok": False,
        "issues": [message],
        "sr": None,
        "dur": None,
        "frames": None,
        "peak": None,
    }


def inspect_audio(path, min_sec=0.05):
    """
    Run basic sanity checks on one audio file read through soundfile.

    Only the first channel of the decoded signal is inspected.

    Args:
        path: file path handed to ``sf.info`` and ``sf.read``.
        min_sec: minimum acceptable duration in seconds; shorter files are
            reported as ``too_short_<dur>s``.

    Returns:
        dict with the keys:
          path: the input path
          ok: bool, True when no issue was found
          issues: list[str] of detected problems (empty when ok)
          sr: sample rate
          dur: float, duration in seconds
          frames: int, number of samples in the first channel
          peak: float, maximum absolute sample value of the first channel
        If ``sf.info`` or ``sf.read`` raises, ``ok`` is False, ``issues`` holds a
        single ``sf.info_failed: ...`` / ``sf.read_failed: ...`` entry and
        ``sr``, ``dur``, ``frames`` and ``peak`` are None.
    """
    issues = []

    # Header-level check first: cheap, and catches files that cannot be opened.
    try:
        info = sf.info(path)
        sr = int(info.samplerate)
        frames = int(info.frames)
        if frames <= 0:
            issues.append("no_frames")
    except Exception as e:
        # Deliberately broad: one unreadable file must not abort a batch check.
        return _failed_result(path, f"sf.info_failed: {e}")

    # Full decode: catches corruption that the header alone does not reveal.
    try:
        data, sr2 = sf.read(path, dtype="float32", always_2d=True)
        if sr2 != sr:
            sr = sr2  # trust the rate reported by the actual read
    except Exception as e:
        return _failed_result(path, f"sf.read_failed: {e}")

    # always_2d=True yields a 2-D array; take the first column (first channel).
    x = data[:, 0]

    # Basic signal checks.
    if x.size == 0:
        issues.append("empty")
    if not np.isfinite(x).all():
        issues.append("nan_or_inf")
    peak = float(np.max(np.abs(x))) if x.size > 0 else 0.0
    if peak == 0.0:
        issues.append("all_zeros")
    if peak > 1.01:
        issues.append(f"clipping_peak={peak:.3f}")

    # Guard against a zero sample rate to avoid a division by zero.
    dur = (len(x) / sr) if sr > 0 else 0.0
    if dur < min_sec:
        issues.append(f"too_short_{dur:.3f}s")

    return {
        "path": path,
        "ok": len(issues) == 0,
        "issues": issues,
        "sr": sr,
        "dur": dur,
        "frames": len(x),
        "peak": peak,
    }

def check_many(paths):
    """
    Inspect every file in ``paths`` and print one status line per file.

    Each path is passed to ``inspect_audio``. Healthy files are reported as
    ``[OK ]`` with sample rate, duration and peak; files with issues are
    reported as ``[BAD]`` with their issue list. A summary line is printed at
    the end.

    Args:
        paths: sequence of file paths; it must support ``len()`` because the
            summary line reports the total count.

    Returns:
        list of the ``inspect_audio`` result dicts whose ``ok`` flag is False.
    """
    failures = []
    for idx, audio_path in enumerate(paths):
        report = inspect_audio(audio_path)
        if report["ok"]:
            print(f"[OK ] {idx}: {os.path.basename(audio_path)} sr={report['sr']} dur={report['dur']:.2f}s peak={report['peak']:.3f}")
            continue
        # Anything that is not OK is collected for the caller and flagged.
        failures.append(report)
        print(f"[BAD] {idx}: {os.path.basename(audio_path)} -> {report['issues']}")
    print("\nSummary:", f"{len(failures)} / {len(paths)} problematic")
    return failures

# ==== put your own paths here ====
# Files handed to check_many() when this script is run directly.
paths = [
    "/Users/mabondy/Documents/dla_asr_hw1/data/datasets/librispeech/dev-clean/3536/8226/3536-8226-0014.flac",
    "/Users/mabondy/Documents/dla_asr_hw1/data/datasets/librispeech/dev-clean/3536/8226/3536-8226-0027.flac",
    "/Users/mabondy/Documents/dla_asr_hw1/data/datasets/librispeech/dev-clean/777/126732/777-126732-0078.flac",
    "/Users/mabondy/Documents/dla_asr_hw1/data/datasets/librispeech/dev-clean/6241/61946/6241-61946-0019.flac",
    "/Users/mabondy/Documents/dla_asr_hw1/data/datasets/librispeech/dev-clean/5895/34615/5895-34615-0018.flac",
    "/Users/mabondy/Documents/dla_asr_hw1/data/datasets/librispeech/dev-clean/3752/4943/3752-4943-0017.flac",
    "/Users/mabondy/Documents/dla_asr_hw1/data/datasets/librispeech/dev-clean/1272/135031/1272-135031-0016.flac",
    "/Users/mabondy/Documents/dla_asr_hw1/data/datasets/librispeech/dev-clean/6345/64257/6345-64257-0015.flac",
]
# ===============================

if __name__ == "__main__":
    # Check every configured file; `bad` holds the results that had issues.
    bad = check_many(paths)
    # When anything failed, list each offending file with its issues.
    if bad:
        print("\nProblematic files:")
        print("\n".join(f"{r['path']} :: {', '.join(r['issues'])}" for r in bad))


[OK ] 0: 3536-8226-0014.flac sr=16000 dur=2.27s peak=0.326
[OK ] 1: 3536-8226-0027.flac sr=16000 dur=2.27s peak=0.291
[OK ] 2: 777-126732-0078.flac sr=16000 dur=2.27s peak=0.328
[OK ] 3: 6241-61946-0019.flac sr=16000 dur=2.27s peak=0.622
[OK ] 4: 5895-34615-0018.flac sr=16000 dur=2.27s peak=0.493
[OK ] 5: 3752-4943-0017.flac sr=16000 dur=2.27s peak=0.688
[OK ] 6: 1272-135031-0016.flac sr=16000 dur=2.28s peak=0.539
[OK ] 7: 6345-64257-0015.flac sr=16000 dur=2.29s peak=0.352

Summary: 0 / 8 problematic
