# Video Audio Alignment Toolkit Demo

Run this end-to-end pipeline on Google Colab to align speech from a video with extracted frames and export subtitle files.


## 1. Setup Dependencies
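Install the Python packages required by the toolkit, then make sure the `ffmpeg` command-line binary is available (it is installed with `apt-get` if missing).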


In [None]:
%pip install -q openai-whisper ffmpeg-python Pillow numpy moviepy


In [None]:
import shutil
import subprocess

# Make sure the ffmpeg binary is reachable on PATH. When it is already
# present, print its version banner; otherwise install it through apt-get.
# check=True makes any failing command raise instead of continuing silently.
if shutil.which("ffmpeg") is not None:
    subprocess.run(["ffmpeg", "-version"], check=True)
else:
    print("Installing ffmpeg...")
    # Refresh the package index first so the install resolves current packages.
    subprocess.run(
        ["apt-get", "update", "-qq"],
        check=True,
    )
    subprocess.run(
        ["apt-get", "install", "-y", "ffmpeg"],
        check=True,
    )


## 2. Upload a Video File
Upload a short video clip (MP4 recommended).


In [None]:
from pathlib import Path
from google.colab import files

# Open the Colab upload widget and wait for the user to pick a file.
uploads = files.upload()

if uploads:
    # Iterating the upload result yields the uploaded names; the first one
    # is taken as the video to process.
    VIDEO_PATH = Path(next(iter(uploads)))
    print(f"Using video: {VIDEO_PATH}")
else:
    # Nothing was uploaded, so later cells would have no input to work on.
    raise RuntimeError("No file uploaded. Please upload a video file to continue.")


## 3. Run the Alignment Pipeline
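This cell extracts audio, transcribes speech with Whisper, extracts frames, and builds the alignment JSON and SRT files. It imports the toolkit's `align` and `to_srt` modules from the current working directory, so make sure they are present there (for example by uploading them or cloning the repository) before running it.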


In [None]:
from pathlib import Path
import sys

# Put the current working directory on the import path so the toolkit
# modules imported below can be found. This must happen before the imports.
PROJECT_ROOT = Path.cwd()
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

from align import process_video
from to_srt import alignment_to_srt

# All artifacts for this video go under outputs/<video file name without suffix>.
OUTPUT_DIR = Path("outputs", VIDEO_PATH.stem)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# NOTE(review): presumably process_video also writes alignment.json into
# OUTPUT_DIR, since that file is read back below — confirm against align.py.
alignment = process_video(VIDEO_PATH, OUTPUT_DIR, model="small", fps=1.0)

# Convert the alignment JSON into an SRT subtitle file next to it.
alignment_path = OUTPUT_DIR / "alignment.json"
srt_path = OUTPUT_DIR / "captions.srt"
alignment_to_srt(alignment_path, srt_path)

print(
    f"Alignment entries: {len(alignment)}",
    f"Alignment JSON saved to: {alignment_path}",
    f"SRT saved to: {srt_path}",
    sep="\n",
)


## 4. Preview Aligned Segments
Preview the first 10 alignment entries (start time, end time, and text) alongside their thumbnail frames.


In [None]:
import base64
import html

from IPython.display import HTML

# Build an HTML table previewing the first 10 alignment entries: start time,
# end time, transcript text, and the frame matching each entry's start_frame.
frame_dir = OUTPUT_DIR / "frames"
preview_rows = []
for entry in alignment[:10]:
    frame_path = frame_dir / f"frame_{entry['start_frame']:04d}.jpg"
    if frame_path.exists():
        # Embed the JPEG as a base64 data URI rather than linking to the file:
        # a path on the notebook VM's filesystem is not reachable from the
        # Colab output frame, so a plain src="..." path shows a broken image.
        encoded = base64.b64encode(frame_path.read_bytes()).decode("ascii")
        img_html = f'<img src="data:image/jpeg;base64,{encoded}" width="240" />'
    else:
        img_html = "<em>N/A</em>"

    # Escape the transcript so characters such as "<" or "&" are displayed
    # literally instead of being interpreted as markup.
    safe_text = html.escape(str(entry['text']))
    preview_rows.append(
        "<tr><td>{:.2f}</td><td>{:.2f}</td><td>{}</td><td>{}</td></tr>".format(
            entry['start_time'], entry['end_time'], safe_text, img_html
        )
    )

# Show an explicit placeholder row when there is nothing to preview.
if not preview_rows:
    preview_rows.append(
        "<tr><td colspan=4><em>No segments detected.</em></td></tr>"
    )

table_html = (
    "<table>"
    "  <thead>"
    "    <tr><th>Start (s)</th><th>End (s)</th><th>Text</th><th>Frame</th></tr>"
    "  </thead>"
    "  <tbody>"
    + "\n".join(preview_rows)
    + "  </tbody>"
    + "</table>"
)

# Must stay the last expression of the cell so the notebook renders it.
HTML(table_html)


## 5. Download Results
Save the alignment JSON and SRT locally.


In [None]:
from google.colab import files

# Trigger a browser download for each result file, JSON first and then SRT.
for result_path in (alignment_path, srt_path):
    files.download(str(result_path))


## 6. Optional Cleanup
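Remove the entire output directory for this video — extracted frames, intermediates, and the alignment JSON and SRT files — if you are finished. Download the results first, since they are deleted too.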


In [None]:
import shutil

# Best-effort removal of everything produced for this video. ignore_errors=True
# keeps the cell from raising on failures, so check the outcome explicitly
# instead of always reporting success.
shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
if OUTPUT_DIR.exists():
    print(f"Could not fully remove {OUTPUT_DIR}; delete it manually.")
else:
    print("Workspace cleaned up.")
