# Video to Audio (Data Acquisition)

This notebook:
- Downloads lecture videos from a YouTube playlist
- Stores videos in MP4 format
- Converts MP4 videos to MP3 audio using FFmpeg

NOTE:
- This notebook should be run ONLY ONCE.
- Re-run ONLY if the playlist changes.


#### Imports & Paths

In [None]:
import os
import subprocess
from yt_dlp import YoutubeDL


# Base project paths

In [None]:

# Root folder of the project; the download and conversion cells read and
# write beneath it.
# NOTE(review): the path looks like a Google Drive mount inside Colab --
# confirm the drive is mounted before running this cell.
BASE_DIR = "/content/drive/MyDrive/RAG_BAS_PROJECT"

# Source videos and the extracted audio live in sibling sub-folders.
VIDEO_DIR, AUDIO_DIR = (
    os.path.join(BASE_DIR, subfolder) for subfolder in ("Videos", "Audios")
)

# Create both folders up front; exist_ok makes re-running the cell harmless.
for project_dir in (VIDEO_DIR, AUDIO_DIR):
    os.makedirs(project_dir, exist_ok=True)

print("Directories ready")


#### Convert MP4 → MP3 (Clean & Safe)

In [None]:
# Gather the names (not full paths) of every ".mp4" entry in VIDEO_DIR.
# The extension match is case-sensitive and the result is ordered
# lexicographically by file name.
video_files = [
    entry for entry in os.listdir(VIDEO_DIR) if entry.endswith(".mp4")
]
video_files.sort()

print(f"Found {len(video_files)} video files for conversion")


In [None]:
# Convert every MP4 listed in `video_files` to an MP3 inside AUDIO_DIR.
#
# Expected input name:  "<lecture number> - <lecture title>.mp4"
# Resulting output name: "<lecture number>_<lecture title>.mp3"
#
# Robustness notes:
# - ffmpeg writes to a temporary "<output>.part" file that is renamed to the
#   final name only after ffmpeg exits with status 0. A failed or interrupted
#   run therefore never leaves a truncated MP3 behind that the
#   "already exists" check would skip forever on later runs.
# - Files whose name lacks the " - " separator are reported and skipped
#   instead of aborting the whole batch with an IndexError.
# - The final message reflects what actually happened.
name_separator = " - "
failed_conversions = []  # (video file name, reason) pairs

for video in video_files:
    input_path = os.path.join(VIDEO_DIR, video)

    # Strip only the trailing extension; a ".mp4" inside the title is kept.
    video_stem, _extension = os.path.splitext(video)
    lecture_number, found_separator, lecture_title = video_stem.partition(
        name_separator
    )
    if not found_separator:
        print(f"Skipping (unexpected file name, no ' - ' separator): {video}")
        failed_conversions.append((video, "file name has no ' - ' separator"))
        continue

    output_audio = f"{lecture_number}_{lecture_title}.mp3"
    output_path = os.path.join(AUDIO_DIR, output_audio)

    # Skip if MP3 already exists (IMPORTANT): keeps re-runs cheap.
    if os.path.exists(output_path):
        print(f"Skipping (already exists): {output_audio}")
        continue

    print(f"Converting: {video}")

    # The temporary name does not end in ".mp3", so the container format is
    # forced with "-f mp3" rather than inferred from the extension.
    partial_path = output_path + ".part"
    try:
        result = subprocess.run(
            [
                "ffmpeg",
                "-nostdin",
                "-loglevel", "error",  # keep stderr limited to real errors
                "-y",
                "-i", input_path,
                "-f", "mp3",
                partial_path,
            ],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.PIPE,
            text=True,
            errors="replace",
        )
        if result.returncode == 0:
            # Rename within the same directory: publish the finished file.
            os.replace(partial_path, output_path)
        else:
            error_lines = result.stderr.strip().splitlines()
            reason = error_lines[-1] if error_lines else "no error output"
            print(f"FAILED (ffmpeg exit code {result.returncode}): {video}")
            failed_conversions.append(
                (video, f"ffmpeg exit code {result.returncode}: {reason}")
            )
    finally:
        # Remove leftovers after a failure or an interruption.
        if os.path.exists(partial_path):
            os.remove(partial_path)

if failed_conversions:
    print(
        f"Finished with problems: {len(failed_conversions)} of "
        f"{len(video_files)} videos were not converted:"
    )
    for failed_video, reason in failed_conversions:
        print(f"  - {failed_video}: {reason}")
else:
    print("All videos converted to MP3 successfully!")


In [None]:
# Final marker cell: signals that every cell above has been executed.
completion_message = "02_video_to_audio.ipynb completed successfully."
print(completion_message)
