<a href="https://colab.research.google.com/github/muhammadnouman911/Dataset-Preprocessing/blob/main/Untitled3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Step 1: Install Dependencies
!pip install yt-dlp ffmpeg-python pandas opencv-python


Collecting yt-dlp
  Downloading yt_dlp-2025.8.11-py3-none-any.whl.metadata (175 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/175.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m175.5/175.5 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ffmpeg-python
  Downloading ffmpeg_python-0.2.0-py3-none-any.whl.metadata (1.7 kB)
Downloading yt_dlp-2025.8.11-py3-none-any.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m47.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ffmpeg_python-0.2.0-py3-none-any.whl (25 kB)
Installing collected packages: yt-dlp, ffmpeg-python
Successfully installed ffmpeg-python-0.2.0 yt-dlp-2025.8.11


In [None]:
import os
import cv2
import pandas as pd
import subprocess
from pathlib import Path


In [None]:
# Step 2: Define Directories
# Root folder of the dataset and the sub-folder that holds the trimmed clips.
# NOTE(review): the path looks like a mounted Google Drive - confirm the drive
# is mounted before this cell runs.
BASE_DIR = Path("/content/drive/MyDrive/Dataset Processing")
VIDEOS_DIR = BASE_DIR.joinpath("videos")

# Create both folders when absent. Parent directories are NOT created, so this
# raises FileNotFoundError if /content/drive/MyDrive does not exist yet.
BASE_DIR.mkdir(exist_ok=True)
VIDEOS_DIR.mkdir(exist_ok=True)

In [None]:
# Step 3: Helper Functions

def download_and_trim(video_id, start, end, out_path):
    """Download a YouTube video and save one trimmed segment of it.

    The full video is downloaded with ``yt-dlp`` into ``temp_<video_id>.mp4``
    in the current working directory, then ``ffmpeg`` re-encodes the
    ``start``..``end`` span (H.264 video, AAC audio) into ``out_path``.
    The temporary download is deleted afterwards, even when a step fails.

    Args:
        video_id: YouTube video id; inserted into the watch URL.
        start: Segment start, handed to ffmpeg's ``-ss`` as ``str(start)``.
        end: Segment end, handed to ffmpeg's ``-to`` as ``str(end)``.
        out_path: Destination file of the trimmed clip (overwritten if present).

    Raises:
        subprocess.CalledProcessError: If yt-dlp or ffmpeg exits with a
            non-zero status (video unavailable, bad time range, ...).
        FileNotFoundError: If the ``yt-dlp`` or ``ffmpeg`` executable is not
            on PATH (raised by ``subprocess.run``).
    """
    url = f"https://www.youtube.com/watch?v={video_id}"
    temp_path = f"temp_{video_id}.mp4"

    try:
        # Download best quality. Separate video+audio streams get merged by
        # yt-dlp; forcing the merge container to mp4 guarantees the result is
        # really written to temp_path instead of e.g. temp_<id>.webm/.mkv.
        subprocess.run([
            "yt-dlp", "-f", "bestvideo+bestaudio/best",
            "--merge-output-format", "mp4",
            "-o", temp_path, url
        ], check=True)

        # Trim segment. -ss/-to are placed after -i (output seeking): slower
        # than input seeking but both timestamps refer to the source timeline.
        subprocess.run([
            "ffmpeg", "-y", "-i", temp_path,
            "-ss", str(start), "-to", str(end),
            "-c:v", "libx264", "-c:a", "aac", out_path
        ], check=True)
    finally:
        # Remove temp full video; it may not exist if the download failed.
        if os.path.exists(temp_path):
            os.remove(temp_path)

def extract_audio(video_path, audio_path):
    """Extract the audio stream(s) of a video into a separate file.

    Runs ``ffmpeg`` with ``-map a`` so only audio streams are written; the
    output format follows the extension of ``audio_path``. An existing
    ``audio_path`` is overwritten (``-y``).

    Args:
        video_path: Source video file.
        audio_path: Destination audio file.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status,
            e.g. the input is missing or contains no audio stream.
        FileNotFoundError: If the ``ffmpeg`` executable is not on PATH.
    """
    # check=True: without it a failed extraction passes silently and the
    # dataset folder simply ends up without its audio file.
    subprocess.run([
        "ffmpeg", "-y", "-i", video_path,
        "-q:a", "0", "-map", "a", audio_path
    ], check=True)

def extract_face(video_path, x, y, w, h, face_path):
    """Save a cropped region of the middle frame of a video as an image.

    The frame at index ``frame_count // 2`` is read and the rectangle with
    top-left corner ``(x, y)``, width ``w`` and height ``h`` (pixels) is
    written to ``face_path``. The rectangle is clipped to the frame, so a box
    that partly sticks out yields the visible part instead of a wrapped-around
    or empty crop.

    Args:
        video_path: Video file to read.
        x: Left edge of the crop box, in pixels.
        y: Top edge of the crop box, in pixels.
        w: Width of the crop box, in pixels.
        h: Height of the crop box, in pixels.
        face_path: Destination image file.

    Returns:
        True if the image was written; False if no frame could be read, the
        box lies completely outside the frame, or the image could not be
        written (a message is printed in each failure case).
    """
    cap = cv2.VideoCapture(str(video_path))
    try:
        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        # A reported count <= 0 falls back to the first frame.
        mid_frame = max(frame_count // 2, 0)

        cap.set(cv2.CAP_PROP_POS_FRAMES, mid_frame)
        ret, frame = cap.read()
    finally:
        # Release the capture even if one of the calls above raises.
        cap.release()

    if not ret or frame is None:
        print(f"Could not read frame from {video_path}")
        return False

    # Clip the box to the frame. Negative starts would otherwise be treated
    # as "count from the end" by array slicing and crop the wrong region.
    frame_h, frame_w = frame.shape[:2]
    left, top = max(x, 0), max(y, 0)
    right, bottom = min(x + w, frame_w), min(y + h, frame_h)

    if right <= left or bottom <= top:
        print(
            f"Crop box (x={x}, y={y}, w={w}, h={h}) is outside the "
            f"{frame_w}x{frame_h} frame of {video_path}"
        )
        return False

    face = frame[top:bottom, left:right]
    if not cv2.imwrite(str(face_path), face):
        print(f"Could not write face image to {face_path}")
        return False
    return True

In [None]:
# Step 4: Main Processing Function

def process_csv(csv_file):
    """Build the dataset described by ``csv_file``, one output folder per row.

    For the n-th data row (1-based) the folder ``BASE_DIR/A<n>`` is created
    and three artefacts are produced:

    * ``VIDEOS_DIR/A<n>.mp4``      - trimmed clip (``download_and_trim``)
    * ``BASE_DIR/A<n>/audio.wav``  - audio track (``extract_audio``)
    * ``BASE_DIR/A<n>/face.jpg``   - cropped frame (``extract_face``)

    A row that fails (bad crop values, download/ffmpeg error, missing clip)
    is reported and skipped so that one broken entry does not abort the whole
    batch; a summary of skipped folders is printed at the end.

    Args:
        csv_file: Path of a CSV file with the columns ``video_id``, ``start``,
            ``end``, ``x``, ``y``, ``width`` and ``height``.

    Raises:
        KeyError: If one of the required columns is missing from the CSV.
    """
    df = pd.read_csv(csv_file)

    # Fail fast on a malformed CSV instead of part-way through the first row.
    required = ("video_id", "start", "end", "x", "y", "width", "height")
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(f"{csv_file} is missing required column(s): {', '.join(missing)}")

    failed = []
    for position, (_, row) in enumerate(df.iterrows(), start=1):
        folder_name = f"A{position}"
        out_dir = BASE_DIR / folder_name
        out_dir.mkdir(exist_ok=True)

        # Define paths
        video_out = VIDEOS_DIR / f"{folder_name}.mp4"
        audio_out = out_dir / "audio.wav"
        face_out = out_dir / "face.jpg"

        print(f"Processing {row['video_id']} -> {folder_name}")
        try:
            # Parse the crop box before downloading so a bad value (e.g. NaN)
            # does not cost a full download first.
            # NOTE(review): int() truncates - if the CSV stores normalized
            # (0..1) coordinates these all become 0; confirm the units.
            x, y = int(row['x']), int(row['y'])
            w, h = int(row['width']), int(row['height'])

            # Download and process
            download_and_trim(row['video_id'], row['start'], row['end'], str(video_out))
            if not video_out.exists():
                raise FileNotFoundError(f"trimmed clip was not created: {video_out}")
            extract_audio(str(video_out), str(audio_out))
            extract_face(str(video_out), x, y, w, h, str(face_out))
        except (subprocess.CalledProcessError, OSError, ValueError) as exc:
            print(f"Skipping {folder_name} ({row['video_id']}): {exc}")
            failed.append(folder_name)

    if failed:
        print(f"\n{len(failed)} of {len(df)} rows were skipped: {', '.join(failed)}")
    print("\n✅ Dataset creation complete!")