<a href="https://colab.research.google.com/github/muhammadnouman911/Dataset-Preprocessing/blob/main/Untitled1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import files
import pandas as pd

# Upload the XLSX file through the Colab file picker
uploaded = files.upload()
if not uploaded:
    raise ValueError("No file was uploaded; nothing to convert")

# Convert to CSV.
# The sheet is read and written WITHOUT a header row: the processing step
# loads data.csv with header=None, so every sheet row must survive as a plain
# data row. Letting pandas treat the first row as a header would allow it to
# rewrite those cells (duplicate values get a ".1" suffix, blank cells become
# "Unnamed: n") before they are read back as data.
excel_file = next(iter(uploaded))
df = pd.read_excel(excel_file, header=None)
csv_file = 'data.csv'
df.to_csv(csv_file, index=False, header=False)
print("✅ Excel converted to CSV")


Saving Book1 (2).xlsx to Book1 (2) (1).xlsx
✅ Excel converted to CSV


In [None]:
!pip install yt-dlp ffmpeg-python tqdm



In [None]:
import csv
import os
import shutil
import time
import zipfile
from multiprocessing import Pool, cpu_count

import cv2
import ffmpeg
import pandas as pd
import yt_dlp
from tqdm import tqdm

# --- Configuration ---
# Input table and output layout. Every artefact lives in a sub-folder of
# OUTPUT_DIR.
CSV_PATH = "/content/data.csv"
OUTPUT_DIR = "dataset"
AUDIO_DIR, VIDEO_DIR, FACE_DIR, FULL_VIDEO_DIR = (
    os.path.join(OUTPUT_DIR, sub_folder)
    for sub_folder in ("audio", "video", "faces", "full_videos")
)

# Cookies are optional: only used when a cookies.txt sits next to the notebook.
COOKIES_FILE = None
if os.path.exists("cookies.txt"):
    COOKIES_FILE = "cookies.txt"

LOG_FILE = "failed_downloads.log"
MAX_RETRIES = 2
PROXY = None  # "socks5://127.0.0.1:1080" if you need a proxy

# Keep two CPUs free for the rest of the system, but never drop below one worker.
WORKERS = max(cpu_count() - 2, 1)

# Make sure every output folder exists before any worker starts writing.
for _output_folder in (AUDIO_DIR, VIDEO_DIR, FULL_VIDEO_DIR, FACE_DIR):
    os.makedirs(_output_folder, exist_ok=True)

# --- Initialize Logging ---
def log_failure(youtube_id, reason):
    """Append one failure record to the CSV log at LOG_FILE.

    The record is written with the csv module so that reasons containing
    commas, quotes or line breaks (common in downloader error messages) are
    quoted and stay a single two-column record instead of corrupting the log.

    Args:
        youtube_id: Identifier of the video the failure belongs to.
        reason: Human-readable description of what went wrong.
    """
    # newline="" lets the csv writer control line endings; "\n" matches the
    # header line written when the log is created.
    with open(LOG_FILE, "a", newline="", encoding="utf-8") as f:
        csv.writer(f, lineterminator="\n").writerow([youtube_id, reason])

# --- Video Processing Functions ---
def download_video(youtube_id, output_path, retry=0):
    """Download a YouTube video to ``output_path``, retrying on errors.

    Args:
        youtube_id: YouTube video identifier (the ``v=`` value of the URL).
        output_path: File path the downloaded video is written to.
        retry: Number of retries already consumed; attempts continue until
            this counter reaches MAX_RETRIES.

    Returns:
        True if ``output_path`` exists after a download attempt, else False.
        Every False outcome is recorded through ``log_failure``.
    """
    ydl_opts = {
        "format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
        "outtmpl": output_path,
        "quiet": True,
        # Errors must propagate as exceptions: with ignoreerrors enabled yt-dlp
        # swallows them, so the retry/logging below would never run and failed
        # downloads would be missing from the failure log.
        "ignoreerrors": False,
        "extractor_args": {"youtube": {"skip": ["hls", "dash"]}},
        "socket_timeout": 30,
        "retries": 3,
    }

    if COOKIES_FILE:
        ydl_opts["cookiefile"] = COOKIES_FILE
    if PROXY:
        ydl_opts["proxy"] = PROXY

    url = f"https://www.youtube.com/watch?v={youtube_id}"
    attempt = retry
    while True:
        try:
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                ydl.download([url])
        except Exception as e:
            if attempt < MAX_RETRIES:
                attempt += 1
                time.sleep(2)  # brief pause before the next attempt
                continue
            log_failure(youtube_id, f"Download failed: {str(e)}")
            return False

        if os.path.exists(output_path):
            return True

        # The downloader returned without raising but produced no file.
        log_failure(youtube_id, "Download failed: output file was not created")
        return False

def process_video_segment(row):
    """Produce the clip, audio track and face crops for one dataset row.

    Steps: download the full video if it is not on disk yet, cut the
    [start, end] segment, extract mono 16 kHz PCM audio from the segment, and
    save roughly one 224x224 crop per second centred on the given point.

    Args:
        row: Tuple ``(youtube_id, start, end, x, y)``. ``start``/``end`` are
            passed to ffmpeg as the segment bounds; ``x``/``y`` are multiplied
            by the frame width/height to locate the crop centre.

    Returns:
        True if the row was processed (or its outputs already exist), False if
        any step failed. Failures are recorded through ``log_failure``.
    """
    youtube_id, start, end, x, y = row
    base_name = f"{youtube_id}_{start}_{end}"
    audio_path = os.path.join(AUDIO_DIR, f"{base_name}.wav")
    segment_video_path = os.path.join(VIDEO_DIR, f"{base_name}.mp4")
    face_dir = os.path.join(FACE_DIR, base_name)
    full_video_path = os.path.join(FULL_VIDEO_DIR, f"{youtube_id}_full.mp4")

    # Skip if already processed (audio present and at least one face crop)
    if os.path.exists(audio_path) and os.path.isdir(face_dir) and os.listdir(face_dir):
        return True

    # Download full video if needed
    if not os.path.exists(full_video_path):
        if not download_video(youtube_id, full_video_path):
            return False

    # Extract segment (stream copy, no re-encoding)
    try:
        (
            ffmpeg.input(full_video_path, ss=start, to=end)
            .output(segment_video_path, c="copy")
            .run(quiet=True, overwrite_output=True)
        )
    except Exception as e:
        log_failure(youtube_id, f"Segment extraction failed: {str(e)}")
        return False

    # Extract audio
    try:
        (
            ffmpeg.input(segment_video_path)
            .output(audio_path, ac=1, ar=16000, acodec="pcm_s16le")
            .run(quiet=True, overwrite_output=True)
        )
    except Exception as e:
        log_failure(youtube_id, f"Audio extraction failed: {str(e)}")
        return False

    # Extract faces
    try:
        duration = float(end) - float(start)
        rel_x, rel_y = float(x), float(y)
        os.makedirs(face_dir, exist_ok=True)
        frame_count = 0

        cap = cv2.VideoCapture(segment_video_path)
        try:
            fps = cap.get(cv2.CAP_PROP_FPS)

            # Sample about one frame per second, but always try at least one
            # frame so that segments shorter than a second still yield a crop.
            total_frames = int(duration * fps)
            frames_to_process = max(1, min(total_frames, int(duration)))
            frame_step = total_frames // frames_to_process

            for frame_num in range(frames_to_process):
                # Jump to the next sampled frame
                cap.set(cv2.CAP_PROP_POS_FRAMES, frame_num * frame_step)

                ret, frame = cap.read()
                if not ret:
                    break

                h, w = frame.shape[:2]
                face_x = int(rel_x * w)
                face_y = int(rel_y * h)

                # 224x224 window centred at (face_x, face_y), clamped to the frame
                y1, y2 = max(0, face_y - 112), min(h, face_y + 112)
                x1, x2 = max(0, face_x - 112), min(w, face_x + 112)
                face = frame[y1:y2, x1:x2]

                if face.size == 0:
                    continue

                face = cv2.resize(face, (224, 224))
                cv2.imwrite(os.path.join(face_dir, f"frame_{frame_count:04d}.jpg"), face)
                frame_count += 1
        finally:
            # Release the capture handle even when reading or cropping fails
            cap.release()

        if frame_count == 0:
            raise Exception("No faces extracted")
    except Exception as e:
        log_failure(youtube_id, f"Face extraction failed: {str(e)}")
        return False

    return True

# --- Dataset Packaging Function ---
def package_dataset(output_zip="dataset.zip"):
    """Bundle every file under OUTPUT_DIR into a deflate-compressed ZIP.

    Archive member names keep OUTPUT_DIR as their leading path component.

    Args:
        output_zip: Path of the archive to create (overwritten if present).

    Returns:
        The ``output_zip`` path that was written.
    """
    print("\nPackaging dataset...")
    archive = zipfile.ZipFile(output_zip, 'w', zipfile.ZIP_DEFLATED)
    with archive:
        for folder, _subfolders, filenames in os.walk(OUTPUT_DIR):
            for filename in filenames:
                source_path = os.path.join(folder, filename)
                relative_path = os.path.relpath(source_path, start=OUTPUT_DIR)
                member_name = os.path.join(OUTPUT_DIR, relative_path)
                archive.write(source_path, arcname=member_name)
    print(f"Dataset packaged to {output_zip}")
    return output_zip

# --- Main Execution ---
if __name__ == "__main__":
    # Start every run with a fresh failure log that holds only the header row
    with open(LOG_FILE, "w") as f:
        f.write("youtube_id,reason\n")

    # The CSV has no header row; name the five columns explicitly
    df = pd.read_csv(CSV_PATH, header=None)
    df.columns = ["youtube_id", "start", "end", "x", "y"]
    rows_list = list(df.itertuples(index=False, name=None))

    # Fan the rows out over the worker pool; imap keeps results in input order
    print(f"Starting processing with {WORKERS} workers...")
    with Pool(WORKERS) as pool:
        progress = tqdm(
            pool.imap(process_video_segment, rows_list),
            total=len(rows_list),
            unit="video",
        )
        results = list(progress)

    # Tally outcomes: each worker result is True on success, False on failure
    success = sum(results)
    failure = len(results) - success

    # Bundle everything that was produced
    zip_path = package_dataset()

    # Final report
    report_lines = (
        "\nProcessing complete!",
        f"Successfully processed: {success} videos",
        f"Failed: {failure} videos (see {LOG_FILE})",
        "\nDataset components:",
        f"- Audio files: {AUDIO_DIR}",
        f"- Video segments: {VIDEO_DIR}",
        f"- Face crops: {FACE_DIR}",
        f"- Full videos: {FULL_VIDEO_DIR}",
        f"\nComplete dataset packaged at: {zip_path}",
    )
    for report_line in report_lines:
        print(report_line)

Starting processing with 1 workers...


  0%|          | 0/50 [00:00<?, ?video/s]ERROR: [youtube] u5MPyrRJPmc: The uploader has not made this video available in your country
This video is available in Andorra, United Arab Emirates, Afghanistan, Antigua and Barbuda, Anguilla, Albania, Armenia, Angola, Antarctica, Argentina, American Samoa, Austria, Australia, Aruba, Åland Islands, Azerbaijan, Bosnia and Herzegovina, Barbados, Bangladesh, Belgium, Burkina Faso, Bulgaria, Bahrain, Burundi, Benin, Saint Barthélemy, Bermuda, Brunei Darussalam, Bolivia, Plurinational State of, Bonaire, Sint Eustatius and Saba, Brazil, Bahamas, Bhutan, Bouvet Island, Botswana, Belarus, Belize, Canada, Cocos (Keeling) Islands, Congo, the Democratic Republic of the, Central African Republic, Congo, Switzerland, Côte d'Ivoire, Cook Islands, Chile, Cameroon, China, Colombia, Costa Rica, Cuba, Cape Verde, Curaçao, Christmas Island, Cyprus, Czech Republic, Germany, Djibouti, Denmark, Dominica, Dominican Republic, Algeria, Ecuador, Estonia, Egypt, Wester



 16%|█▌        | 8/50 [00:20<02:09,  3.08s/video]



 18%|█▊        | 9/50 [00:40<04:27,  6.51s/video]



 20%|██        | 10/50 [00:52<05:09,  7.74s/video]ERROR: [youtube] SGJz8ysQXlQ: Private video. Sign in if you've been granted access to this video. Use --cookies-from-browser or --cookies for the authentication. See  https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp  for how to manually pass cookies. Also see  https://github.com/yt-dlp/yt-dlp/wiki/Extractors#exporting-youtube-cookies  for tips on effectively exporting YouTube cookies
 22%|██▏       | 11/50 [00:53<03:57,  6.09s/video]



 24%|██▍       | 12/50 [01:03<04:26,  7.01s/video]



 26%|██▌       | 13/50 [01:11<04:34,  7.42s/video]ERROR: [youtube] qBXZwYjeNAw: Private video. Sign in if you've been granted access to this video. Use --cookies-from-browser or --cookies for the authentication. See  https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp  for how to manually pass cookies. Also see  https://github.com/yt-dlp/yt-dlp/wiki/Extractors#exporting-youtube-cookies  for tips on effectively exporting YouTube cookies
 28%|██▊       | 14/50 [01:12<03:22,  5.63s/video]



 30%|███       | 15/50 [01:27<04:44,  8.12s/video]



 32%|███▏      | 16/50 [01:39<05:18,  9.36s/video]



 34%|███▍      | 17/50 [02:01<07:12, 13.09s/video]



 36%|███▌      | 18/50 [02:20<07:53, 14.79s/video]



 38%|███▊      | 19/50 [02:35<07:40, 14.84s/video]



 40%|████      | 20/50 [02:48<07:08, 14.29s/video]ERROR: [youtube] mbbYk4d_6AY: Private video. Sign in if you've been granted access to this video. Use --cookies-from-browser or --cookies for the authentication. See  https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp  for how to manually pass cookies. Also see  https://github.com/yt-dlp/yt-dlp/wiki/Extractors#exporting-youtube-cookies  for tips on effectively exporting YouTube cookies
 42%|████▏     | 21/50 [02:49<04:59, 10.33s/video]



 44%|████▍     | 22/50 [03:01<05:00, 10.72s/video]



 46%|████▌     | 23/50 [03:10<04:41, 10.41s/video]



 48%|████▊     | 24/50 [03:20<04:26, 10.23s/video]



 50%|█████     | 25/50 [03:31<04:23, 10.55s/video]



 52%|█████▏    | 26/50 [03:40<03:59,  9.98s/video]ERROR: [youtube] 3foAjdMgqt0: Private video. Sign in if you've been granted access to this video. Use --cookies-from-browser or --cookies for the authentication. See  https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp  for how to manually pass cookies. Also see  https://github.com/yt-dlp/yt-dlp/wiki/Extractors#exporting-youtube-cookies  for tips on effectively exporting YouTube cookies
 54%|█████▍    | 27/50 [03:41<02:47,  7.27s/video]



 56%|█████▌    | 28/50 [04:07<04:44, 12.95s/video]



 58%|█████▊    | 29/50 [04:14<03:53, 11.12s/video]



 60%|██████    | 30/50 [04:24<03:33, 10.66s/video]



 62%|██████▏   | 31/50 [04:29<02:50,  8.97s/video]ERROR: [youtube] BrDNUMkmsSI: Video unavailable
 64%|██████▍   | 32/50 [04:30<01:57,  6.52s/video]ERROR: [youtube] YExC6Y31ZeM: Private video. Sign in if you've been granted access to this video. Use --cookies-from-browser or --cookies for the authentication. See  https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp  for how to manually pass cookies. Also see  https://github.com/yt-dlp/yt-dlp/wiki/Extractors#exporting-youtube-cookies  for tips on effectively exporting YouTube cookies
 66%|██████▌   | 33/50 [04:30<01:21,  4.82s/video]



 68%|██████▊   | 34/50 [04:38<01:30,  5.67s/video]



 70%|███████   | 35/50 [04:58<02:30, 10.04s/video]



 72%|███████▏  | 36/50 [05:24<03:25, 14.67s/video]



 74%|███████▍  | 37/50 [05:37<03:06, 14.32s/video]



 76%|███████▌  | 38/50 [05:45<02:28, 12.34s/video]



 78%|███████▊  | 39/50 [05:58<02:19, 12.69s/video]



 80%|████████  | 40/50 [06:11<02:05, 12.56s/video]ERROR: [youtube] MoesGmkODgY: Video unavailable. This video is no longer available because the YouTube account associated with this video has been terminated.
 82%|████████▏ | 41/50 [06:12<01:21,  9.09s/video]



 84%|████████▍ | 42/50 [06:21<01:13,  9.23s/video]



 86%|████████▌ | 43/50 [06:35<01:13, 10.45s/video]



 88%|████████▊ | 44/50 [06:43<00:58,  9.72s/video]



 90%|█████████ | 45/50 [06:55<00:52, 10.44s/video]



 92%|█████████▏| 46/50 [07:03<00:39,  9.76s/video]



 94%|█████████▍| 47/50 [07:11<00:27,  9.13s/video]



 96%|█████████▌| 48/50 [07:20<00:18,  9.32s/video]ERROR: [youtube] HMGg-84WPAQ: Private video. Sign in if you've been granted access to this video. Use --cookies-from-browser or --cookies for the authentication. See  https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp  for how to manually pass cookies. Also see  https://github.com/yt-dlp/yt-dlp/wiki/Extractors#exporting-youtube-cookies  for tips on effectively exporting YouTube cookies
 98%|█████████▊| 49/50 [07:21<00:06,  6.78s/video]



100%|██████████| 50/50 [07:48<00:00,  9.37s/video]



Packaging dataset...
Dataset packaged to dataset.zip

Processing complete!
Successfully processed: 39 videos
Failed: 11 videos (see failed_downloads.log)

Dataset components:
- Audio files: dataset/audio
- Video segments: dataset/video
- Face crops: dataset/faces
- Full videos: dataset/full_videos

Complete dataset packaged at: dataset.zip


In [None]:
from google.colab import files
import os

# Path to your dataset zip file (same as created in previous step)
ZIP_PATH = "dataset.zip"

def download_dataset():
    """Try to hand the dataset archive at ZIP_PATH to the user.

    Strategies are tried in order: the Google Colab download helper, then
    opening the file in the local web browser. If neither works, the absolute
    path is printed so the file can be fetched manually.

    Returns:
        True if a download/open was initiated, False if only the path could
        be printed.

    Raises:
        FileNotFoundError: If ZIP_PATH does not exist.
    """
    # Verify the zip file exists
    if not os.path.exists(ZIP_PATH):
        raise FileNotFoundError(f"Dataset zip file not found at {ZIP_PATH}")

    # For Google Colab
    try:
        from google.colab import files
    except ImportError:
        files = None  # not running inside Colab

    if files is not None:
        print("Downloading dataset from Google Colab...")
        try:
            files.download(ZIP_PATH)
            return True
        except Exception as exc:
            # Best effort: report the problem and fall through to the browser
            print(f"Colab download failed: {exc}")

    # For local Python environment
    import webbrowser
    import urllib.parse
    print("Opening dataset in browser for download...")
    file_url = f"file://{urllib.parse.quote(os.path.abspath(ZIP_PATH))}"
    try:
        # webbrowser.open reports failure through its return value
        if webbrowser.open(file_url):
            return True
    except Exception as exc:
        print(f"Could not open a browser: {exc}")

    # Fallback - just print the path
    print(f"Couldn't initiate automatic download. Please manually download from: {os.path.abspath(ZIP_PATH)}")
    return False

# Add this at the end of your main script (after packaging)
if __name__ == "__main__":
    # ... [your existing code] ...

    # Once packaging is complete, offer to download the archive
    print("\nWould you like to download the dataset now? (y/n)")
    wants_download = input().strip().lower() == 'y'
    if wants_download:
        outcome = (
            "Download initiated successfully!"
            if download_dataset()
            else "Could not initiate automatic download. Please check the file path."
        )
        print(outcome)


Would you like to download the dataset now? (y/n)
y
Downloading dataset from Google Colab...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Download initiated successfully!
