In [1]:
import os
from yt_dlp import YoutubeDL
from tqdm import tqdm

In [2]:
# Path of the text file that read_links() opens to collect the video URLs.
LINKS_FILE = "links.txt"
# Directory that receives the downloaded videos; also used to build the
# yt-dlp output template and the "already exists" check.
OUTPUT_DIR = "video_dataset"

# Create the output directory up front; exist_ok=True makes re-running this
# cell a no-op when the directory is already there.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [3]:
# Read links ignoring comments and blanks
def read_links(path=None):
    """Return the list of URLs stored in a links file.

    Each line is stripped of surrounding whitespace; blank lines and lines
    whose first non-blank character is ``#`` are skipped.

    Args:
        path: File to read. Defaults to the module-level ``LINKS_FILE``.

    Returns:
        A list of stripped, non-comment lines in file order. An empty list
        (after printing a notice) when the file does not exist.
    """
    if path is None:
        path = LINKS_FILE
    try:
        # utf-8-sig transparently drops a leading BOM, which would otherwise
        # be glued onto the first URL (or hide a leading '#' comment marker).
        # errors='replace' keeps stray non-UTF-8 bytes (e.g. in comments)
        # from aborting the whole read.
        with open(path, 'r', encoding='utf-8-sig', errors='replace') as f:
            stripped_lines = (line.strip() for line in f)
            return [s for s in stripped_lines if s and not s.startswith('#')]
    except FileNotFoundError:
        # Catching the open() failure avoids the exists()/open() race.
        print(f"File {path} not found.")
        return []

In [4]:
# Tqdm progress hook class
class TqdmHook:
    """Adapter that feeds download status dictionaries into a progress bar.

    The bar object only needs ``total`` and ``n`` attributes plus
    ``refresh()`` and ``close()`` methods (a ``tqdm`` instance qualifies).

    NOTE(review): the bar is closed on the first 'finished' event. If the
    downloader reports several files for one video (e.g. separate video and
    audio streams), later files presumably will not be shown — confirm.
    """

    def __init__(self, tqdm_bar):
        """Store the progress bar that ``hook`` will update."""
        self.pbar = tqdm_bar

    def hook(self, d):
        """Update the bar from one status dictionary.

        Args:
            d: Mapping with a ``'status'`` key and, optionally,
               ``'downloaded_bytes'``, ``'total_bytes'`` and
               ``'total_bytes_estimate'``.

        Statuses other than 'downloading', 'finished' and 'error' are ignored.
        """
        status = d.get('status')
        if status == 'downloading':
            total = d.get('total_bytes') or d.get('total_bytes_estimate')
            if total and self.pbar.total != total:
                self.pbar.total = total
            downloaded = d.get('downloaded_bytes')
            # Keep the previous position when the byte count is absent or
            # None, so the bar neither snaps back to zero nor holds None.
            if downloaded is not None:
                self.pbar.n = downloaded
            self.pbar.refresh()
        elif status == 'finished':
            # Prefer the size reported with the event, so a placeholder total
            # set by the caller is not shown as the final figure.
            final_size = d.get('total_bytes') or d.get('downloaded_bytes')
            if final_size:
                self.pbar.total = final_size
            if self.pbar.total is not None:
                self.pbar.n = self.pbar.total
            self.pbar.refresh()
            self.pbar.close()
        elif status == 'error':
            # Release the bar so a failed download does not leave it open.
            self.pbar.close()

In [5]:
# Download with progress, and skip if file exists
def download_video_with_progress(url, index, total):
    """Download one video into ``OUTPUT_DIR`` with a tqdm progress bar.

    The video's metadata is fetched first; if the expected ``.mp4`` output
    already exists the download is skipped. Any exception is caught and
    reported so that a single failing URL does not abort a batch run.

    Args:
        url: Video URL to download.
        index: 1-based position of this URL in the batch (display only).
        total: Number of URLs in the batch (display only).

    Returns:
        None. Progress and outcome are reported on the console.
    """
    # One template for both the metadata probe and the real download, so the
    # "already exists" check looks at the exact path the download writes to.
    outtmpl = os.path.join(OUTPUT_DIR, '%(title)s.%(ext)s')
    try:
        # Get metadata (title, etc.)
        ydl_info_opts = {'quiet': True, 'skip_download': True, 'outtmpl': outtmpl}
        with YoutubeDL(ydl_info_opts) as ydl:
            info = ydl.extract_info(url, download=False)
            # A present-but-None title would break the slice used below.
            title = info.get('title') or 'video'
            # Let yt-dlp expand the template: it sanitizes characters such as
            # ':' or '/' in titles, so a path built from the raw title would
            # not match the file on disk. The final container is forced to
            # mp4 by 'merge_output_format', hence the extension swap.
            expanded = ydl.prepare_filename(info)
            filepath = os.path.splitext(expanded)[0] + '.mp4'
        filename = os.path.basename(filepath)

        if os.path.exists(filepath):
            print(f"✅ [{index}/{total}] Skipped: {filename} (already exists)")
            return

        # Start download with progress bar
        desc = f"[{index}/{total}] {title[:50]}"
        # total=100 is only a placeholder until the hook learns the real size.
        pbar = tqdm(total=100, desc=desc, unit='B', unit_scale=True, dynamic_ncols=True)
        try:
            hook = TqdmHook(pbar)

            ydl_opts = {
                'format': 'bestvideo+bestaudio/best',
                'outtmpl': outtmpl,
                'merge_output_format': 'mp4',
                'progress_hooks': [hook.hook],
                'quiet': True,
                'noprogress': True,
            }

            with YoutubeDL(ydl_opts) as ydl:
                ydl.download([url])
        finally:
            # Always release the bar, including when the download raises;
            # closing an already-closed tqdm bar is a no-op.
            pbar.close()

    except Exception as e:
        # Deliberately broad: report the failure and let the batch continue.
        print(f"❌ [{index}/{total}] Failed: {url}\n   Error: {e}")

In [6]:
# Run it all
# Load the URL list once; `urls` is an empty list when the links file is
# missing or contains only blanks/comments.
urls = read_links()
# Batch size, used only for the "[index/total]" labels in progress output.
total = len(urls)

In [7]:
# Process every collected link in order; positions are 1-based so the
# "[idx/total]" labels read naturally. Each call handles its own errors.
if urls:
    for idx, link in enumerate(urls, 1):
        download_video_with_progress(link, idx, total)
    print("🏁 Done — all videos processed.")
else:
    # Nothing to do: the links file was missing or had no usable entries.
    print("No links found.")

[1/36] CS2 Surf Aquaflow: 100%|██████████| 166M/166M [00:19<00:00, 8.71MB/s]    
[2/36] CS2 Surf Cyberwave: 100%|██████████| 282M/282M [00:29<00:00, 9.64MB/s]    
[3/36] CS2 Surf Boreas: 100%|██████████| 142M/142M [00:18<00:00, 7.55MB/s]    
[4/36] CS2 Surf Astra: 100%|██████████| 27.5M/27.5M [00:10<00:00, 2.72MB/s]  
[5/36] CS2 Surf Nyx: 100%|██████████| 48.7M/48.7M [00:10<00:00, 4.78MB/s]  
[6/36] CS2 Surf Glass9: 100%|██████████| 101M/101M [00:15<00:00, 6.33MB/s]    
[7/36] CS2 Surf Me: 100%|██████████| 306M/306M [00:38<00:00, 8.03MB/s]    
[8/36] surf_mesa_aether WR. Surfed by rulldar: 100%|██████████| 217M/217M [00:55<00:00, 3.90MB/s]    
[9/36] surf_limbo WR. Surfed by Novaa: 100%|██████████| 192M/192M [01:55<00:00, 1.66MB/s]    
[10/36] Surfing Dreams: 100%|██████████| 878M/878M [01:30<00:00, 9.71MB/s]     
[11/36] surf_runewords WR. Surfed by Caff: 100%|██████████| 224M/224M [02:32<00:00, 1.47MB/s]    
[12/36] CS2 SURF UTOPIA 55:06 (Cybershoke Rank #85): 100%|██████████| 16.4M/

🏁 Done — all videos processed.
