In [4]:
import os
import time
import random

import pandas as pd
import numpy as np
from tqdm import tqdm
from pytubefix import YouTube
from pytubefix.exceptions import LoginRequired, VideoUnavailable
from moviepy.video.io.VideoFileClip import VideoFileClip

In [5]:
classes = pd.read_json('Data/MS-ASL/MSASL_classes.json')
train_data = pd.read_json('Data/MS-ASL/MSASL_train.json')

In [6]:
def is_video_available(url, max_retries=3):
    '''
    Check if a YouTube video is available.
    Returns True if accesible, False if unavailable or private.
    '''
    for attempt in range(max_retries):
        try:
            time.sleep(random.uniform(1,3)) # Delay to avoid rate limits
            yt = YouTube(url, client='WEB')
            yt.check_availability() # Check if the video can be accessed
            return True
        except (LoginRequired, VideoUnavailable):
            return False
        except Exception as e:
            tqdm.write(f"Attempt {attempt + 1}/{max_retries} failed for {url}: {e}")
            time.sleep(2)
    return False


In [7]:
# Progerss bar for checking URLs in url_dict
tqdm.pandas(desc='Checking YouTube Video Availability')

# create a key-value pari for YouTube URLs
url_dict = {url: i for i, url in enumerate(train_data['url'].unique())}

# Filter url_dict to remove unavailable URLs
filtered_url_dict = {url: i for url, i in tqdm(url_dict.items(), desc="Filtering URLS") if is_video_available(url)}

# Remove rows from train_data that contain unavailable URLs
train_data = train_data[train_data['url'].isin(filtered_url_dict.keys())]

# Reset index after removal
train_data.reset_index(drop=True, inplace=True)

Filtering URLS:   0%|                                                              | 0/4171 [00:00<?, ?it/s]Unable to run botGuard. Skipping poToken generation
Unable to run botGuard. Skipping poToken generation
The WEB client requires PoToken to obtain functional streams, See more details at https://github.com/JuanBindez/pytubefix/pull/209
Filtering URLS:   0%|                                                    | 1/4171 [00:04<5:32:16,  4.78s/it]Unable to run botGuard. Skipping poToken generation
Unable to run botGuard. Skipping poToken generation
The WEB client requires PoToken to obtain functional streams, See more details at https://github.com/JuanBindez/pytubefix/pull/209
Filtering URLS:   0%|                                                    | 2/4171 [00:08<4:37:15,  3.99s/it]Unable to run botGuard. Skipping poToken generation
Unable to run botGuard. Skipping poToken generation
The WEB client requires PoToken to obtain functional streams, See more details at https://github.com/J

KeyboardInterrupt: 

In [11]:
# add column containig the youtube id
train_data['url_id'] = train_data['url'].map(filtered_url_dict)

# Save the data 
train_data.to_csv('url_cleaned_MSASL_train.csv', index=False)

In [14]:
def download_video(url, output_filename, output_directory, max_iters=3):
    '''
    Downlaod a YouTube video to a specified directory.

    Parameters:
    - url (str): The YouTube video URL.
    - output_filename (str): The name of the output file.
    - output_directory (str): The directory where the video will be saved.

    Returns:
    - str: The full path of the downloaded video.
    '''
    for attempt in range(max_iters):
        try: 
            # Ensure output directory exists
            os.makedirs(output_directory, exist_ok=True)
        
            # Create the full output file path
            output_path = os.path.join(output_directory, output_filename + '.mp4')
            
            # Delay to avoid rate limits
            time.sleep(random.uniform(1,3))
        
            # Initialize the YouTube object
            yt = YouTube(url, client='WEB')
        
            # Get the highest resolution stream
            video = yt.streams.get_highest_resolution()
        
            # Download the video to the specified directory with the specified filename
            video.download(output_path=output_directory, filename=output_filename + '.mp4')
        
            return output_path
    
        except LoginRequired:
            print(f"Skipping {url}, {output_filename}: Login required (private or restricted video)")
            return None
    
        except VideoUnavailable:
            print(f"Skipping {url}, {output_filename}: Video is unavailable (deleted, region-locked or restrcited)")
            return None
        return None
    

In [29]:
# download the video from the url from the url_dict
directory = 'Data/train_videos'
with tqdm(total=len(filtered_url_dict), desc='Downloading Videos', unit='video') as pbar:
    for url, url_id in filtered_url_dict.items():
        download_video(url, str(url_id), directory)
        pbar.update(1)
print('Done')

Downloading Videos:   0%|                     | 4/3424 [00:20<4:45:22,  5.01s/video]


KeyboardInterrupt: 

In [41]:
new_dict = {url:url_id for url, url_id in filtered_url_dict.items() if url_id >= 244}
with tqdm(total=len(new_dict), desc='Downloading Remaining Video', unit='video') as pbar:
    for url, url_id in new_dict.items():
        download_video(url, str(url_id), directory)
        pbar.update(1)
print("Done")

Downloading Remaining Video:   8%|▊         | 246/3228 [20:32<4:08:59,  5.01s/video]


KeyboardInterrupt: 

In [31]:
new_dict

{'https://www.youtube.com/watch?v=VSS0sARFsI8': 242,
 'https://www.youtube.com/watch?v=1NRAaw1bh98': 244,
 'https://www.youtube.com/watch?v=_3uIYCAkNL8': 245,
 'https://www.youtube.com/watch?v=PZxB1gThfhQ': 246,
 'https://www.youtube.com/watch?v=meY_uKNmWFI': 247,
 'www.youtube.com/watch?v=3aXS3keR8oY': 248,
 'https://www.youtube.com/watch?v=1PHSaRIwArw': 249,
 'https://www.youtube.com/watch?v=2llgG-yP_nE': 250,
 'https://www.youtube.com/watch?v=QUk_N6cOwoc': 251,
 'https://www.youtube.com/watch?v=KLgjyeFE6Bs': 252,
 'https://www.youtube.com/watch?v=MeIk9lFGPTo': 253,
 'https://www.youtube.com/watch?v=XjWSfh50kAU': 254,
 'https://www.youtube.com/watch?v=mA15ML-vAd0': 257,
 'https://www.youtube.com/watch?v=QUF1JHzBXhw': 259,
 'www.youtube.com/watch?v=9ul_1z23KLI': 260,
 'www.youtube.com/watch?v=glJmYf137OM': 262,
 'https://www.youtube.com/watch?v=_s96-SeiXfo': 263,
 'https://www.youtube.com/watch?v=cqyB9f32GuI': 264,
 'https://www.youtube.com/watch?v=IQMc7srwiD8': 265,
 'https://www.you

In [40]:
download_video("https://www.youtube.com/watch?v=VSS0sARFsI8", str(242), directory)

RemoteDisconnected: Remote end closed connection without response

In [36]:
@TODO
#Download 242 video and put it inside the train_videos
# Look at the "Data/train_videos" and start from wherever I left off

SyntaxError: unexpected EOF while parsing (1824158146.py, line 2)