In [1]:
import cv2
import os
from os import listdir
import random
import re
from pytube import YouTube
from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip
import shutil
import moviepy.editor as mp
import subprocess

In [22]:
# Exploratory cell: ask pytube for every stream of one sample video, ordered by
# MIME type, to see which containers and resolutions are on offer.
yt = YouTube("https://www.youtube.com/watch?v=yLH4b038eiA")
# NOTE: despite its name, `video` holds the ordered stream listing, not a single stream.
video = yt.streams.order_by("mime_type")
# Bare last expression so the notebook renders the listing as the cell output.
video

[<Stream: itag="139" mime_type="audio/mp4" abr="48kbps" acodec="mp4a.40.5" progressive="False" type="audio">, <Stream: itag="140" mime_type="audio/mp4" abr="128kbps" acodec="mp4a.40.2" progressive="False" type="audio">, <Stream: itag="251" mime_type="audio/webm" abr="160kbps" acodec="opus" progressive="False" type="audio">, <Stream: itag="17" mime_type="video/3gpp" res="144p" fps="7fps" vcodec="mp4v.20.3" acodec="mp4a.40.2" progressive="True" type="video">, <Stream: itag="18" mime_type="video/mp4" res="360p" fps="30fps" vcodec="avc1.42001E" acodec="mp4a.40.2" progressive="True" type="video">, <Stream: itag="22" mime_type="video/mp4" res="720p" fps="30fps" vcodec="avc1.64001F" acodec="mp4a.40.2" progressive="True" type="video">, <Stream: itag="137" mime_type="video/mp4" res="1080p" fps="30fps" vcodec="avc1.640028" progressive="False" type="video">, <Stream: itag="136" mime_type="video/mp4" res="720p" fps="30fps" vcodec="avc1.64001f" progressive="False" type="video">, <Stream: itag="135"

In [23]:
def download_video(url, save_path, resolution=None):
    """Download a YouTube video with pytube and return the path of the local file.

    The stream's default filename is sanitised (spaces removed, "/" and "-"
    turned into "_", every other non-word character dropped from the stem)
    before it is used on disk. A WebM download is re-encoded to MP4 with
    FFmpeg and the WebM file is deleted afterwards.

    Args:
        url: URL of the YouTube video.
        save_path: Directory in which the video is stored.
        resolution: Optional resolution string such as '1080p' or '720p'.
            When omitted, the highest-resolution "video/mp4" stream is used.

    Returns:
        Path of the local video file (the MP4 path if a WebM was converted).

    Raises:
        ValueError: If no stream matches the requested resolution / mime type.
        subprocess.CalledProcessError: If the FFmpeg conversion fails.
    """
    yt = YouTube(url)
    if resolution:
        video = yt.streams.filter(res=resolution).first()
    else:
        video = yt.streams.filter(mime_type="video/mp4").order_by("resolution").desc().first()

    # .first() gives None when nothing matches; fail with a readable message
    # instead of an AttributeError on `video.default_filename` below.
    if video is None:
        wanted = f"resolution {resolution!r}" if resolution else "mime type 'video/mp4'"
        raise ValueError(f"No stream with {wanted} found for {url}")

    # Reformat the video name
    video_name = video.default_filename.replace(" ", "").replace("/", "_").replace("-", "_")

    # Split the name and the extension
    name_part, ext_part = os.path.splitext(video_name)

    # Remove non-alphanumeric and non-underscore characters from the name part
    name_part = re.sub(r'\W+', '', name_part)

    # Join the name part and the extension part
    video_name = name_part + ext_part
    video_file_path = os.path.join(save_path, video_name)

    is_webm = ext_part.lower() == '.webm'
    mp4_output_path = os.path.splitext(video_file_path)[0] + '.mp4'

    # A previous run already converted this WebM (and deleted it): reuse the MP4
    # rather than downloading the WebM again and handing back an unconverted file.
    if is_webm and os.path.isfile(mp4_output_path):
        print(f'Video {os.path.basename(mp4_output_path)} already exists.')
        return mp4_output_path

    # if video does not exist, download it
    if not os.path.isfile(video_file_path):
        print(f'Downloading video {video_name}...')
        video.download(output_path=save_path, filename=video_name)
    else:
        print(f'Video {video_name} already exists.')

    # If the downloaded video is in WebM format, convert it to MP4 using FFmpeg
    if is_webm:
        print("converting")
        try:
            subprocess.run(
                ['ffmpeg', '-i', video_file_path, '-c:v', 'libx264', '-c:a', 'aac', mp4_output_path],
                check=True,
            )
        except BaseException:
            # Do not leave a partial MP4 behind (also on a notebook interrupt):
            # a later call would mistake it for a finished conversion.
            if os.path.isfile(mp4_output_path):
                os.remove(mp4_output_path)
            raise
        os.remove(video_file_path)  # Remove the original WebM file
        return mp4_output_path

    return video_file_path

#### cut downloaded video to specified interval to test model performance ####
def cut_video(video_path, output_path, start_time, end_time, fps):
    """Write the [start_time, end_time] section of a video to output_path.

    Args:
        video_path: Path of the source video.
        output_path: Path of the trimmed video to write. Its directory is
            created if it does not exist yet.
        start_time: Start of the section, passed to moviepy's subclip().
        end_time: End of the section, passed to moviepy's subclip().
        fps: Frame rate of the written file.
    """
    # The codec has to suit the container being written, so pick it from the
    # output extension (case-insensitively): H.264 for .mp4, VP9 otherwise.
    _, output_extension = os.path.splitext(output_path)
    codec = 'libx264' if output_extension.lower() == '.mp4' else 'libvpx-vp9'

    # Writing fails if the target directory is missing, so create it up front.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Load the video clip; always close it again so the reader is released
    # even when writing fails.
    source_clip = mp.VideoFileClip(video_path)
    try:
        trimmed_clip = source_clip.subclip(start_time, end_time)

        # Write the trimmed video to the output file with the selected codec
        trimmed_clip.write_videofile(output_path, codec=codec, fps=fps)
    finally:
        source_clip.close()
    
def extract_frames(video_path, frames_dir, num_frames, total_seconds, start_time, end_time):
    """Save randomly chosen frames that lie outside [start_time, end_time) as JPEGs.

    Frames are written to <frames_dir>/<video name>/frame_<video name>_<i>.jpg.

    Args:
        video_path: Path of the video to read.
        frames_dir: Parent directory; a sub-folder named after the video is created.
        num_frames: Number of frames to sample. If fewer frames are available
            outside the excluded interval, all of them are used.
        total_seconds: Length of the video in seconds.
        start_time: Start (seconds) of the interval to exclude.
        end_time: End (seconds) of the interval to exclude.

    Raises:
        ValueError: If the video cannot be opened / reports no frame rate, or
            if frames are requested but none lie outside the excluded interval.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        fps = cap.get(cv2.CAP_PROP_FPS)
        if not cap.isOpened() or fps <= 0:
            raise ValueError(f"Could not open video or read its frame rate: {video_path}")

        # Calculate the frame indices for the specified start and end times
        start_frame = int(start_time * fps)
        end_frame = int(end_time * fps)

        # Keep every frame index that does not fall into the specified interval
        available_frames = [
            f for f in range(int(total_seconds * fps))
            if f < start_frame or f >= end_frame
        ]

        if num_frames > 0 and not available_frames:
            raise ValueError("No frames available outside the excluded interval.")

        # random.sample() raises when asked for more items than exist; use what is there.
        if num_frames > len(available_frames):
            print(f"Only {len(available_frames)} frames available; requested {num_frames}.")
            num_frames = len(available_frames)

        # Randomly select frame indices from the available frames
        frame_indices = random.sample(available_frames, num_frames)

        # Create a separate folder for each video
        video_name = os.path.splitext(os.path.basename(video_path))[0]
        video_frames_dir = os.path.join(frames_dir, video_name)
        os.makedirs(video_frames_dir, exist_ok=True)

        for i, frame_idx in enumerate(frame_indices):
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
            ret, frame = cap.read()

            if not ret:
                print(f"Frame at position {frame_idx} could not be read.")
                continue

            # Save the frame within the video-specific folder
            frame_name = f"frame_{video_name}_{i}.jpg"
            frame_path = os.path.join(video_frames_dir, frame_name)
            # imwrite reports failure through its return value, not an exception
            if not cv2.imwrite(frame_path, frame):
                print(f"Frame at position {frame_idx} could not be written to {frame_path}.")
    finally:
        # Always release the capture handle, also when sampling or reading fails.
        cap.release()

    
def process_video(video_url,
                  clip_video=False,     # whether to save a clipped version (for model testing, etc.)
                  save_full_video=False,# whether to save the full video
                  resolution = None,    # format: '1440p', '1080p', '720p' ...
                  video_save_path='./videos_full',
                  clip_save_path='./videos_clipped',
                  frames_dir='./yt_frames',  # directory to store video frames (for model training)
                  num_frames=20,        # number of frames to extract
                  interval_length=8):
    """Download a video, extract random frames from it and optionally clip it.

    A random interval of `interval_length` seconds is chosen. Frames are
    sampled from outside that interval (unless a frame folder for this video
    already exists), and with `clip_video=True` the interval itself is written
    to `clip_save_path` as "<video name>_trimmed.mp4". Unless
    `save_full_video` is set, the downloaded file is deleted before returning,
    including when the video is too short or a step raises.

    Raises:
        ValueError: If the frame rate of the downloaded video cannot be read.
    """
    # Download video
    print("downloading video")
    video_path = download_video(video_url, video_save_path, resolution)

    try:
        # Get video properties; release the capture handle as soon as they are read
        cap = cv2.VideoCapture(video_path)
        try:
            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            raw_fps = cap.get(cv2.CAP_PROP_FPS)
        finally:
            cap.release()

        # A frame rate of 0 would otherwise surface as a ZeroDivisionError
        if raw_fps <= 0:
            raise ValueError(f"Could not read the frame rate of {video_path}.")

        total_seconds = int(total_frames / raw_fps)
        fps = round(raw_fps)
        print('fps ', fps)

        # Calculate the available duration for the interval
        available_duration = total_seconds - interval_length

        if available_duration <= 0:
            print("Video duration is shorter than the specified interval.")
            return

        # Generate the random start time for the interval
        start_time = random.randint(0, available_duration)

        # Calculate the end time based on the start time and interval length
        end_time = start_time + interval_length

        # Extract frames before trimming the video
        # (the interval printed here is the one excluded from frame sampling)
        print("extracting frames from {} to {}".format(start_time, end_time))

        video_name = os.path.splitext(os.path.basename(video_path))[0]

        # skip if frames already exist
        if os.path.isdir(os.path.join(frames_dir, video_name)):
            print("frames already exist")
        else:
            extract_frames(video_path, frames_dir, num_frames, total_seconds, start_time, end_time)

        # Clip the video if requested
        if clip_video:
            print("clipping video")
            # The clip directory is not created anywhere else
            os.makedirs(clip_save_path, exist_ok=True)
            trimmed_video_path = os.path.join(clip_save_path, f"{video_name}_trimmed.mp4")
            cut_video(video_path, trimmed_video_path, start_time, end_time, fps)
    finally:
        # Keep the whole video only if requested; runs on every exit path
        if not save_full_video and os.path.isfile(video_path):
            os.remove(video_path)


In [None]:
# Set parameters
# All of 2023/07/20, and Game 5, 6 from 7/17
urls = [
    "https://www.youtube.com/watch?v=yLH4b038eiA",
    "https://www.youtube.com/watch?v=16y7o8lATVg&t=14s",
    "https://www.youtube.com/watch?v=iXKU4anCWgI",
    "https://www.youtube.com/watch?v=rQ9F3q8P_7Q",
    "https://www.youtube.com/watch?v=eidrwAskGe0",
    "https://www.youtube.com/watch?v=C6ocY4pFGPQ",
]

# Process the videos one by one. A failure is reported and the loop moves on,
# so a single bad URL does not abort the whole batch.
for url in urls:
    try:
        process_video(url)
    except Exception as e:
        # Name the URL and the exception type: str(e) alone is often
        # uninformative (a KeyError, for instance, prints only the key).
        print(f"Failed to process {url}: {type(e).__name__}: {e}")

In [15]:
# One-off run for a single video: keep the full download, write no test clip.
url = "https://www.youtube.com/watch?v=e12qXHqlbO8&t=64s"
process_video(
    video_url=url,
    clip_video=False,
    save_full_video=True,
)

downloading video
Original Video Name: 20230626 - Game 4.webm
Downloading video 20230626_Game4.webm...
converting


ffmpeg version 6.0 Copyright (c) 2000-2023 the FFmpeg developers
  built with Apple clang version 14.0.3 (clang-1403.0.22.14.1)
  configuration: --prefix=/opt/homebrew/Cellar/ffmpeg/6.0 --enable-shared --enable-pthreads --enable-version3 --cc=clang --host-cflags= --host-ldflags= --enable-ffplay --enable-gnutls --enable-gpl --enable-libaom --enable-libaribb24 --enable-libbluray --enable-libdav1d --enable-libmp3lame --enable-libopus --enable-librav1e --enable-librist --enable-librubberband --enable-libsnappy --enable-libsrt --enable-libsvtav1 --enable-libtesseract --enable-libtheora --enable-libvidstab --enable-libvmaf --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxml2 --enable-libxvid --enable-lzma --enable-libfontconfig --enable-libfreetype --enable-frei0r --enable-libass --enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenjpeg --enable-libspeex --enable-libsoxr --enable-libzmq --enable-libzimg --disable-libjack --di

fps  30
extracting frames from 492 to 500
