In [1]:
import urllib
from bs4 import BeautifulSoup
from pytube import YouTube
import cv2
import os
import glob
import numpy as np
from youtube_search import YoutubeSearch
import Hyperparameters as hyp

In [2]:
# Directory prefixes, written with a trailing slash.
train_path = 'Data/Training2/'
test_path = 'Data/Testing/'

# Values taken from the shared Hyperparameters module.
clip_len = hyp.CLIP_LEN
fps = hyp.FPS
dim = (hyp.FRAME_RES,) * 2  # square frames: (FRAME_RES, FRAME_RES)

# (name, url) pairs; the name is used as the file stem when the video is saved.
full_broad_urls = [
    ('BayRmd072119', 'https://www.youtube.com/watch?v=kEyqyYJhIIg'),
    ('WolvesArsenal042419', 'https://www.youtube.com/watch?v=DWYKhbawzQ8'),
]

# (name, url) pairs, same layout as full_broad_urls.
highlight_urls = [
    ('EvertonManUtd110920H', 'https://www.youtube.com/watch?v=dM1j3XAhwIE'),
    ('WolvesManCity122719H', 'https://www.youtube.com/watch?v=OtdjPcLMP5Y'),
    ('BarBetis110920H', 'https://www.youtube.com/watch?v=KugeDpzfOBY'),
    ('ChelSeffUtd110820H', 'https://www.youtube.com/watch?v=uyp75pH_mzU'),
    ('ManUtdTot100520H', 'https://www.youtube.com/watch?v=dnjNhcMsT1c'),
    ('SouTot092220H', 'https://www.youtube.com/watch?v=Wt-sXiQMGAc'),
]

# YouTube search strings used to collect the training videos.
train_queries = [
    "La Liga Highlights",
    "Soccer Highlights",
    "Serie A Highlights",
    "Bundesliga highlights",
    "Russian Premier Liga Highlights",
    "K-League Highlights",
    "Eredivisie Highlights",
    "Ligue 1 Highlights",
    "MLS Highlights",
    "EPL Highlights",
]

In [3]:
def downloadVideos(query, path, dim, clip_len=hyp.CLIP_LEN, fps=hyp.FPS, maxNumVids=100):
    """
    Method: downloadVideos
    ----------------------
    Searches YouTube for the query and, for every result that does not already have a
    parsed <id>.npy file in `path`, downloads the video, parses it with parseVideo and
    deletes the temporary <id>.mp4 file.
    ----------------------
    Arguments:
    query - the string that we want to search on YouTube
    path - directory the videos are downloaded to and the .npy files are looked up in
    dim - frame size forwarded to parseVideo
    clip_len - forwarded to parseVideo
    fps - forwarded to parseVideo
    maxNumVids - maximum number of search results requested from YoutubeSearch
    ----------------------
    Return: None
    """
    print(f'Scraping videos from the query: {query}')
    results = YoutubeSearch(query, max_results=maxNumVids).to_dict()
    for result in results:
        uniq_id = result['id']
        # os.path.join keeps these paths consistent with the download directory even
        # when `path` is given without a trailing separator.
        npy_path = os.path.join(path, uniq_id + '.npy')
        mp4_path = os.path.join(path, uniq_id + '.mp4')
        print(f"Downloading {result['title']}...")
        if os.path.exists(npy_path):
            print(f'{uniq_id} already downloaded and parsed.')
            continue
        if not download_from_url('https://www.youtube.com' + result['url_suffix'], uniq_id, path):
            continue
        try:
            parseVideo(mp4_path, dim, clip_len, fps)
        finally:
            # The .mp4 is only an intermediate artefact; remove it even if parsing raised
            # so failed videos do not pile up on disk.
            os.remove(mp4_path)

In [4]:
def download_from_url(url, filename, path):
    """
    Method: download_from_url:
    --------------------------
    Downloads the first stream pytube lists for a YouTube video into a directory.
    Failures are reported on stdout and signalled through the return value instead of
    an exception, so callers can simply skip the video.
    --------------------------
    Arguments:
    url - the string form of a url
    filename - file name handed to pytube for the downloaded video
    path - directory the video is written to
    --------------------------
    Return: True if the download succeeded, False otherwise
    """
    try:
        stream = YouTube(url).streams.first()
        if stream is None:
            # first() yields None when the video exposes no streams; report that
            # explicitly instead of relying on an AttributeError on None.
            print(f"Tried to download {filename}, but it did not work because no streams were found...")
            return False
        stream.download(filename=filename, output_path=path)
    except Exception as exc:
        # Deliberately broad: any network or parsing error from pytube should only
        # cause this one video to be skipped.
        print(f"Tried to download {filename}, but it did not work because {exc}...")
        return False
    print(f"Video {filename} downloaded successfully")
    return True

In [5]:
def parseVideo(path, dim, clip_len=hyp.CLIP_LEN, fps=hyp.FPS):
    """
    Method: parseVideo
    ----------------------
    Reads a video file, keeps one frame out of every int(reported FPS) frames, converts
    each kept frame to grayscale, resizes it to `dim`, and saves the stacked frames as a
    .npy file next to the video (same path with the extension replaced).
    ----------------------
    Arguments:
    path - path of the video file to read
    dim - target size passed to cv2.resize
    clip_len - not used by this function; kept so existing callers keep working
    fps - not used by this function; the sampling stride comes from the video's own FPS
    ----------------------
    Return: None (the frame array is written to disk)
    """
    cap = cv2.VideoCapture(path)
    frames = []
    try:
        # A reported frame rate below 1 (0 when it is unknown) would make the modulo
        # below divide by zero, so fall back to keeping every frame.
        stride = max(int(cap.get(cv2.CAP_PROP_FPS)), 1)
        counter = 1
        while True:
            hasFrames, image = cap.read()
            if not hasFrames:
                break
            # Only the sampled frames are converted; the rest are discarded anyway.
            if counter % stride == 0:
                grayImg = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
                frames.append(cv2.resize(grayImg, dim))
            counter += 1
    finally:
        # Release the file handle so the caller can delete the video afterwards.
        cap.release()
    frames = np.array(frames)
    # np.save appends ".npy" to the extension-less path.
    np.save(os.path.splitext(path)[0], frames)
    print(f"Video {path} was parsed successfully with shape {frames.shape}")

In [7]:
# The whole training sweep is repeated num_att times (presumably to retry downloads that
# failed on an earlier pass); videos that already have a .npy are skipped inside
# downloadVideos, so later passes only touch what is still missing.
num_att = 10
for i in range(num_att):
    print(f'Attempt {i+1} out of {num_att} attempts to download the training videos')
    for query in train_queries:
        downloadVideos(query, train_path, dim, clip_len=clip_len, fps=fps)
        print()

# Test videos: same download -> parse -> delete pipeline, keyed by the given name.
for video_name, url in full_broad_urls:
    npy_path = os.path.join(test_path, video_name + '.npy')
    mp4_path = os.path.join(test_path, video_name + '.mp4')
    # Skip videos parsed on a previous run, mirroring the check used for training videos.
    if os.path.exists(npy_path):
        print(f'{video_name} already downloaded and parsed.')
        continue
    if download_from_url(url, video_name, test_path):
        try:
            parseVideo(mp4_path, dim, clip_len=clip_len, fps=fps)
        finally:
            # Remove the intermediate .mp4 even when parsing fails.
            os.remove(mp4_path)

Video BayRmd072119 downloaded successfully
Video Data/Testing/BayRmd072119.mp4 was parsed successfully with shape (6402, 32, 32)
Video WolvesArsenal042419 downloaded successfully
Video Data/Testing/WolvesArsenal042419.mp4 was parsed successfully with shape (6169, 32, 32)
