In [1]:
# importing packages
from pytube import YouTube, Playlist, extract, helpers
import os
import json

In [2]:
import whisper
import torch  # install steps: pytorch.org

device = "cuda" if torch.cuda.is_available() else "cpu"

from tqdm.auto import tqdm  # !pip install tqdm|

  from .autonotebook import tqdm as notebook_tqdm


In [3]:

# https://www.geeksforgeeks.org/download-video-in-mp3-format-using-pytube/
def youtube_url_to_mp3(yt_url, save_path="mp3/"):
    # url input from user
    yt = YouTube(yt_url)
    # extract only audio
    video = yt.streams.filter(only_audio=True).first()

    destination = save_path + '/'
    
    # download the file
    out_file = video.download(output_path=destination)
    
    # save the file
    base, ext = os.path.splitext(out_file) 
    filename = destination + extract.video_id(yt_url) + '.mp3'
    os.rename(out_file, filename) 

    # # save the file
    # base, ext = os.path.splitext(out_file)
    # new_file = base + '.mp3'
    # os.rename(out_file, new_file)

    print('\n created MP3 for' + yt_url + ' at ' + filename)

    # print("Downloaded successfully: \n", new_file)
    return yt, filename 


In [4]:
#karpathy_url = "https://www.youtube.com/watch?v=VMj-3S1tku0&list=PLAqhIrjkxbuWI23v9cThsA9GvCAUhRvKZ&index=1&t=112s"
karpathy_url = "https://www.youtube.com/watch?v=caWEOyNsU3Y"
karpathy_obj = youtube_url_to_mp3(karpathy_url)
print (karpathy_obj)



# fastai_url = "https://www.youtube.com/watch?v=F4tvM4Vb3A0&list=PLfYUBJiXbdtSvpQjSnJJ_PmDQB_VyT5iU"
# fastai_obj = youtube_url_to_mp3(fastai_url)
# print (fastai_obj)



 created MP3 forhttps://www.youtube.com/watch?v=caWEOyNsU3Y at mp3//caWEOyNsU3Y.mp3
(<pytube.__main__.YouTube object: videoId=caWEOyNsU3Y>, 'mp3//caWEOyNsU3Y.mp3')


In [5]:
def get_urls_from_youtube_playlist(playlist_url):
    """Returns a list of video urls from a youtube playlist"""
    playlist = Playlist(playlist_url)
    return playlist.video_urls

In [6]:
karpathy_playlist = "https://www.youtube.com/playlist?list=PLAqhIrjkxbuWI23v9cThsA9GvCAUhRvKZ"

karpathy_urls = get_urls_from_youtube_playlist(karpathy_playlist)
karpathy_playlist_id = extract.playlist_id(karpathy_playlist)

karpathy_playlist_vid_ids = [extract.video_id(url) for url in karpathy_urls]

karpathy_playlist_vid_ids

['VMj-3S1tku0', 'PaCmpygFfXo', 'TCH_1BHY58I', 'P6sfmUTpUmc', 'q8SA3rM6ckI']

In [7]:
from importlib.metadata import metadata
from operator import length_hint

In [8]:
def create_yt_metadata(yt, url,person, whisper_result=''):

    vid_json_metadata = {
        "current_person": person,
        "video_id": extract.video_id(url),
        "url": url,
        "title": yt.title,
        "description": yt.description,
        "author": yt.author,
        "keywords": yt.keywords,
        "channel_url": yt.channel_url,
        "length": yt.length,
        "views": yt.views,
        "whisper_result": whisper_result,
    }

    return vid_json_metadata


In [9]:
import whisper
import torch  # install steps: pytorch.org

device = "cuda" if torch.cuda.is_available() else "cpu"

from tqdm.auto import tqdm  # !pip install tqdm|

In [10]:
model = whisper.load_model("tiny.en").to(device) # change to "large.en" for larger model

In [14]:
# Andrej Karpathy 
karpathy_playlist_url = "https://www.youtube.com/playlist?list=PL9dX7Elz2t0-c4Sy7trowtrDlbRWaoi6f"

# Fei Fei Li
feifei_playlist_url = "https://youtube.com/playlist?list=PL9dX7Elz2t08Az4p5MghL1EpfQXs9urPx"

# Yann LeCun
lecun_playlist_url = "https://youtube.com/playlist?list=PL9dX7Elz2t09sz_rNmRDPCux-ZGC09vWS"

In [16]:
## DEFINE CUSTOM VARIABLES FOR THIS PLAYLIST 

# playlist_url = "https://www.youtube.com/playlist?list=PL9zq2zalZB1ID7wBz5fi9cQ8jZB3FH07q" #karpathy shorts 

playlist_url = lecun_playlist_url
current_person = "Yann LeCun"
save_path = "lecun"

# ============================================

# # loop through a playlist 
video_metadata = {}

# ============================================
# THE MAIN LOOP 
# ============================================

vid_urls = get_urls_from_youtube_playlist(playlist_url)

# get all the videos 
for url in vid_urls:

    try: 
    
        ## =======
        ## Save the Video as an MP3
        yt, path = youtube_url_to_mp3(url, save_path)
        print ('yt', yt)
        print ('path', path)

        # transcribe the resulting mp3 file
        # try: 
        #     whisper_result = model.transcribe(path)
        #     print ('created whisper result for ' + path)
        # except Exception as e: 
        #     print ('could not create whisper result for ' + path)
        #     print (e)
        #     whisper_result = ''

        # save a json object for the whisper_result

        # create the json object and then save it 
        yt_video_id = extract.video_id(url)
        vid_json_metadata = create_yt_metadata(yt, url, current_person, whisper_result)
        video_metadata[url] = vid_json_metadata

        print ('created json metadata for ' + yt.title)

    except Exception as e:
        print (e)
    

# create a json object from video_metadata
# json_data = json.dumps(video_metadata)

with open(save_path + '__video_metadata.json', 'w') as f:
    json.dump(video_metadata, f, indent=2)




 created MP3 forhttps://www.youtube.com/watch?v=DokLw1tILlw at lecun/DokLw1tILlw.mp3
yt <pytube.__main__.YouTube object: videoId=DokLw1tILlw>
path lecun/DokLw1tILlw.mp3
created json metadata for Yann LeCun: "A Path Towards Autonomous AI", Baidu 2022-02-22

 created MP3 forhttps://www.youtube.com/watch?v=VRzvpV9DZ8Y at lecun/VRzvpV9DZ8Y.mp3
yt <pytube.__main__.YouTube object: videoId=VRzvpV9DZ8Y>
path lecun/VRzvpV9DZ8Y.mp3
created json metadata for Yann LeCun: From Machine Learning to Autonomous Intelligence
