<a href="https://colab.research.google.com/github/markusstrasser/council/blob/main/Youtube_Playlist_Transcription.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## SETUP

In [None]:
# pytube provides the YouTube / Playlist classes imported by the UTILS cell.
!pip install pytube
# Whisper is installed straight from its GitHub repository; -q plus the
# redirect to /dev/null keeps pip's install log out of the cell output.
!pip install -q git+https://github.com/openai/whisper.git > /dev/null
#@markdown **NVIDIA GPU**
import subprocess

# Ask nvidia-smi for the GPU name plus total/free memory and show the answer.
# The tool prints its own diagnostic when it cannot reach the driver, so that
# case needs no handling here; only a missing binary has to be caught,
# otherwise the whole setup cell aborts before anything else runs.
try:
    sub_p_res = subprocess.run(
        ['nvidia-smi', '--query-gpu=name,memory.total,memory.free', '--format=csv,noheader'],
        stdout=subprocess.PIPE,
    ).stdout.decode('utf-8')
except OSError as err:
    sub_p_res = f"nvidia-smi could not be executed ({err}); no GPU information available.\n"
print(sub_p_res)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.




## Define Entities (people/organizations/podcasts) and corresponding source videos

In [None]:
# Maps an entity name (person / organization / podcast) to the YouTube URLs
# that should be downloaded for it. Each URL is either a playlist link or a
# watch link carrying a `list=` parameter.
# The key is passed as `entity` to URL2mp3, which stores it in every
# `<id>_meta.json` file.
# NOTE(review): "Andrew Kaparthy" looks like a misspelling of "Andrej Karpathy".
# Renaming the key would make new metadata disagree with files already written
# under the old spelling - confirm before changing.
sources = {
    "Andrew Kaparthy": ["https://www.youtube.com/watch?v=VMj-3S1tku0&list=PLAqhIrjkxbuWI23v9cThsA9GvCAUhRvKZ", # Karpathy source 1 (watch link + playlist id)
    "https://www.youtube.com/playlist?list=PL9dX7Elz2t0-c4Sy7trowtrDlbRWaoi6f", # Karpathy source 2 (playlist)
    "https://www.youtube.com/playlist?list=PL9zq2zalZB1ID7wBz5fi9cQ8jZB3FH07q"] # Karpathy source 3 (playlist)
    ,
    "FastAI": ["https://www.youtube.com/watch?v=F4tvM4Vb3A0&list=PLfYUBJiXbdtSvpQjSnJJ_PmDQB_VyT5iU"],
    "Fei-Fei Li": ["https://youtube.com/playlist?list=PL9dX7Elz2t08Az4p5MghL1EpfQXs9urPx"],
    "Yann LeCun": ["https://youtube.com/playlist?list=PL9dX7Elz2t09sz_rNmRDPCux-ZGC09vWS"]
}

In [None]:
#@markdown **Model and Output Paths**
# `output_path` is where audio, metadata and transcripts are read and written.
# It starts as a local directory and is replaced by the Drive folder below
# when mounting succeeds.
print("Local Path Variables:\n")

output_path = "/content/output/" #@param {type:"string"}

#@markdown **Google Drive Path Variables (Optional)**
mount_google_drive = True #@param {type:"boolean"}
force_remount = False

if mount_google_drive:
    try:
        # Imported inside the try so that a missing google.colab package
        # (i.e. running outside Colab) takes the same fallback as a failed mount.
        from google.colab import drive # type: ignore
        drive_path = "/content/drive"
        drive.mount(drive_path,force_remount=force_remount)
        output_path_gdrive = "/content/drive/MyDrive/YT/" #@param {type:"string"}
        output_path = output_path_gdrive
    except Exception as err:
        # `Exception` rather than a bare except: Ctrl-C during the interactive
        # auth prompt must still interrupt the cell instead of being swallowed.
        print("...error mounting drive or with drive path variables")
        print(f"...reason: {err!r}")
        print("...reverting to default path variables")

import os

os.makedirs(output_path, exist_ok=True)

print(f"output_path: {output_path}")

Local Path Variables:

Mounted at /content/drive
output_path: /content/drive/MyDrive/YT/


In [None]:
#@markdown **Define UTILS**
import re
from pytube import YouTube, Playlist
import json
import datetime
import os
from operator import attrgetter
from pytube.exceptions import RegexMatchError
from tqdm.auto import tqdm  # !pip install tqdm

# Accepts youtube.com, youtube-nocookie.com and youtu.be links (scheme and
# www./m. prefix optional). Raw strings avoid invalid-escape warnings for "\/"
# and friends; the dot in "youtu.be" is escaped so it only matches a literal dot.
_YT_URL_PATTERN = re.compile(
    r"^((?:https?:)?\/\/)?((?:www|m)\.)?((?:youtube(-nocookie)?\.com|youtu\.be))"
    r"(\/(?:[\w\-]+\?v=|embed\/|v\/)?)([\w\-]+)(\S+)?$"
)


def playlist2URLs(yt_url):
    """Return a list of video URLs for a YouTube playlist URL.

    Args:
        yt_url: A YouTube URL. Playlist links (or watch links carrying a
            playlist id) are expanded; any other YouTube URL is returned as a
            single-element list.

    Returns:
        list[str]: The video URLs of the playlist, or ``[yt_url]`` when the
        URL cannot be expanded as a playlist.

    Raises:
        ValueError: If ``yt_url`` does not look like a YouTube URL.
    """
    if not _YT_URL_PATTERN.match(yt_url):
        raise ValueError("passed URL is not a Youtube URL")
    try:
        # Materialise inside the try: depending on the pytube version,
        # `video_urls` may be evaluated lazily, in which case a non-playlist
        # URL would only fail later, outside this fallback.
        return list(Playlist(yt_url).video_urls)
    except Exception:
        # Best effort: passed a YT url that is not a playlist (or expansion failed).
        return [yt_url]

def id2ytURL(video_id):
    """Build the standard watch URL for a YouTube video id."""
    watch_prefix = "https://www.youtube.com/watch?v="
    return watch_prefix + video_id


def yt_meta(url):
    """Collect selected metadata of a YouTube video into a plain dict.

    Args:
        url: URL of the video, passed to pytube's ``YouTube``.

    Returns:
        dict: One entry per name in ``yt_props`` plus ``"timestamp"``.
        ``"publish_date"`` is the video's publish date formatted as
        ``MM/DD/YYYY`` and ``"timestamp"`` is the same date as rounded epoch
        seconds. Both are ``None`` when no publish date is available.
    """
    yt = YouTube(url)

    yt_props = ["video_id",
            "title",
            "description",
            "author",
            "keywords",
            "channel_url",
            "length",
            "views",
            "publish_date",
            ]

    meta = {prop: getattr(yt, prop) for prop in yt_props}
    published = meta["publish_date"]
    if published is None:
        # presumably pytube yields None when it cannot find a date - verify;
        # handled so one video without a date does not abort a whole download run.
        meta["publish_date"] = None
        meta["timestamp"] = None
    else:
        # Format the publish date itself. The previous `date.now()` call
        # returned the current time, so every file was stamped with the
        # download day instead of the publish day.
        meta["publish_date"] = published.strftime("%m/%d/%Y")
        meta["timestamp"] = round(published.timestamp())
    return meta
    

def ytURL2id(url):
    """Extract the 11-character video id from the start of a YouTube URL.

    Prints the id and returns it when the URL matches; otherwise prints a
    notice and returns ``False``.
    """
    found = re.match(
        r'(https?://)?(www\.)?(youtube|youtu|youtube-nocookie)\.(com|be)/(watch\?v=|embed/|v/|.+\?v=)?(?P<id>[A-Za-z0-9\-=_]{11})',
        url,
    )
    if found:
        video_id = found.group('id')
        print(video_id)
        return video_id
    print('no ID found for YT url')
    return False

def playlist2IDs(pl_url):
    """Return the result of ``ytURL2id`` for every URL that ``playlist2URLs`` yields for ``pl_url``."""
    video_ids = []
    for video_url in playlist2URLs(pl_url):
        video_ids.append(ytURL2id(video_url))
    return video_ids


def URL2mp3(url, entity="", output_path=output_path, hq_audio=False):
    """Download a video's audio stream and write its metadata next to it.

    Writes two files into ``output_path``: ``<id>.mp3`` (the downloaded audio
    stream) and ``<id>_meta.json`` (the dict from ``yt_meta`` plus ``entity``).

    Args:
        url: URL of the YouTube video.
        entity: Person / podcast / organization the video belongs to; stored
            under the ``"entity"`` key of the metadata file.
        output_path: Target directory. The default is the module-level
            ``output_path`` captured when this function is defined.
        hq_audio: Selects the second audio-only stream instead of the first.

    Returns:
        None
    """
    # Original author's note: hq is 128k, else 48k.
    # NOTE(review): this relies on the order in which pytube lists the
    # audio-only streams - confirm.
    stream_idx = 1 if hq_audio else 0
    yt = YouTube(url)
    ID = ytURL2id(url)
    metadata = yt_meta(url)
    metadata["entity"] = entity

    if not ID:
        # ytURL2id returns False when its regex does not match; without this
        # fallback the files would be written as "False.mp3" / "False_meta.json".
        ID = metadata["video_id"]

    yt.streams.filter(only_audio=True)[stream_idx].download(
        output_path=output_path,
        filename=f"{ID}.mp3",
    )

    # os.path.join avoids the doubled "/" when output_path ends with a slash.
    with open(os.path.join(output_path, f"{ID}_meta.json"), 'w') as f:
        json.dump(metadata, f)

# playlist2IDs("https://www.youtube.com/watch?v=VMj-3S1tku0&list=PLAqhIrjkxbuWI23v9cThsA9GvCAUhRvKZ&index=1&t=112s")

## Sources (Entities) as YT playlists

## Download sources from YouTube as MP3 (each with an `<id>_meta.json` metadata file)

In [None]:
import os

# Downloading is opt-in: flip the flag to fetch audio and metadata for every
# URL listed in `sources`.
download_sources = False #@param {type:"boolean"}

if download_sources:
    for entity, playlists in sources.items():
        print(entity, playlists)
        for playlist in playlists:
            print(f'{entity}: Playlist {playlist}')
            # Wrap the iterable so the bar tracks videos in this playlist.
            # Wrapping the call `tqdm(URL2mp3(...))` handed tqdm the function's
            # None return value and created a new, never-updated bar per video.
            for video in tqdm(playlist2URLs(playlist), desc=entity):
                print(f'Video: {video}')
                URL2mp3(video, entity=entity)

## Transcribe MP3s (using Whisper) [no speaker separation yet]




In [None]:
import whisper
from os.path import exists

# num_speakers = 2 #@param {type:"integer"}
language = 'English' #@param ['any', 'English']
model_size = 'medium' #@param ['tiny', 'base', 'small', 'medium', 'large']

# For English audio, use the English-only checkpoint "<size>.en".
# 'large' is excluded because it has no ".en" variant.
model_name = model_size
if language == 'English' and model_size != 'large':
  model_name += '.en'

# Load the resolved name. Passing `model_size` here ignored the ".en" suffix
# computed above, so the `language` setting had no effect.
model = whisper.load_model(model_name)


def transcribe(yt_id, entity="", model=model):
    """Transcribe ``<output_path>/<yt_id>.mp3`` and tag the result.

    Runs ``model.transcribe`` on the audio file, then stores ``yt_id`` under
    ``'video_id'`` and ``entity`` under ``"entity"`` in the returned result.
    Nothing is written to disk here.
    """
    audio_file = f'{output_path}/{yt_id}.mp3'
    transcript = model.transcribe(audio_file)
    transcript.update({'video_id': yt_id, "entity": entity})
    return transcript

100%|█████████████████████████████████████| 1.42G/1.42G [00:46<00:00, 33.2MiB/s]


In [None]:
# Transcribe every downloaded MP3 that does not have a transcript yet.
# For each `<id>.mp3` the loop reads `<id>_meta.json` and writes `<id>_whisper.json`.
for filename in os.listdir(output_path):
    # Skip everything that is not audio. Previously only the id/out assignment
    # was guarded, so each *_meta.json / *_whisper.json entry re-ran the body
    # with the previous file's id (and raised NameError if it came first).
    if not filename.endswith(".mp3"):
        continue

    # splitext keeps ids intact even if they contain dots.
    # NOTE: `id` shadows the builtin; the name is kept in case later cells read it.
    id = os.path.splitext(filename)[0]
    print(id)
    out = os.path.join(output_path, f'{id}_whisper.json')

    if exists(out):
        print(out, " -- already exists")
        continue

    #read meta data to add to whisper file (to later connect files)
    metafile = os.path.join(output_path, f'{id}_meta.json')
    print(metafile)

    try:
        with open(metafile) as f:
            meta = json.load(f)
    except (OSError, ValueError) as err:
        # Missing/unreadable file or invalid JSON: skip this audio file only.
        print("Error with ", metafile, "... moving on")
        print(f"   reason: {err!r}")
        continue

    entity = meta["entity"] #person/podcast etcs
    title = meta["title"]

    #Transcribe audio
    print(f'TRANSCRIBING {round(meta["length"]/60)}min : {title} : {entity} -- {id}')
    result = transcribe(id, entity)

    with open(out, 'w') as f:
        json.dump(result, f)

            

WqIzUopTPvU
/content/drive/MyDrive/YT//WqIzUopTPvU_whisper.json  -- already exists
/content/drive/MyDrive/YT//WqIzUopTPvU_whisper.json  -- already exists
VMj-3S1tku0
/content/drive/MyDrive/YT//VMj-3S1tku0_whisper.json  -- already exists
PaCmpygFfXo
/content/drive/MyDrive/YT//PaCmpygFfXo_whisper.json  -- already exists
TCH_1BHY58I
/content/drive/MyDrive/YT//TCH_1BHY58I_whisper.json  -- already exists
P6sfmUTpUmc
/content/drive/MyDrive/YT//P6sfmUTpUmc_whisper.json  -- already exists
q8SA3rM6ckI
/content/drive/MyDrive/YT//q8SA3rM6ckI_whisper.json  -- already exists
3SypMvnQT_s
/content/drive/MyDrive/YT//3SypMvnQT_s_whisper.json  -- already exists
/content/drive/MyDrive/YT//3SypMvnQT_s_whisper.json  -- already exists
a510m7s_SVI
/content/drive/MyDrive/YT//a510m7s_SVI_whisper.json  -- already exists
/content/drive/MyDrive/YT//a510m7s_SVI_whisper.json  -- already exists
/content/drive/MyDrive/YT//a510m7s_SVI_whisper.json  -- already exists
/content/drive/MyDrive/YT//a510m7s_SVI_whisper.json 

