## Set the YouTube URL

In [1]:
# ID of the YouTube video to process; interpolated into the watch and embed URLs below.
YT_ID = "Jsz4E2iNXUA"

In [2]:
# Watch-page URL (passed to the downloader) and embed URL (shown in the inline player).
YT_URL = f"https://www.youtube.com/watch?v={YT_ID}"
EMBED_URL = f"https://www.youtube.com/embed/{YT_ID}"

## Download the video's audio track

In [3]:
from pathlib import Path

In [4]:
from pytube import YouTube

In [5]:
def fetch_youtube_audio(url: str, fetch_path: Path) -> Path:
    """Download the highest-bitrate audio-only MP4 stream of a YouTube video.

    The audio is stored as ``<video_id>.mp4`` inside ``fetch_path``. pytube is
    asked to skip the download when that file is already present
    (``skip_existing=True``).

    Args:
        url: Watch URL of the video.
        fetch_path: Directory that receives the audio file.

    Returns:
        ``fetch_path / "<video_id>.mp4"``, the location of the audio file.

    Raises:
        RuntimeError: If the video exposes no audio-only MP4 stream.
    """
    ext = "mp4"
    order = "abr"

    yt = YouTube(url)
    # pytube's availability check; expected to raise for videos that cannot
    # be fetched, before any stream lookup is attempted.
    yt.check_availability()

    filename = f"{yt.video_id}.{ext}"
    download_path = fetch_path / filename

    # Audio-only streams in the requested container, best bitrate first.
    audio_streams = yt.streams.filter(only_audio=True, file_extension=ext).order_by(order).desc()
    best_stream = audio_streams.first()
    if best_stream is None:
        # first() yields None on an empty query; fail with a clear message
        # instead of an AttributeError on `None.download`.
        raise RuntimeError(f"No audio-only {ext} stream available for {url}")

    # Pass the directory and the file name separately, as download() expects.
    best_stream.download(output_path=str(fetch_path), filename=filename, skip_existing=True)

    return download_path

In [6]:
# Directory that receives downloaded audio; created on first run and reused afterwards.
audio_files_path = Path("./audio_files")
audio_files_path.mkdir(parents=False, exist_ok=True)

In [7]:
# Fetch the audio for the configured video and display where it was saved.
downloaded_audio_file = fetch_youtube_audio(
    url=YT_URL,
    fetch_path=audio_files_path,
)
downloaded_audio_file

PosixPath('audio_files/Jsz4E2iNXUA.mp4')

## Transcribe with Whisper

In [9]:
# Prerequisite (run once in a shell, not in this kernel): apt update && apt install git ffmpeg --yes

In [10]:
import os
import csv

In [11]:
import torch
import whisper
from whisper.utils import write_vtt

In [12]:
# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    torch_device = "cuda"
else:
    torch_device = "cpu"

In [13]:
# Show the model names offered by the installed whisper package.
whisper.available_models()

['tiny.en',
 'tiny',
 'base.en',
 'base',
 'small.en',
 'small',
 'medium.en',
 'medium',
 'large']

In [15]:
# Directory handed to whisper.load_model as download_root (relative to the working directory).
WHISPER_HOME = "../../models"

In [16]:
# Which Whisper model to load; one of the names listed by whisper.available_models().
WHISPER_MODEL_NAME = "base"

In [17]:
# Load the selected model onto the chosen device, using WHISPER_HOME as the download root.
WHISPER_MODEL = whisper.load_model(
    name=WHISPER_MODEL_NAME,
    device=torch_device,
    download_root=WHISPER_HOME,
)

100%|███████████████████████████████████████| 139M/139M [00:11<00:00, 12.5MiB/s]


In [18]:
def transcribe_audio_to_vtt(audio_path: Path, transcribe_path: Path, model=WHISPER_MODEL) -> Path:
    """Transcribe an audio file and save the resulting segments as WebVTT.

    Args:
        audio_path: Audio file to transcribe.
        transcribe_path: Directory that receives the ``.vtt`` file.
        model: Object whose ``transcribe`` method returns a mapping with a
            ``"segments"`` entry. Defaults to the module-level
            ``WHISPER_MODEL`` (bound when this function is defined).

    Returns:
        ``transcribe_path / "<audio file stem>.vtt"``, the written file.
    """
    ext = "vtt"
    vtt_path = transcribe_path / f"{audio_path.stem}.{ext}"

    # The model is handed a plain string path rather than a Path object.
    result = model.transcribe(str(audio_path))

    # NOTE(review): newer openai-whisper releases appear to expose `get_writer`
    # in place of `write_vtt`; confirm against the installed version.
    with open(vtt_path, "w", encoding="utf-8") as vtt:
        write_vtt(result["segments"], file=vtt)

    return vtt_path

In [19]:
# Directory that receives the generated .vtt files; created on first run and reused afterwards.
transcription_files_path = Path("./transcription_files")
transcription_files_path.mkdir(parents=False, exist_ok=True)

In [21]:
# Transcribe the downloaded audio and display where the WebVTT file was written.
transcription_file = transcribe_audio_to_vtt(
    audio_path=downloaded_audio_file,
    transcribe_path=transcription_files_path,
)
transcription_file

PosixPath('transcription_files/Jsz4E2iNXUA.vtt')

## Render the transcription

In [22]:
import webvtt

In [23]:
from IPython.display import IFrame

In [24]:
# Embed the video player so the transcript below can be checked against the audio.
IFrame(src=EMBED_URL, width=800, height=450)

In [25]:
# Print a 40-cue window (cues 160-199) of the transcript as "start end text".
preview = webvtt.read(transcription_file)[160:200]
for caption in preview:
    print(caption.start, caption.end, caption.text)

00:09:01.400 00:09:04.520 and translation tasks for more information on maybe
00:09:04.520 00:09:08.560 data sets that you want to play with.
00:09:08.560 00:09:12.640 So let's go ahead and start looking at how
00:09:12.640 00:09:16.800 do we actually build a summarization or a translation model?
00:09:16.800 00:09:18.640 And how do we use it?
00:09:18.640 00:09:22.360 And as usual, we start with asking the question, which
00:09:22.360 00:09:25.920 is what's a good data set to look at.
00:09:25.920 00:09:27.960 And in particular, when we're doing this,
00:09:27.960 00:09:31.640 we're doing a text to text task.
00:09:31.640 00:09:34.520 And so we want to look for data sets
00:09:34.520 00:09:37.240 where we have our inputs as a text
00:09:37.240 00:09:42.600 and our labels as the text that we want to generate or predict.
00:09:42.600 00:09:47.040 And this is actually, I think the sequence of sequence bits
00:09:47.040 00:09:49.280 in particular summarization are the things
00:09:49.280 