## Set the YouTube URL

In [1]:
# The video ID is the single source of truth: the watch URL (used for the
# download) and the embed URL (used for the inline player) are both derived
# from it, so they cannot drift apart when the video is changed.
VIDEO_ID = "Jsz4E2iNXUA"
YT_URL = f"https://www.youtube.com/watch?v={VIDEO_ID}"
EMBED_URL = f"https://www.youtube.com/embed/{VIDEO_ID}"

## Download the video's audio track

In [2]:
!pip install pytube



In [3]:
from pathlib import Path

In [4]:
from pytube import YouTube

In [5]:
def fetch_youtube_audio(url: str, fetch_path: Path) -> Path:
    """Download the top-ranked audio-only mp4 stream of a YouTube video.

    Streams are ranked by their ``abr`` attribute in descending order and the
    first one is downloaded. An already existing file is left alone
    (``skip_existing=True``).

    Args:
        url: URL of the YouTube video.
        fetch_path: Directory that receives the file; created if missing.

    Returns:
        Path of the audio file, ``fetch_path/<video_id>.mp4``.

    Raises:
        ValueError: If the video offers no audio-only mp4 stream.
    """
    ext = 'mp4'
    order = 'abr'

    yt = YouTube(url)
    # Fail early if the video cannot be accessed.
    yt.check_availability()

    filename = f"{yt.video_id}.{ext}"
    download_path = fetch_path / filename

    best_stream = (
        yt.streams
        .filter(only_audio=True, file_extension=ext)
        .order_by(order)
        .desc()
        .first()
    )
    # first() yields None for an empty query; report that explicitly instead of
    # letting it surface as an AttributeError on None.
    if best_stream is None:
        raise ValueError(f"No audio-only {ext} stream available for {url}")

    # Do not rely on the caller having created the target directory.
    fetch_path.mkdir(parents=True, exist_ok=True)
    best_stream.download(
        output_path=str(fetch_path),
        filename=filename,
        skip_existing=True,
    )

    return download_path

In [6]:
# Directory (relative to the working directory) that receives downloaded audio;
# exist_ok=True makes re-running this cell harmless.
audio_files_path = Path('./audio_files')
audio_files_path.mkdir(exist_ok=True)

In [7]:
# Fetch the audio for YT_URL into audio_files_path; the bare name on the last
# line makes the notebook display the returned path.
downloaded_audio_file = fetch_youtube_audio(YT_URL, audio_files_path)
downloaded_audio_file

PosixPath('audio_files/Jsz4E2iNXUA.mp4')

## Transcribe with Whisper

In [8]:
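# System prerequisites (run in a shell, not in this kernel, if they are missing):
# git is needed for the pip-from-GitHub install in the next cell.
#   apt update && apt install git ffmpeg --yes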

In [9]:
!pip install git+https://github.com/openai/whisper.git

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-cu6ip6ei
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-cu6ip6ei
  Resolved https://github.com/openai/whisper.git to commit deafef05f33179b8dd865893eb4705b513f906dc
  Preparing metadata (setup.py) ... [?25ldone


In [10]:
import os
import csv

In [11]:
import torch
import whisper
from whisper.utils import write_vtt

In [12]:
# Run on the GPU when torch reports CUDA as available, otherwise on the CPU.
if torch.cuda.is_available():
    torch_device = "cuda"
else:
    torch_device = "cpu"

In [13]:
# Show the model names offered by the installed whisper package.
whisper.available_models()

['tiny.en',
 'tiny',
 'base.en',
 'base',
 'small.en',
 'small',
 'medium.en',
 'medium',
 'large']

In [14]:
# Optional directory for downloaded model files, taken from the environment.
# .get() leaves this as None when WHISPER_HOME is unset instead of raising
# KeyError on a fresh machine; whisper.load_model accepts download_root=None
# and then uses its own default location.
WHISPER_HOME = os.environ.get('WHISPER_HOME')

In [15]:
# Model to load; 'base' is one of the names listed by whisper.available_models().
WHISPER_MODEL_NAME = 'base'

In [16]:
# Load the selected model once, on torch_device; download_root points whisper
# at WHISPER_HOME for the model files. The loaded model is reused as the
# default `model` argument of transcribe_audio_to_vtt.
WHISPER_MODEL = whisper.load_model(WHISPER_MODEL_NAME, device=torch_device, download_root=WHISPER_HOME)

In [17]:
def transcribe_audio_to_vtt(audio_path: Path, transcribe_path: Path, model=WHISPER_MODEL) -> Path:
    """Transcribe an audio file and save the result as a WebVTT file.

    Args:
        audio_path: Audio file to transcribe.
        transcribe_path: Directory the ``.vtt`` file is written into; created
            if missing.
        model: Object whose ``transcribe(str)`` returns a mapping with a
            ``'segments'`` entry. Defaults to ``WHISPER_MODEL`` as it was when
            this function was defined (default arguments bind at definition
            time, so re-run this cell after loading a different model).

    Returns:
        Path of the written file, ``transcribe_path/<audio stem>.vtt``.
    """
    ext = 'vtt'
    vtt_path = transcribe_path / f"{audio_path.stem}.{ext}"

    # Create the output directory up front so a missing folder is not
    # discovered only after the (slow) transcription has finished.
    transcribe_path.mkdir(parents=True, exist_ok=True)

    # transcribe() is handed a plain string path rather than a Path object.
    result = model.transcribe(str(audio_path))

    with open(vtt_path, "w", encoding="utf-8") as vtt:
        write_vtt(result['segments'], file=vtt)

    return vtt_path

In [18]:
# Directory (relative to the working directory) that receives the .vtt
# transcripts; exist_ok=True makes re-running this cell harmless.
transcription_files_path = Path('./transcription_files')
transcription_files_path.mkdir(exist_ok=True)

In [19]:
# Transcribe the downloaded audio with the default model; the bare name on the
# last line makes the notebook display the path of the written .vtt file.
transcription_file = transcribe_audio_to_vtt(downloaded_audio_file, transcription_files_path)
transcription_file

PosixPath('transcription_files/Jsz4E2iNXUA.vtt')

## Render the transcription

In [20]:
!pip install webvtt-py



In [21]:
import webvtt

In [22]:
from IPython.display import IFrame

In [23]:
# Embed the video player inline so the video can be watched next to the transcript.
IFrame(EMBED_URL, width=800, height=450)

In [24]:
# Print a window of the transcript (cue indices 160-199), one cue per line,
# as "start end text".
captions_preview = webvtt.read(transcription_file)[160:200]
for cue in captions_preview:
    print(cue.start, cue.end, cue.text)

00:10:23.360 00:10:27.760  There's another dataset that's presented in the course, which is interesting because
00:10:27.760 00:10:33.800  it actually just uses Amazon reviews and it uses the title of the review as the summary
00:10:33.800 00:10:41.120  you want to predict and the body of the review as the context.
00:10:41.120 00:10:47.000  And definitely one thing to know is if you're actually building a summarization model, you
00:10:47.000 00:10:52.320  really want to understand your particular dataset.
00:10:52.320 00:10:56.240  And especially you want to look at the length of the summaries.
00:10:56.240 00:11:01.440  So the CNN Daily Mail, like the summaries are multiple highlights.
00:11:01.440 00:11:05.200  So like there are like one sentence highlights all concatenated together.
00:11:05.200 00:11:12.560  But if you're working with the dataset where the summaries are just like themes or very
00:11:12.560 00:11:18.240  succinct like short summaries, you may have to do more trai