In [None]:
# |default_exp transcription
# |default_cls_lvl 3

In [None]:
# |hide
%reload_ext autoreload
%autoreload 2

# transcription

Functions for downloading the audio track of a YouTube video, transcribing it to text with Whisper, and loading the resulting transcript into a DataFrame.

In [None]:
# |export
from __future__ import annotations

import csv, os
from pathlib import Path

import pandas as pd
from pytube import YouTube
import torch
import webvtt
import whisper
from whisper.utils import write_vtt

from course_copilot import utils
from course_copilot.preprocessing import convert_duration_to_seconds

In [None]:
# | hide
import pdb

from fastcore.test import *
import nbdev

from blurr.utils import print_versions

## Video download

In [None]:
# | export
def fetch_youtube_audio(yt_id: str, audio_files_fpath: Path = Path("./transcription/audio_files")) -> Path:
    """Download an audio-only MP4 stream of a YouTube video.

    The audio-only MP4 streams are ordered by `abr` in descending order and the
    first one is downloaded. An already-downloaded file is not fetched again
    (`skip_existing=True`).

    Args:
        yt_id: The video id, i.e. the `v=` value of the watch URL.
        audio_files_fpath: Directory the audio file is saved into; created
            (including parents) if it does not exist.

    Returns:
        Path of the audio file, `<audio_files_fpath>/<video_id>.mp4`.

    Raises:
        ValueError: If the video exposes no audio-only MP4 stream.

    Exceptions raised by `YouTube.check_availability()` propagate unchanged.
    """
    ext = "mp4"
    order = "abr"

    yt = YouTube(f"https://www.youtube.com/watch?v={yt_id}")
    yt.check_availability()

    # Accept plain strings as well as Path objects for the target directory.
    audio_files_fpath = Path(audio_files_fpath)
    audio_files_fpath.mkdir(exist_ok=True, parents=True)

    filename = f"{yt.video_id}.{ext}"
    download_path = audio_files_fpath / filename

    best_stream = yt.streams.filter(only_audio=True, file_extension=ext).order_by(order).desc().first()
    if best_stream is None:
        # `.first()` yields None on an empty query; fail with a clear message
        # instead of an AttributeError on `None.download`.
        raise ValueError(f"No audio-only {ext} stream available for YouTube video '{yt_id}'")

    # Directory and file name are passed separately, as the download API expects.
    best_stream.download(output_path=str(audio_files_fpath), filename=filename, skip_existing=True)
    return download_path

In [None]:
# Example video id used to exercise the download helper.
youtube_id = "Jsz4E2iNXUA"

audio_fpath = fetch_youtube_audio(
    yt_id=youtube_id,
    audio_files_fpath=Path("../transcription/audio_files"),
)
audio_fpath

Path('../transcription/audio_files/Jsz4E2iNXUA.mp4')

## Whisper transcription

In [None]:
# | export
def fetch_transcription(
    audio_fpath: Path,
    transcription_fpath: Path = Path("./transcription/transcriptions"),
    model_fpath: Path = Path("./transcription/models"),
    model_checkptoint: str = "base",
    device="cpu",
    skip_existing: bool = False,
) -> Path:
    """Transcribe an audio file with Whisper and save the segments as a WebVTT file.

    Args:
        audio_fpath: Audio file to transcribe.
        transcription_fpath: Directory the `.vtt` file is written into; created
            (including parents) if missing.
        model_fpath: Directory passed to Whisper as `download_root`; created
            (including parents) if missing.
        model_checkptoint: Name of the Whisper model passed to `whisper.load_model`.
            (The parameter name's spelling is kept as-is for backward compatibility.)
        device: Requested device. Any value other than "cpu" is only honoured
            when `torch.cuda.is_available()` is true; otherwise "cpu" is used.
        skip_existing: When true and the target `.vtt` file already exists,
            return its path without loading the model or transcribing again.
            Defaults to False, i.e. always transcribe and overwrite.

    Returns:
        Path of the written file, `<transcription_fpath>/<audio stem>.vtt`.
    """
    audio_fpath = Path(audio_fpath)
    transcription_fpath = Path(transcription_fpath)
    model_fpath = Path(model_fpath)

    transcription_fpath.mkdir(exist_ok=True, parents=True)
    model_fpath.mkdir(exist_ok=True, parents=True)

    vtt_path = transcription_fpath / f"{audio_fpath.stem}.vtt"
    if skip_existing and vtt_path.exists():
        # Transcription is expensive; reuse the previous result on request.
        return vtt_path

    # Check the cheap string comparison first so a CPU request never queries CUDA.
    torch_device = device if device != "cpu" and torch.cuda.is_available() else "cpu"
    model = whisper.load_model(model_checkptoint, device=torch_device, download_root=str(model_fpath))

    # Transcribe before opening the output file so a failure leaves no empty file behind.
    result = model.transcribe(str(audio_fpath))

    with open(vtt_path, "w", encoding="utf-8") as vtt:
        write_vtt(result["segments"], file=vtt)

    return vtt_path

In [None]:
# Transcribe the downloaded audio; "cuda" is requested here, and the function
# itself falls back to CPU when CUDA is not available.
transcription_fpath = fetch_transcription(
    audio_fpath=audio_fpath,
    transcription_fpath=Path("../transcription/transcriptions"),
    model_fpath=Path("../transcription/models"),
    device="cuda",
)
transcription_fpath

Path('../transcription/transcriptions/Jsz4E2iNXUA.vtt')

## Transcription rendering

In [None]:
# | export
def transcription_to_df(transcription_fpath: Path) -> pd.DataFrame:
    """Load a WebVTT transcript into a DataFrame with one row per caption.

    Args:
        transcription_fpath: Path of the `.vtt` file to read.

    Returns:
        A DataFrame with the columns `elapsed_seconds`, `timestamp` and
        `transcript`, in that order. `timestamp` is the caption's start time as
        a string, `elapsed_seconds` is that timestamp passed through
        `convert_duration_to_seconds`, and `transcript` is the caption text.
        A file without captions yields an empty DataFrame with the same columns.
    """
    records = [
        {"timestamp": caption.start, "transcript": caption.text}
        for caption in webvtt.read(str(transcription_fpath))
    ]

    # Explicit columns keep the schema stable (and avoid a KeyError below)
    # when the file contains no captions.
    df = pd.DataFrame(records, columns=["timestamp", "transcript"])
    df["timestamp"] = df["timestamp"].astype(str)
    df.insert(0, "elapsed_seconds", df["timestamp"].apply(convert_duration_to_seconds))

    return df

In [None]:
# Load the generated transcript and preview its first five rows.
df = transcription_to_df(transcription_fpath=transcription_fpath)
df.head(n=5)

Unnamed: 0,elapsed_seconds,timestamp,transcript
0,0.0,00:00:00.000,Let me make sure everything is as it should be.
1,3.56,00:00:03.560,I'm always fascinated by the fact that people are waiting.
2,6.72,00:00:06.720,It's like it's it's so surprising that people are like on here
3,12.7,00:00:12.700,sometimes early for you.
4,14.12,00:00:14.120,It makes sense for my session.


## Export -

In [None]:
# | hide
# Run nbdev's export step for this project; the cells tagged `# | export` above
# are presumably written to the `transcription` module named by `default_exp`.
nbdev.nbdev_export()