In [2]:
# Install the third-party packages this notebook imports:
#   - openai-whisper provides the `whisper` module used for transcription
#   - yt-dlp provides `yt_dlp.YoutubeDL` used to fetch audio
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
%pip install openai-whisper
%pip install yt-dlp

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting openai-whisper
  Downloading openai-whisper-20230314.tar.gz (792 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m792.9/792.9 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting tiktoken==0.3.1 (from openai-whisper)
  Downloading tiktoken-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m72.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ffmpeg-python==0.2.0 (from openai-whisper)
  Downloading ffmpeg_python-0.2.0-py3-none-any.whl (25 kB)
Building wheels for collected packages: openai-whisper
  Building wheel for openai-whisper (pyproject.toml) ... [?25l[?25hdone
  Created 

In [3]:
from datetime import timedelta
import csv
from IPython.display import FileLink
import whisper
from yt_dlp import YoutubeDL
import warnings

In [4]:
# Options handed to yt-dlp's YoutubeDL for every download in this notebook.
YDL_OPTS = dict(
    # Pick the best audio-only stream, falling back to the best overall format.
    format="bestaudio/best",
    # Run the FFmpegExtractAudio post-processor with MP3 as the preferred codec.
    postprocessors=[
        dict(
            key="FFmpegExtractAudio",
            preferredcodec="mp3",
            preferredquality="192",
        ),
    ],
    # Extra post-processor arguments: set the audio sampling rate to 16kHz
    # for the whisper model.
    postprocessor_args=["-ar", "16000"],
    # Download a single video even when the URL points into a playlist.
    noplaylist=True,
)

In [5]:
def download_audio(filename: str, url: str) -> None:
    """Download the audio for ``url`` with yt-dlp, using ``filename`` as output.

    Args:
        filename: Value passed to yt-dlp as its ``outtmpl`` (output template).
        url: Address of the video whose audio should be fetched.

    The module-level ``YDL_OPTS`` supplies all other yt-dlp settings and is
    left unmodified by this call.
    """
    # Build a per-call options dict instead of writing "outtmpl" into the
    # shared YDL_OPTS, so one call's output path cannot leak into later calls
    # (or into anything else that reads YDL_OPTS).
    options = {**YDL_OPTS, "outtmpl": filename}

    with YoutubeDL(options) as ydl:
        ydl.download([url])

In [6]:
def transcribe_audio(audio_path: str, model_name: str = "base"):
    """Transcribe an audio file with a whisper model.

    Args:
        audio_path: Path of the audio file to transcribe.
        model_name: Name passed to ``whisper.load_model``. Defaults to
            ``"base"``, the model this notebook originally hard-coded.

    Returns:
        Whatever ``model.transcribe`` returns; elsewhere in this notebook its
        ``"segments"`` entry is read to build the CSV output.
    """
    model = whisper.load_model(model_name)
    with warnings.catch_warnings():
        # Silence only the FP16 notice; any other warning raised during
        # transcription stays visible instead of being swallowed wholesale.
        warnings.filterwarnings("ignore", message="FP16 is not supported on CPU")
        return model.transcribe(audio_path)

In [7]:
def save_transcriptions(result, output_file):
    """Write the segments of a transcription result to a CSV file.

    Args:
        result: Mapping with an optional ``"segments"`` list; each segment is a
            mapping with ``"start"`` and ``"end"`` (seconds) and ``"text"``.
        output_file: Path of the CSV file to create or overwrite.

    The file gets a header row (``Start Time, End Time, Text``) followed by one
    row per segment. Times are written as ``datetime.timedelta`` strings and
    leading whitespace is removed from each segment's text. When ``result`` has
    no ``"segments"`` key, only the header is written.
    """
    # newline="" hands line-ending control to the csv module (otherwise rows
    # are separated by blank lines on Windows); an explicit UTF-8 encoding
    # keeps non-ASCII transcription text independent of the platform locale.
    with open(output_file, "w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(["Start Time", "End Time", "Text"])
        for seg in result.get("segments", []):
            writer.writerow([
                timedelta(seconds=seg["start"]),
                timedelta(seconds=seg["end"]),
                seg["text"].lstrip(),
            ])

In [8]:
# Prompt the user to upload the audio file
from google.colab import files
uploaded = files.upload()

# Fail with a clear message when nothing was uploaded, instead of the bare
# StopIteration that next(iter(...)) raises on an empty result.
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and choose an audio file.")

# Get the filename of the uploaded file (the first one if several were chosen)
uploaded_filename = next(iter(uploaded))

Saving Shape of You.mp3 to Shape of You.mp3


In [9]:
# Download audio from YouTube, writing it to a path named after the uploaded file.
# NOTE(review): the upload step has already saved a file under this name, and the
# recorded run of this cell shows yt-dlp reporting the target as "already been
# downloaded". The audio transcribed afterwards therefore appears to be the
# uploaded file, not the video at video_url -- confirm which source is intended.
audio_filename = f"/content/{uploaded_filename}"
video_url = "https://www.youtube.com/watch?v=z6xslDMimME"
download_audio(audio_filename, video_url)

[youtube] Extracting URL: https://www.youtube.com/watch?v=z6xslDMimME
[youtube] z6xslDMimME: Downloading webpage
[youtube] z6xslDMimME: Downloading ios player API JSON
[youtube] z6xslDMimME: Downloading android player API JSON
[youtube] z6xslDMimME: Downloading player b7910ca8
[youtube] z6xslDMimME: Downloading m3u8 information
[info] z6xslDMimME: Downloading 1 format(s): 251
[download] /content/Shape of You.mp3 has already been downloaded
[download] 100% of   10.25MiB
[ExtractAudio] Not converting audio /content/Shape of You.mp3; file is already in target format mp3


In [10]:
# Transcribe the audio file at audio_filename; transcribe_audio loads the
# whisper model and returns the result of model.transcribe.
transcription_result = transcribe_audio(audio_filename)

100%|████████████████████████████████████████| 139M/139M [00:01<00:00, 107MiB/s]


In [11]:
# Save transcriptions to CSV file: a header row, then one row per segment
# holding its start time, end time, and text.
output_csv_file = "/content/output.csv"
save_transcriptions(transcription_result, output_csv_file)

In [12]:
# Display the download link for the generated CSV.
# NOTE(review): this notebook runs on Colab (it imports google.colab); a FileLink
# to a /content path may not resolve there -- confirm, and consider
# files.download(output_csv_file) if the link is dead.
FileLink(output_csv_file)