# Transcribe a YouTube Video in English with AssemblyAI and yt-dlp

### References:

- [AssemblyAI documentation](https://www.assemblyai.com/docs)
- [yt-dlp on GitHub](https://github.com/yt-dlp/yt-dlp)

## Preparation

#### Imports and Globals

In [4]:
import os
import sys
import assemblyai as aai
import yt_dlp
import json

# Put the parent directory on the import path so the `config` import below
# can be resolved (presumably config.py lives one level up — confirm).
project_root = os.path.abspath("..")  # Assuming the notebook is in a subfolder
sys.path.append(project_root)

# NOTE(review): the star import hides which names are pulled in; `aai_key`
# used below is presumably defined in config — confirm.
from config import *

aai.settings.api_key = aai_key  # API key used by the AssemblyAI SDK calls
YT_BASE_URL = 'https://www.youtube.com/watch?v='  # prefix for building a watch URL from a video id
DST_FOLDER = 'files'  # folder used in the yt-dlp output template

#### Task-specific Variables

In [None]:
v_id = '08j09G2CGVc'  # YouTube video id; only needed when the URL is built from YT_BASE_URL instead of being typed in

#### Pull and save the soundtrack with yt-dlp

In [2]:
# Alternative ways of setting the URL, kept for reference:
# url = f'{YT_BASE_URL}{v_id}'
# url = 'https://www.youtube.com/watch?v=4cr3KD0ayOE'
url = input('Enter the URL of the video: ')

ydl_opts = {
    # Format selector: prefer m4a, otherwise fall back to the best
    # audio-only stream, then to the best stream overall.
    'format': 'm4a/bestaudio/best',  # The best audio version in m4a format
    'outtmpl': f'{DST_FOLDER}/%(title)s_%(id)s.%(ext)s',  # <DST_FOLDER>/<title>_<id>.<ext>
}

# extract_info() fetches the metadata and (per the captured cell output)
# downloads the audio; prepare_filename() then gives the path the output
# template resolves to for this video.
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    info = ydl.extract_info(url)
    audio_file = ydl.prepare_filename(info)

print(f'\n>>> Downloaded to: {audio_file}')

[youtube] Extracting URL: https://www.youtube.com/watch?v=mslulWUhY44
[youtube] mslulWUhY44: Downloading webpage
[youtube] mslulWUhY44: Downloading tv client config
[youtube] mslulWUhY44: Downloading player 7d1d50a6
[youtube] mslulWUhY44: Downloading tv player API JSON
[youtube] mslulWUhY44: Downloading ios player API JSON
[youtube] mslulWUhY44: Downloading m3u8 information




[info] mslulWUhY44: Downloading 1 format(s): 140
[download] files\Rabbi： ＂It's Time To Tell You The ENTIRE Truth About Idolatry...＂_mslulWUhY44.m4a has already been downloaded
[download] 100% of   36.74MiB

>>> Downloaded to: files\Rabbi： ＂It's Time To Tell You The ENTIRE Truth About Idolatry...＂_mslulWUhY44.m4a


## Building the transcript via AssemblyAI API

In [3]:
def get_file_name(audio_file: str) -> str:
    """Return the base name of ``audio_file`` without directory or extension.

    Both ``\\`` and ``/`` are treated as path separators, so paths produced
    on Windows and on POSIX systems give the same result. Only the last
    extension is removed, which keeps dots that belong to the title itself
    (e.g. ``"Part 1... intro_abc.m4a"`` -> ``"Part 1... intro_abc"``).
    Surrounding whitespace and double quotes (as left by a pasted path) are
    ignored.

    Args:
        audio_file: Path to the audio file, absolute or relative.

    Returns:
        The file name stem. A name without any extension is returned as is.
    """
    cleaned = audio_file.strip().strip('"')
    # Normalise Windows separators, then keep the last path component only.
    base_name = cleaned.replace("\\", "/").rsplit("/", 1)[-1]
    stem, dot, _extension = base_name.rpartition(".")
    # No dot at all, or a leading-dot name such as ".hidden": nothing to cut.
    return stem if dot and stem else base_name

In [4]:
transcriber = aai.Transcriber()  # shared client, reused by every transcribe() call in this notebook

In [None]:
# no speaker differentiation
config = aai.TranscriptionConfig(language_detection=True)
# Ask for a path only when no earlier cell has set `audio_file`.
if 'audio_file' not in globals():
    audio_file = input('Path to audio: ')
# Normalise once, so the upload and the output file name agree: a pasted
# path often carries surrounding quotes, which are not valid in file names.
audio_file = audio_file.strip().strip('"')
transcript = transcriber.transcribe(audio_file, config)
print(transcript.status, transcript.id)

# A failed job carries no text; report the error instead of crashing on
# f.write(None) and leaving an empty transcript file behind.
if transcript.status == aai.TranscriptStatus.error:
    print(f'Transcription failed: {transcript.error}')
else:
    with open(f'files/transcript_{get_file_name(audio_file)}_{transcript.id}.txt', 'w', encoding='utf-8') as f:
        f.write(transcript.text)

In [None]:
# with speaker differentiation
config = aai.TranscriptionConfig(
    speaker_labels=True
)
# Same normalisation as for the plain transcript: drop surrounding
# whitespace and quotes so the path is usable for upload and file naming.
audio_file = audio_file.strip().strip('"')
transcript = transcriber.transcribe(audio_file, config)
print(transcript.status, transcript.id)

# A failed job has no utterances; report the error instead of raising a
# TypeError after the output file has already been created.
if transcript.status == aai.TranscriptStatus.error:
    print(f'Transcription failed: {transcript.error}')
else:
    with open(f'files/transcript_{get_file_name(audio_file)}_{transcript.id}.txt', 'w', encoding='utf-8') as f:
        # One line per utterance, prefixed with the speaker label.
        for utterance in transcript.utterances:
            f.write(f'Speaker_{utterance.speaker}: {utterance.text}\n')

TranscriptStatus.completed 1212fad6-5dbf-41b7-95bb-2c8b32987e8c


In [8]:
# Save the raw API response as JSON. The context manager closes the file
# deterministically, so the data is flushed to disk before the next cell
# runs (an inline open() would leave the handle dangling).
json_path = f'files/transcript_{get_file_name(audio_file)}_{transcript.id}.json'
with open(json_path, 'w', encoding='utf-8') as f:
    json.dump(transcript.json_response, f, indent=4, ensure_ascii=False)

In [None]:
# Echo the transcript to the cell output, one utterance per line.
for turn in transcript.utterances:
    line = f"Speaker {turn.speaker}: {turn.text}"
    print(line)

## Working with the transcript

In [8]:
# load the transcript by id if necessary
job_id = 'c2cc95ab-9bf5-4cd8-a1b5-33fbb75a2502'  # id of an earlier job, as printed by the transcription cells

transcript = aai.Transcript.get_by_id(job_id)  # rebinds the global `transcript` that the remaining cells use
transcript.id  # echoed as the cell output to confirm which job was loaded

'c2cc95ab-9bf5-4cd8-a1b5-33fbb75a2502'

In [9]:
transcript.export_subtitles_vtt()  # subtitles as one WebVTT string, echoed as the cell output

"WEBVTT\n\n00:02.080 --> 00:06.232\nHello everyone. Welcome to Stream. I'm CCPP and you're tuning into the New Eden\n\n00:06.296 --> 00:09.560\nReview Dev show. Totally not picked for the acronym\n\n00:09.640 --> 00:13.288\nnerds, Am I right, Swift? Yep, that's exactly right. Because we're\n\n00:13.304 --> 00:15.784\njust a bunch of nerds. We are a bunch of nerds. We're nerds.\n\n00:15.912 --> 00:18.872\nAbsolutely. This is a new show we've been wanting to do for a while with\n\n00:18.896 --> 00:22.712\na kind of laid back, kind of podcasty style. And today we're going to go\n\n00:22.736 --> 00:26.440\nover patch notes. We're going to talk about how we feel about everything\n\n00:26.480 --> 00:29.462\nthat's come out so far. We might use this show in the future to go\n\n00:29.486 --> 00:32.294\nover things like Community beats and other things, but we don't want to stick to\n\n00:32.302 --> 00:36.118\na schedule or like commit to one at the moment. Today though, we have\n\n00:36.174 --

In [13]:
# Write the speaker-labelled transcript, named by transcript id only.
# NOTE(review): this targets '../files/' while the other cells write to
# 'files/' — confirm which location is intended.
with open(f'../files/transcript_{transcript.id}.txt', 'w', encoding='utf-8') as out:
    out.writelines(
        f'Speaker_{turn.speaker}: {turn.text}\n'
        for turn in transcript.utterances
    )

In [11]:
transcript.summary  # NOTE(review): no output was captured here; presumably empty unless summarization was requested — confirm

### Search for words

In [3]:
def convert_millis(millis):
    """Format a duration given in milliseconds as ``HH:MM:SS``.

    Fractions of a second are dropped (floor division). Hours are not
    wrapped, so durations of 100 hours or more simply use more digits.

    Args:
        millis: Duration in milliseconds.

    Returns:
        The zero-padded ``HH:MM:SS`` string.
    """
    total_seconds = millis // 1000
    hours, remainder = divmod(total_seconds, 3600)
    minutes, secs = divmod(remainder, 60)
    return f"{hours:02}:{minutes:02}:{secs:02}"


def find_words(query):
    """Print the start times of every query word found in the transcript.

    ``query`` is split on whitespace and each word is looked up in the
    module-level ``transcript``. For every match one line is printed:
    the matched text followed by its ``HH:MM:SS`` start times, separated
    by ``'; '``.

    Args:
        query: One or more words separated by whitespace.
    """
    hits = transcript.word_search(query.split())
    for hit in hits:
        # Only the start of each (start, end) pair is shown.
        stamps = [convert_millis(begin) for begin, _ in hit.timestamps]
        print(f'{hit.text}: ' + '; '.join(stamps))


def find_sequence(query):
    """Print every place where the words of ``query`` occur consecutively.

    The query is split on whitespace and lower-cased, and each distinct word
    is looked up in the module-level ``transcript``. The word indexes
    returned by the search are then used to keep only those occurrences of
    the first word that are directly followed by all remaining query words,
    in order. Each hit is printed as ``HH:MM:SS`` followed by the matched
    words exactly as they appear in ``transcript.words``.

    Unlike a check of the first two words only, this verifies the whole
    phrase, handles repeated words and single-word queries, and reports
    when nothing was found.

    Args:
        query: Phrase to look for, e.g. ``'high court'``.

    Returns:
        None. Hits, or a ``No match`` message, are printed.
    """
    ordered_words = [word.lower() for word in query.split()]
    if not ordered_words:
        # Empty or whitespace-only query: nothing to search for.
        print(f'No match for "{query}"')
        return

    # Search each distinct word once; a phrase may repeat a word.
    matches = transcript.word_search(list(dict.fromkeys(ordered_words)))
    # word -> {position in transcript.words: (start, end)}.
    # Assumes match.text echoes the searched word (compared case-insensitively).
    positions = {
        match.text.lower(): dict(zip(match.indexes, match.timestamps))
        for match in matches
    }

    first_hits = positions.get(ordered_words[0], {})
    found = False
    for index in sorted(first_hits):
        # Every following query word must sit at the next consecutive index.
        if all(
            index + offset in positions.get(word, {})
            for offset, word in enumerate(ordered_words[1:], start=1)
        ):
            found = True
            start = first_hits[index][0]
            phrase = ' '.join(
                item.text
                for item in transcript.words[index:index + len(ordered_words)]
            )
            print(f'{convert_millis(start)} {phrase}')
    if not found:
        print(f'No match for "{query}"')

In [5]:
query = 'forevermore'  # single word: each occurrence is listed with its start time
find_words(query)

forevermore: 00:45:28


In [4]:
query = 'high court'  # two words that must follow each other directly
find_sequence(query)

00:39:07 high court.
00:41:17 high court.


In [None]:
# query = 'daily offerings'
# With the assignment above commented out, this reuses whatever `query`
# currently holds in the kernel and shows the raw search matches.
matches = transcript.word_search(query.split())
matches

[WordSearchMatch(text='now', count=25, timestamps=[(124572, 125004), (375630, 376134), (801950, 802262), (862950, 863310), (1016428, 1017040), (1020372, 1020532), (1427982, 1428478), (1458080, 1458824), (1463208, 1463480), (1465516, 1465836), (1484740, 1485484), (1577618, 1578270), (1704580, 1705212), (2168414, 2168918), (2212946, 2213510), (2276080, 2276440), (2398940, 2399188), (2633236, 2633412), (2705630, 2706226), (2712850, 2713114), (2810930, 2811670), (2843756, 2843980), (2846348, 2846920), (2881790, 2882246), (2908290, 2909030)], indexes=[239, 808, 1899, 2083, 2518, 2525, 3650, 3750, 3762, 3770, 3801, 3984, 4305, 5494, 5622, 5787, 6123, 6748, 6965, 6973, 7240, 7324, 7335, 7431, 7496]),
 WordSearchMatch(text='revert', count=2, timestamps=[(1462624, 1463096), (1465156, 1465452)], indexes=[3761, 3769])]

In [20]:
transcript.words  # word-level results: text, start, end, confidence, speaker, channel

[Word(text='Good', start=1440, end=1552, confidence=0.99349, speaker=None, channel=None),
 Word(text='morning', start=1552, end=1736, confidence=0.99996, speaker=None, channel=None),
 Word(text='and', start=1760, end=1912, confidence=0.99584, speaker=None, channel=None),
 Word(text='welcome', start=1936, end=2136, confidence=0.67908, speaker=None, channel=None),
 Word(text='to', start=2168, end=2312, confidence=0.99951, speaker=None, channel=None),
 Word(text='Worldwide', start=2336, end=2776, confidence=0.54569, speaker=None, channel=None),
 Word(text='Wisdom.', start=2808, end=3192, confidence=0.98676, speaker=None, channel=None),
 Word(text='Folks.', start=3256, end=3544, confidence=0.99498, speaker=None, channel=None),
 Word(text='Today', start=3592, end=3800, confidence=0.99831, speaker=None, channel=None),
 Word(text="we're", start=3840, end=4008, confidence=0.97077, speaker=None, channel=None),
 Word(text='learning', start=4024, end=4264, confidence=0.99088, speaker=None, channe