# Transcribe a YouTube Video in English with AssemblyAI and yt-dlp

### References:

- [AssemblyAI documentation](https://www.assemblyai.com/docs)
- [yt-dlp on GitHub](https://github.com/yt-dlp/yt-dlp)

## Preparation

#### Imports and Globals

In [1]:
import assemblyai as aai
import yt_dlp
import json

from config import *

# Authenticate the AssemblyAI SDK. aai_key is not defined in this notebook;
# presumably it is supplied by the wildcard import from config — verify there.
aai.settings.api_key = aai_key
# Prefix for building a watch URL from a bare video id.
YT_BASE_URL = 'https://www.youtube.com/watch?v='
# Folder used in the yt-dlp output template for the downloaded audio.
DST_FOLDER = 'files'

#### Task-specific Variables

In [None]:
v_id = '08j09G2CGVc'  # main identifier of the video; NOTE(review): only the commented-out URL builder in the download cell reads it, so it may not be needed

#### Pull and save the soundtrack with yt-dlp

In [4]:
# Ask for the video URL interactively.
# Alternative: build it from the id set above -> url = f'{YT_BASE_URL}{v_id}'
url = input('Enter the URL of the video: ')

# yt-dlp options: prefer an m4a audio stream, then fall back to the best
# audio-only stream, then to the best available format. The output file is
# named "<title>_<id>.<ext>" inside DST_FOLDER.
ydl_opts = dict(
    format='m4a/bestaudio/best',
    outtmpl=f'{DST_FOLDER}/%(title)s_%(id)s.%(ext)s',
)

with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    # extract_info fetches the metadata and performs the download.
    info = ydl.extract_info(url)
    # Resolve the path the output template produced for this video.
    audio_file = ydl.prepare_filename(info)

print(f'\n>>> Downloaded to: {audio_file}')

[youtube] Extracting URL: https://www.youtube.com/watch?v=39LG-46p9qM
[youtube] 39LG-46p9qM: Downloading webpage
[youtube] 39LG-46p9qM: Downloading tv client config
[youtube] 39LG-46p9qM: Downloading player b21600d5
[youtube] 39LG-46p9qM: Downloading tv player API JSON
[youtube] 39LG-46p9qM: Downloading ios player API JSON
[youtube] 39LG-46p9qM: Downloading m3u8 information
[info] 39LG-46p9qM: Downloading 1 format(s): 140
[download] Destination: files\The Story Behind the Purim Story_39LG-46p9qM.m4a
[download] 100% of   41.28MiB in 00:01:10 at 603.90KiB/s    
[FixupM4a] Correcting container of "files\The Story Behind the Purim Story_39LG-46p9qM.m4a"

>>> Downloaded to: files\The Story Behind the Purim Story_39LG-46p9qM.m4a


## Building the transcript via AssemblyAI API

In [2]:
def get_file_name(audio_file: str) -> str:
    """Return the base name of ``audio_file`` without folders or extension.

    Both the backslash and the forward slash are treated as path separators,
    whatever the current platform is, and only the final extension is
    removed, so dots inside a video title are preserved.

    Args:
        audio_file: Path to the audio file, in Windows or POSIX style.

    Returns:
        The file name stripped of its directory part and last extension.
    """
    import ntpath  # local import: splits on both separator styles on any OS

    return ntpath.splitext(ntpath.basename(audio_file))[0]

In [3]:
# Transcriber instance shared by the transcription cells below.
transcriber = aai.Transcriber()

In [7]:
# no speaker differentiation
config = aai.TranscriptionConfig(language_detection=True)
# Reuse the path set by the download cell if it ran in this session;
# otherwise ask for it.
if 'audio_file' not in globals():
    audio_file = input('Path to audio: ')
transcript = transcriber.transcribe(audio_file, config)
print(transcript.status)

# A failed job carries no text: stop with a clear message instead of creating
# an empty file and crashing on write(None).
if transcript.status == aai.TranscriptStatus.error:
    raise RuntimeError(f'Transcription failed: {transcript.error}')

# Save the plain text next to the audio, named after the audio file and job id.
with open(f'{DST_FOLDER}/transcript_{get_file_name(audio_file)}_{transcript.id}.txt', 'w', encoding='utf-8') as f:
    f.write(transcript.text)

TranscriptStatus.completed


In [7]:
# with speaker differentiation
config = aai.TranscriptionConfig(
    speaker_labels=True
)
transcript = transcriber.transcribe(audio_file, config)
print(transcript.status, transcript.id)

# A failed job has no utterances: stop with a clear message instead of
# creating an empty file and crashing while iterating None.
if transcript.status == aai.TranscriptStatus.error:
    raise RuntimeError(f'Transcription failed: {transcript.error}')

# One line per utterance, prefixed with the speaker label.
with open(f'{DST_FOLDER}/transcript_{get_file_name(audio_file)}_{transcript.id}.txt', 'w', encoding='utf-8') as f:
    for utterance in transcript.utterances:
        f.write(f'{utterance.speaker}: {utterance.text}\n')

TranscriptStatus.completed 1212fad6-5dbf-41b7-95bb-2c8b32987e8c


In [8]:
# Save the raw API response as pretty-printed JSON. The with-block closes the
# file, so the data is flushed to disk and the handle is released.
json_path = f'files/transcript_{get_file_name(audio_file)}_{transcript.id}.json'
with open(json_path, 'w', encoding='utf-8') as f:
    # ensure_ascii=False keeps non-ASCII characters readable in the file.
    json.dump(transcript.json_response, f, indent=4, ensure_ascii=False)

In [9]:
# Echo the transcript to the cell output, one utterance per line,
# each prefixed with its speaker label.
for utterance in transcript.utterances:
    print("Speaker", f"{utterance.speaker}:", utterance.text)

Speaker A: Good morning and welcome to World Wide Wisdom. The story of Purim is actually a story of modern times. You're not talking about any overt miracles. You know what's deafening about Purim story is that the silence is deafening when it comes to the miracle. Like, what was the miracle of the Purim story? There wasn't any. There was no miracle that was explicit and overt in the Purim story. Because. Because Purim is just an unfolding of events. It's a. An unlikely palace intrigue where it just so happened to be this way, just happened to be that way, and something else. And the queen was there at the right time, at the right place. Bim bam. The Jews were saved. Purim is the hidden hand of God as opposed to other miracles which were explicit, clear and unmistakable. Now, if you've. Most people are familiar with the Purim story as the following executive summary, where Haman tried to commit genocide against the Jews. Queen Esther was in the palace, right? Mordechai had something to

## Working with the transcript

In [2]:
# load the transcript by id if necessary
# (the id below is the one printed by the speaker-label transcription cell)
job_id = '1212fad6-5dbf-41b7-95bb-2c8b32987e8c'

# Fetch the existing transcript for that job id.
transcript = aai.Transcript.get_by_id(job_id)
transcript.id  # shown as the cell result

'1212fad6-5dbf-41b7-95bb-2c8b32987e8c'

### Search for words

In [9]:
def convert_millis(millis):
    """Format a duration given in milliseconds as ``HH:MM:SS``.

    Fractions of a second are dropped; each field is zero-padded to at
    least two digits.
    """
    total_seconds = millis // 1000
    total_minutes, secs = divmod(total_seconds, 60)
    hrs, mins = divmod(total_minutes, 60)
    return f"{hrs:02}:{mins:02}:{secs:02}"


def find_words(query):
    """Print the start times of every occurrence of each word in ``query``.

    The query is split on whitespace and passed to ``transcript.word_search``
    (``transcript`` is the notebook-level transcript). One line is printed per
    matched word: the word, then its start times as HH:MM:SS joined by "; ".
    """
    for hit in transcript.word_search(query.split()):
        print(f'{hit.text}: ', end='')
        stamps = [convert_millis(begin) for begin, _ in hit.timestamps]
        print('; '.join(stamps))


def find_sequence(query):
    """Print every place where the words of ``query`` occur consecutively.

    The query words are lower-cased and looked up with
    ``transcript.word_search`` (``transcript`` is the notebook-level
    transcript). An occurrence of the first word counts as a hit only when
    each following query word sits at the next position, in order. For every
    hit one line is printed: the start time of the first word as HH:MM:SS,
    then the matching words as they appear in ``transcript.words``. When there
    is no hit, ``No match for "<query>"`` is printed.

    Args:
        query: Whitespace-separated words, in the order they must appear.

    Returns:
        None; results are printed.
    """
    ordered_words = [word.lower().strip() for word in query.split()]
    if not ordered_words:
        # Nothing to search for; avoid indexing into an empty word list.
        print(f'No match for "{query}"')
        return

    matches = transcript.word_search(ordered_words)

    # Positions of each searched word, kept as sets for fast adjacency checks.
    positions = {}
    for match in matches:
        positions.setdefault(match.text, set()).update(match.indexes)

    first_match = next(
        (match for match in matches if match.text == ordered_words[0]),
        None,
    )
    if first_match is None:
        print(f'No match for "{query}"')
        return

    # Keep the start time of each first-word occurrence that is followed by
    # the remaining query words at consecutive positions.
    starts = {
        timestamp[0]
        for index, timestamp in zip(first_match.indexes, first_match.timestamps)
        if all(
            index + offset in positions.get(word, ())
            for offset, word in enumerate(ordered_words)
        )
    }
    if not starts:
        print(f'No match for "{query}"')
        return

    # Locate each hit in transcript.words by its start time and print the
    # whole phrase with the transcript's own casing and punctuation.
    length = len(ordered_words)
    for i, word in enumerate(transcript.words):
        if word.start in starts:
            phrase = ' '.join(w.text for w in transcript.words[i:i + length])
            print(f'{convert_millis(word.start)} {phrase}')
    return

In [12]:
# Example: list the start times of a single word.
query = 'Apropos'
find_words(query)

apropos: 00:17:44


In [15]:
# Example: find where two words occur one right after the other.
query = 'Jewish homeland'
find_sequence(query)

00:30:03 Jewish homeland


In [7]:
# Inspect the raw word-search matches (text, count, timestamps, indexes).
# NOTE: with the assignment below commented out, this reuses whatever value
# `query` currently holds from an earlier cell.
# query = 'overrule God'
matches = transcript.word_search(query.split())
matches

[WordSearchMatch(text='3319', count=4, timestamps=[(547608, 549260), (567468, 568156), (746346, 747106), (1399256, 1399808)], indexes=[1582, 1641, 2144, 4284])]

In [20]:
# Display the word-level entries of the transcript (text, start, end, confidence, ...).
transcript.words

[Word(text='Good', start=1440, end=1552, confidence=0.99349, speaker=None, channel=None),
 Word(text='morning', start=1552, end=1736, confidence=0.99996, speaker=None, channel=None),
 Word(text='and', start=1760, end=1912, confidence=0.99584, speaker=None, channel=None),
 Word(text='welcome', start=1936, end=2136, confidence=0.67908, speaker=None, channel=None),
 Word(text='to', start=2168, end=2312, confidence=0.99951, speaker=None, channel=None),
 Word(text='Worldwide', start=2336, end=2776, confidence=0.54569, speaker=None, channel=None),
 Word(text='Wisdom.', start=2808, end=3192, confidence=0.98676, speaker=None, channel=None),
 Word(text='Folks.', start=3256, end=3544, confidence=0.99498, speaker=None, channel=None),
 Word(text='Today', start=3592, end=3800, confidence=0.99831, speaker=None, channel=None),
 Word(text="we're", start=3840, end=4008, confidence=0.97077, speaker=None, channel=None),
 Word(text='learning', start=4024, end=4264, confidence=0.99088, speaker=None, channe