In [45]:
import os
import json
from youtube_transcript_api import YouTubeTranscriptApi


# Destination for the processed transcript, relative to the notebook's
# working directory. expanduser() only rewrites a leading "~", so this
# relative path passes through unchanged; the call is kept so that a
# home-relative path (e.g. "~/data/transcript.json") can be dropped in.
file_path = os.path.expanduser(
    "../data/external/transcript.json"
)
video_id = "Kbk9BiPhm7o"
podcast_name = "Elon Musk: Neuralink and the Future of Humanity | Lex Fridman Podcast"

# Create the parent directory if it doesn't exist. A bare filename has an
# empty dirname and os.makedirs("") raises, so only create when non-empty.
output_dir = os.path.dirname(file_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

In [53]:
def get_transcript(video_id):
    """Fetch the transcript for a YouTube video, or None if the fetch fails.

    Delegates to ``YouTubeTranscriptApi.get_transcript`` and hands its result
    back untouched. Any exception raised by that call is reported on stdout
    and swallowed, so callers must be prepared for a ``None`` return.
    """
    try:
        return YouTubeTranscriptApi.get_transcript(video_id)
    except Exception as e:
        # Best-effort: report the failure and fall through to the None return.
        print(f"Error retrieving transcript: {e}")
    return None

# Fetch the transcript for the configured video; the result is None if retrieval failed.
transcript = get_transcript(video_id)


def convert_time(seconds):
    """Format a duration in seconds as a zero-padded ``HH:MM:SS`` string.

    Args:
        seconds: Duration in seconds. Integers are used as-is; floats are
            truncated toward zero with ``int()`` before formatting.

    Returns:
        A string of the form ``HH:MM:SS``. Hours are not wrapped at 24 and
        simply grow wider than two digits for long durations.
    """
    # The ":02d" format code rejects floats, so normalise to int first.
    total_seconds = int(seconds)
    hours, remainder = divmod(total_seconds, 3600)
    minutes, secs = divmod(remainder, 60)
    return f"{hours:02d}:{minutes:02d}:{secs:02d}"


def process_transcript(data, podcast_name: str, batch_size: int = 20):
    """Merge consecutive transcript entries into larger, timestamped chunks.

    Args:
        data: Sequence of transcript entries. Each entry is indexed with the
            keys ``"text"`` and ``"start"``.
        podcast_name: Label copied into every merged chunk.
        batch_size: Number of consecutive entries merged into one chunk.
            Defaults to 20. The final chunk may contain fewer entries.

    Returns:
        A list of dicts, one per chunk, with the keys ``"podcast_name"``,
        ``"text"`` (the chunk's entry texts joined by single spaces) and
        ``"start"`` (the first entry's start, truncated to an int and
        formatted by ``convert_time``). Empty input yields an empty list.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # range() rejects a zero step and silently yields nothing for a negative
    # one; fail loudly for both instead.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    merged_data = []
    for i in range(0, len(data), batch_size):
        batch = data[i : i + batch_size]
        merged_data.append(
            {
                "podcast_name": podcast_name,
                "text": " ".join(d["text"] for d in batch),
                # The chunk is stamped with the start of its first entry.
                "start": convert_time(int(batch[0]["start"])),
            }
        )

    return merged_data

In [55]:
# Fetch the raw transcript and merge it into timestamped chunks.
transcript = get_transcript(video_id)
if transcript is None:
    # get_transcript reports failures by printing and returning None; stop
    # here with a clear message instead of failing later on len(None).
    raise RuntimeError(f"No transcript available for video {video_id!r}")
modified_transcript = process_transcript(transcript, podcast_name)

In [48]:
# Number of merged chunks produced by process_transcript.
len(modified_transcript)

649

In [56]:
# Write the merged transcript to disk as pretty-printed JSON.
# ensure_ascii=False emits non-ASCII characters verbatim, so the file is
# opened as UTF-8 explicitly instead of relying on the platform default
# encoding, which is not UTF-8 on every system.
with open(file_path, "w", encoding="utf-8") as f:
    json.dump(modified_transcript, f, indent=4, ensure_ascii=False)


In [52]:
# Preview the first lines of the written JSON file (IPython shell escape).
!head ../data/external/transcript.json

[
    {
        "text": "the following is a conversation with Elon Musk DJ sa Matthew McDougall Bliss Chapman and Nolan arbaugh about neuralink and the future of humanity Elon DJ Matthew and Bliss are of course part of the amazing neuralink team and Nolan is the first human to have a neuralink device implanted in his brain I speak with each of them individually so use time stamps to jump around or as I recommend go hardcore and listen to the whole thing this is the longest podcast I've ever done it's a fascinating super technical and wide- ranging conversation and I loved every minute of it and now dear friends here's Elon Musk his fifth time on this The Lex fredman podcast drinking coffee or water water I'm so over caffeinated right now",
        "start": "00:00:00"
    },
    {
        "text": "do you want some caffeine I mean sure there's a there a Nitro drink this supposed to keep you up till like you know tomorrow afternoon basically yeah I don't so what does Nitro it's just got a