In [1]:
import os
import json
from youtube_transcript_api import YouTubeTranscriptApi
from transformers import is_torch_tpu_available

# Target video and the podcast title that goes with it.
video_id = "Kbk9BiPhm7o"
podcast_name = "Elon Musk: Neuralink and the Future of Humanity | Lex Fridman Podcast"

# Destination JSON file, given relative to the working directory.
# expanduser() only rewrites a leading "~", so this path passes through unchanged.
file_path = os.path.expanduser("../data/external/transcript.json")

# Make sure the folder that will hold the file is present before anything is written.
os.makedirs(os.path.split(file_path)[0], exist_ok=True)

In [2]:
def get_transcript(video_id):
    """Fetch the transcript for a YouTube video, or ``None`` if that fails.

    Args:
        video_id: Identifier of the video, forwarded unchanged to
            ``YouTubeTranscriptApi.get_transcript``.

    Returns:
        Whatever ``YouTubeTranscriptApi.get_transcript`` returns, or ``None``
        when any exception is raised. The error is printed, not re-raised.
    """
    try:
        fetched = YouTubeTranscriptApi.get_transcript(video_id)
    except Exception as err:
        # Best-effort lookup: report the problem and let the caller deal with None.
        print(f"Error retrieving transcript: {err}")
        return None
    else:
        return fetched

# Fetch the transcript for the configured video. get_transcript() returns None
# when retrieval fails, so check for None before using this value.
transcript = get_transcript(video_id)


def convert_time(seconds):
    """Format a duration in seconds as a zero-padded ``HH:MM:SS`` string.

    Args:
        seconds: Elapsed time in seconds. Fractional values are truncated
            toward zero before formatting, so floats are accepted as well
            as ints.

    Returns:
        A string such as ``"01:02:03"``. Hours are not wrapped at 24 and
        simply grow wider once they exceed two digits.
    """
    # The ``d`` format code rejects floats, so normalise to int up front.
    total_seconds = int(seconds)
    total_minutes, secs = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{secs:02d}"


def process_transcript(data, podcast_name: str, batch_size: int = 20):
    """Merge consecutive transcript entries into larger text chunks.

    Args:
        data: Sequence of entries; each entry is indexed by ``"text"`` (joined
            with spaces) and ``"start"`` (converted with ``int()`` and passed
            to ``convert_time``).
        podcast_name: Value stored under ``"podcast_name"`` in every chunk.
        batch_size: Number of consecutive entries merged into one chunk.
            Defaults to 20, the value that was previously hard-coded.

    Returns:
        A list of dicts with the keys ``"podcast_name"``, ``"text"`` and
        ``"start"``, where ``"start"`` is the formatted start time of the
        first entry in the chunk. An empty ``data`` yields an empty list.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # range() would raise on a zero step and silently produce nothing on a
    # negative one, so reject both with a clear message.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    merged_data = []
    for offset in range(0, len(data), batch_size):
        batch = data[offset : offset + batch_size]
        merged_data.append(
            {
                "podcast_name": podcast_name,
                "text": " ".join(entry["text"] for entry in batch),
                # The chunk is stamped with the start of its first entry.
                "start": convert_time(int(batch[0]["start"])),
            }
        )

    return merged_data

In [3]:
# Install/upgrade youtube-transcript-api, the package behind the
# `youtube_transcript_api` import used elsewhere in this notebook.
%pip install --upgrade --quiet  youtube-transcript-api

Note: you may need to restart the kernel to use updated packages.


In [4]:
!pip install --upgrade langchain



In [31]:
from langchain_community.document_loaders.youtube import TranscriptFormat
from langchain_community.document_loaders import YoutubeLoader

# Split the transcript into 30-second chunks and attach video metadata to each one.
loader = YoutubeLoader.from_youtube_url(
    "https://www.youtube.com/watch?v=Kbk9BiPhm7o",
    add_video_info=True,
    transcript_format=TranscriptFormat.CHUNKS,
    chunk_size_seconds=30,
)

# Load once and reuse the result: each loader.load() call fetches the data
# again, so calling it once per print statement just repeats the work.
chunk_docs = loader.load()

map_obj = map(repr, chunk_docs)
print(type(map_obj))  # map() is lazy; nothing has been converted yet

# Materialise the reprs so they can be indexed here and in later cells.
# This consumes map_obj, which is left exhausted afterwards.
map_list = list(map_obj)
print(map_list[0])

<class 'map'>
Document(metadata={'source': 'https://www.youtube.com/watch?v=Kbk9BiPhm7o&t=0s', 'title': 'Elon Musk: Neuralink and the Future of Humanity | Lex Fridman Podcast #438', 'description': 'Unknown', 'view_count': 3396277, 'thumbnail_url': 'https://i.ytimg.com/vi/Kbk9BiPhm7o/hq720.jpg', 'publish_date': '2024-08-02 00:00:00', 'length': 31054, 'author': 'Lex Fridman Podcast', 'start_seconds': 0, 'start_timestamp': '00:00:00'}, page_content='the following is a conversation with Elon Musk DJ sa Matthew McDougall Bliss Chapman and Nolan arbaugh about neuralink and the future of humanity Elon DJ Matthew and Bliss are of course part of the amazing neuralink team and Nolan is the first human to have a neuralink device implanted in his brain I speak with each of them individually so use time stamps to jump')


In [34]:
# Take the first Document from the current loader and show its str() and
# repr() forms, separated by one blank line.
doc = loader.load()[0]
print(str(doc), "", repr(doc), sep="\n")

page_content='the following is a conversation with Elon Musk DJ sa Matthew McDougall Bliss Chapman and Nolan arbaugh about neuralink and the future of humanity Elon DJ Matthew and Bliss are of course part of the amazing neuralink team and Nolan is the first human to have a neuralink device implanted in his brain I speak with each of them individually so use time stamps to jump' metadata={'source': 'https://www.youtube.com/watch?v=Kbk9BiPhm7o&t=0s', 'title': 'Elon Musk: Neuralink and the Future of Humanity | Lex Fridman Podcast #438', 'description': 'Unknown', 'view_count': 3396789, 'thumbnail_url': 'https://i.ytimg.com/vi/Kbk9BiPhm7o/hq720.jpg', 'publish_date': '2024-08-02 00:00:00', 'length': 31054, 'author': 'Lex Fridman Podcast', 'start_seconds': 0, 'start_timestamp': '00:00:00'}

Document(metadata={'source': 'https://www.youtube.com/watch?v=Kbk9BiPhm7o&t=0s', 'title': 'Elon Musk: Neuralink and the Future of Humanity | Lex Fridman Podcast #438', 'description': 'Unknown', 'view_count

In [32]:

# map_list is already a list of repr strings, so it can be indexed directly.
print(map_list[10])

Document(metadata={'source': 'https://www.youtube.com/watch?v=Kbk9BiPhm7o&t=300s', 'title': 'Elon Musk: Neuralink and the Future of Humanity | Lex Fridman Podcast #438', 'description': 'Unknown', 'view_count': 3396277, 'thumbnail_url': 'https://i.ytimg.com/vi/Kbk9BiPhm7o/hq720.jpg', 'publish_date': '2024-08-02 00:00:00', 'length': 31054, 'author': 'Lex Fridman Podcast', 'start_seconds': 300, 'start_timestamp': '00:05:00'}, page_content="100 or a thousand times faster the normal listen uh I'm pretty sure nobody in their right mind listens to me at 1X they listen at 2x so I I can only imagine what 10x would feel like or I could actually understand it I usually default to 1.5x um you can do 2x but well actually if I'm trying to go if I'm listening somebody to go to in like sort of 15 20 minute segments to go to sleep")


In [15]:
from langchain_community.document_loaders.youtube import TranscriptFormat
from langchain_community.document_loaders import YoutubeLoader

# Chunked loader (30-second chunks, with video metadata) for a second video.
loader = YoutubeLoader.from_youtube_url(
    "https://www.youtube.com/watch?v=QsYGlZkevEg",
    transcript_format=TranscriptFormat.CHUNKS,
    chunk_size_seconds=30,
    add_video_info=True,
)

# Print the repr of every chunk, with a blank line between consecutive chunks.
print(*map(repr, loader.load()), sep="\n\n")

Document(metadata={'source': 'https://www.youtube.com/watch?v=QsYGlZkevEg&t=0s', 'title': 'Pedro Pascal Monologue - SNL', 'description': 'Unknown', 'view_count': 1858662, 'thumbnail_url': 'https://i.ytimg.com/vi/QsYGlZkevEg/hq720.jpg', 'publish_date': '2023-02-04 00:00:00', 'length': 224, 'author': 'Saturday Night Live', 'start_seconds': 0, 'start_timestamp': '00:00:00'}, page_content='LADIES AND GENTLEMEN, PEDRO PASCAL! [ CHEERS AND APPLAUSE ] >> THANK YOU, THANK YOU.')

Document(metadata={'source': 'https://www.youtube.com/watch?v=QsYGlZkevEg&t=30s', 'title': 'Pedro Pascal Monologue - SNL', 'description': 'Unknown', 'view_count': 1858662, 'thumbnail_url': 'https://i.ytimg.com/vi/QsYGlZkevEg/hq720.jpg', 'publish_date': '2023-02-04 00:00:00', 'length': 224, 'author': 'Saturday Night Live', 'start_seconds': 30, 'start_timestamp': '00:00:30'}, page_content='THANK YOU VERY MUCH. I\'M SO EXCITED TO BE HERE. THANK YOU. I SPENT THE LAST YEAR SHOOTING A SHOW CALLED "THE LAST OF US" ON HBO. FO

In [20]:
# Show the "title" metadata entry of the first Document returned by the loader.
# Note: this calls loader.load() again rather than reusing an earlier result.
print(loader.load()[0].metadata["title"])

Pedro Pascal Monologue - SNL


In [13]:
from langchain_community.document_loaders.youtube import TranscriptFormat
from langchain_community.document_loaders import YoutubeLoader

# Build a loader with only add_video_info enabled (no transcript format or
# chunk size given), then print the list that load() returns.
loader = YoutubeLoader.from_youtube_url(
    "https://www.youtube.com/watch?v=QsYGlZkevEg",
    add_video_info=True,
)
print(loader.load())

[Document(metadata={'source': 'QsYGlZkevEg', 'title': 'Pedro Pascal Monologue - SNL', 'description': 'Unknown', 'view_count': 1858662, 'thumbnail_url': 'https://i.ytimg.com/vi/QsYGlZkevEg/hq720.jpg', 'publish_date': '2023-02-04 00:00:00', 'length': 224, 'author': 'Saturday Night Live'}, page_content='LADIES AND GENTLEMEN, PEDRO PASCAL! [ CHEERS AND APPLAUSE ] >> THANK YOU, THANK YOU. THANK YOU VERY MUCH. I\'M SO EXCITED TO BE HERE. THANK YOU. I SPENT THE LAST YEAR SHOOTING A SHOW CALLED "THE LAST OF US" ON HBO. FOR SOME HBO SHOES, YOU GET TO SHOOT IN A FIVE STAR ITALIAN RESORT SURROUNDED BY BEAUTIFUL PEOPLE, BUT I SAID, NO, THAT\'S TOO EASY. I WANT TO SHOOT IN A FREEZING CANADIAN FOREST WHILE BEING CHASED AROUND BY A GUY WHOSE HEAD LOOKS LIKE A GENITAL WART. IT IS AN HONOR BEING A PART OF THESE HUGE FRANCHISEs LIKE "GAME OF THRONES" AND "STAR WARS," BUT I\'M STILL GETTING USED TO PEOPLE RECOGNIZING ME. THE OTHER DAY, A GUY STOPPED ME ON THE STREET AND SAYS, MY SON LOVES "THE MANDALORIA

In [11]:
# Same video with add_video_info switched off, then print the list that
# load() returns.
loader = YoutubeLoader.from_youtube_url(
    "https://www.youtube.com/watch?v=QsYGlZkevEg",
    add_video_info=False,
)
print(loader.load())

[Document(metadata={'source': 'QsYGlZkevEg'}, page_content='LADIES AND GENTLEMEN, PEDRO PASCAL! [ CHEERS AND APPLAUSE ] >> THANK YOU, THANK YOU. THANK YOU VERY MUCH. I\'M SO EXCITED TO BE HERE. THANK YOU. I SPENT THE LAST YEAR SHOOTING A SHOW CALLED "THE LAST OF US" ON HBO. FOR SOME HBO SHOES, YOU GET TO SHOOT IN A FIVE STAR ITALIAN RESORT SURROUNDED BY BEAUTIFUL PEOPLE, BUT I SAID, NO, THAT\'S TOO EASY. I WANT TO SHOOT IN A FREEZING CANADIAN FOREST WHILE BEING CHASED AROUND BY A GUY WHOSE HEAD LOOKS LIKE A GENITAL WART. IT IS AN HONOR BEING A PART OF THESE HUGE FRANCHISEs LIKE "GAME OF THRONES" AND "STAR WARS," BUT I\'M STILL GETTING USED TO PEOPLE RECOGNIZING ME. THE OTHER DAY, A GUY STOPPED ME ON THE STREET AND SAYS, MY SON LOVES "THE MANDALORIAN" AND THE NEXT THING I KNOW, I\'M FACE TIMING WITH A 6-YEAR-OLD WHO HAS NO IDEA WHO I AM BECAUSE MY CHARACTER WEARS A MASK THE ENTIRE SHOW. THE GUY IS LIKE, DO THE MANDO VOICE, BUT IT\'S LIKE A BEDROOM VOICE. WITHOUT THE MASK, IT JUST SOUNDS

In [15]:
# Write the processed transcript to file_path as pretty-printed JSON.
# NOTE(review): `modified_transcript` is not assigned in any cell visible here.
# Presumably it is process_transcript(transcript, podcast_name); confirm and
# define it before this cell so the notebook survives Restart & Run All.
# ensure_ascii=False writes non-ASCII characters as-is, so the file is opened
# as UTF-8 explicitly instead of relying on the platform's default encoding.
with open(file_path, "w", encoding="utf-8") as f:
    json.dump(modified_transcript, f, indent=4, ensure_ascii=False)
