In [1]:
!pip install youtube-transcript-api

Collecting youtube-transcript-api
  Downloading youtube_transcript_api-0.6.3-py3-none-any.whl.metadata (17 kB)
Downloading youtube_transcript_api-0.6.3-py3-none-any.whl (622 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m622.3/622.3 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: youtube-transcript-api
Successfully installed youtube-transcript-api-0.6.3


In [2]:
# imports

import os

import requests
from dotenv import load_dotenv
from IPython.display import Markdown, display

from youtube_transcript_api import YouTubeTranscriptApi
import re

import ollama

In [3]:
# Constants
# Name of the model handed to ollama.chat() by the summarization helpers in this notebook.
MODEL = "llama3.2"

In [4]:
class YoutubeVideoID:
    """
    Parses a YouTube URL and exposes the 11-character video ID it refers to.

    Attributes:
        url: the URL exactly as it was passed to the constructor.
        video_id: the video ID extracted from ``url``.

    The constructor raises ValueError when no video ID can be extracted.
    """

    # Anchored at the start of the (whitespace-stripped) URL. Recognised forms:
    #   - youtu.be/<id>
    #   - youtube.com/embed/<id>, /shorts/<id>, /live/<id>, /v/<id>
    #   - youtube.com/...?v=<id> or ...&v=<id> (the `v` parameter may appear anywhere)
    #   - youtube.com/<segment>/<...>/<id> (legacy multi-segment paths)
    # The scheme and a www./m./music. host prefix are optional.
    _VIDEO_ID_PATTERN = re.compile(
        r"(?:https?://)?(?:(?:www|m|music)\.)?"
        r"(?:"
        r"youtube(?:-nocookie)?\.com/"
        r"(?:(?:embed|shorts|live|v)/|\S*?[?&]v=|[^/\n\s]+/\S+/)"
        r"|youtu\.be/"
        r")"
        r"([a-zA-Z0-9_-]{11})"
    )

    def __init__(self, url):
        self.url = url
        self.video_id = self.extract_video_id(url)

    def extract_video_id(self, url):
        """
        Extracts the YouTube video ID from a given URL.

        Supports regular watch URLs (with the ``v`` parameter in any position),
        shortened youtu.be URLs, and embed/shorts/live URLs. Leading and trailing
        whitespace is ignored.

        :param url: str, the URL to parse
        :return: str, the 11-character video ID
        :raises ValueError: if ``url`` is not a string or is not a recognised YouTube URL
        """
        if not isinstance(url, str):
            raise ValueError("Invalid YouTube URL")

        match = self._VIDEO_ID_PATTERN.match(url.strip())
        if match is None:
            raise ValueError("Invalid YouTube URL")
        return match.group(1)

    def __str__(self):
        return f"Video ID: {self.video_id}"

In [5]:
# Example usage: parse a full watch URL and print the extracted ID
video_url = "https://www.youtube.com/watch?v=kqaMIFEz15s"

# The constructor raises ValueError if no video ID can be extracted from the URL
yt_video = YoutubeVideoID(video_url)
print(yt_video)

Video ID: kqaMIFEz15s


In [6]:
def _format_timestamp(seconds):
    """Formats an offset in seconds as m:ss, or as h:mm:ss from one hour upwards."""
    hours, remainder = divmod(int(seconds), 3600)
    minutes, secs = divmod(remainder, 60)
    if hours:
        return f"{hours}:{minutes:02d}:{secs:02d}"
    return f"{minutes}:{secs:02d}"


def get_transcript(video_id, language='en', include_timestamps=False):
    """
    Fetches the transcript of a video and flattens it into a single string.

    :param video_id: str, the YouTube video ID
    :param language: str, language code of the transcript to request (default 'en')
    :param include_timestamps: bool, when True every transcript snippet is prefixed with
        its start time as "[m:ss]" (or "[h:mm:ss]"), so that downstream prompts asking
        for start/end times have real timing data to work from (default False)
    :return: str, the snippets joined by single spaces, or None if the transcript
        could not be fetched (the error is printed)
    """
    try:
        # Request the transcript in the desired language (English by default)
        transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=[language])
        if include_timestamps:
            # Each item carries its start offset in seconds under the 'start' key
            parts = (f"[{_format_timestamp(item['start'])}] {item['text']}" for item in transcript)
        else:
            parts = (item['text'] for item in transcript)
        # Join all the snippets into a single string
        return " ".join(parts)
    except Exception as e:
        # Best effort: report the failure and let the caller decide how to handle None
        print(f"Error fetching transcript: {e}")
        return None


In [7]:
def split_text(text, chunk_size=3000):
    """
    Splits large text into chunks of at most ``chunk_size`` characters.

    Each chunk is cut right after the last full stop that fits, so chunks end on a
    sentence boundary where possible. If the window contains no full stop, the cut
    falls back to just after the last whitespace character so that words are not
    split; only when there is no whitespace either is the text cut at exactly
    ``chunk_size`` characters.

    Chunks produced inside the loop are not stripped; the final remainder is stripped
    and is dropped entirely if it contains only whitespace.

    :param text: str, the text to be split
    :param chunk_size: int, maximum size of each chunk (default 3000 characters)
    :return: list of str, where each str is a chunk of text
    :raises ValueError: if chunk_size is not positive (the loop could never make progress)
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    chunks = []
    remaining = text
    while len(remaining) > chunk_size:
        # Only look inside the window, so a chunk (period included) never exceeds chunk_size
        window = remaining[:chunk_size]

        # rfind returns -1 when nothing is found, so `cut` is 0 in that case
        cut = window.rfind('.') + 1
        if cut == 0:
            # No sentence boundary: prefer a word boundary over cutting mid-word
            cut = max(window.rfind(ws) for ws in (' ', '\n', '\t')) + 1
        if cut == 0:
            # No boundary of any kind: hard split at the size limit
            cut = chunk_size

        chunks.append(remaining[:cut])
        remaining = remaining[cut:]

    # Add the remaining text as the final chunk, unless it is empty once stripped
    tail = remaining.strip()
    if tail:
        chunks.append(tail)

    return chunks

In [16]:
# Summarize one transcript chunk with the local Ollama model
def summarize_text(text):
    """
    Asks the chat model named by MODEL (via ollama.chat) to summarize ``text``.

    :param text: the text to summarize; it is interpolated into the user message
    :return: the content of the model's reply, or None if any step failed
        (the error is printed)
    """
    try:
        instructions = """
        You're a very concise assistant that go through Youtube Video transcripts and generate a list of relevant chapters with their matching keyframes time (starttime and endtime), so the users can then jump to the section that are interesting to them.
        
        - Capture the key points of the content.
        - Keep the summary brief and easy to understand.
        - Avoid summarizing overly lengthy texts or breaking them into excessively short summaries.
        - Use bullet points where appropriate to enhance clarity and structure.
        - precise clearly their starttime and endtime in the video
        """
        user_request = f"Summarize the following text:\n{text}"
        conversation = [
            {"role": "system", "content": instructions},
            {"role": "user", "content": user_request},
        ]

        reply = ollama.chat(model=MODEL, messages=conversation)
        return reply['message']['content']
    except Exception as err:
        # Best effort: report the failure and signal it to the caller with None
        print(f"Error summarizing text: {err}")
        return None

In [9]:
# Video to process; yt_video.video_id is what the transcript is fetched with
video_url = "https://www.youtube.com/watch?v=kqaMIFEz15s"
yt_video = YoutubeVideoID(video_url)

In [10]:
# Fetch transcript using the video ID
transcript_text = get_transcript(yt_video.video_id)

# get_transcript() reports failures by returning None; stop here with a clear message
# instead of letting len(None) raise an unrelated TypeError
if transcript_text is None:
    raise RuntimeError("Could not fetch a transcript for this video; see the error printed above.")
print(len(transcript_text))

16073


In [18]:
# Build a chapter timeline from the per-chunk summaries with the local Ollama model
def generate_chapters(summaries):
    """
    Asks the chat model named by MODEL (via ollama.chat) to turn summaries into a
    list of titled sections with start and end times.

    :param summaries: the summaries to work from; interpolated into the user message
    :return: the content of the model's reply, or None if any step failed
        (the error is printed)
    """
    try:
        system_prompts = """
        You take a list of summaries from a video with their associated startime and endtime, and find an accurate timeline for the video the the user can jump to different sections.
        
        - Find an accurate title for each section you come up with.
        - Specify a starttime and and entime with bullet points.
        - you do not need to have the same number of section as the summaries that are passed to you.
        """
        messages = [
            {"role": "system", "content": system_prompts},
            {"role": "user", "content": f"Find the different sections from these summaries:\n{summaries}"}
        ]

        response = ollama.chat(model=MODEL, messages=messages)
        return response['message']['content']
    except Exception as e:
        # Best effort: report the failure and signal it to the caller with None.
        # The message names this step so it is not confused with a summarization error.
        print(f"Error generating chapters: {e}")
        return None

In [20]:
transcript_chunks = split_text(transcript_text)

# Summarize every chunk individually. summarize_text() returns None when the model
# call fails, so keep only real summaries (a None would break the join below).
summaries = []
for chunk in transcript_chunks:
    summary = summarize_text(chunk)
    if summary is not None:
        summaries.append(summary)

skipped = len(transcript_chunks) - len(summaries)
if skipped:
    print(f"Warning: {skipped} of {len(transcript_chunks)} chunks could not be summarized")
if not summaries:
    raise RuntimeError("No transcript chunk could be summarized; see the errors printed above.")

# Combine the individual summaries into one and derive the chapter list from it
full_summary = " ".join(summaries)
chapters = generate_chapters(full_summary)
if chapters is None:
    raise RuntimeError("Chapter generation failed; see the error printed above.")
display(Markdown(chapters))


Here is a summary of each section:

**Section 1: Reviewing Past Predictions**

* Title: "Reviewing Past Predictions"
* Starttime: 0:30
* Endtime: 3:45
* Keyframe Times:
	+ Chapter 1 review (0:30 - 1:15)
	+ Past predictions (1:15 - 2:00)
	+ Deep fakes (2:00 - 3:45)

**Section 2: Implications of Generative AI on Cybersecurity**

* Title: "Implications of Generative AI on Cybersecurity"
* Starttime: 0:45
* Endtime: 12:45
* Keyframe Times:
	+ Prompt injection attacks (0:45 - 2:00)
	+ Lack of defenses (3:30 - 5:00)
	+ Cybersecurity using Generative AI (6:15 - 9:00)

**Section 3: Quantum Computers and Cryptography**

* Title: "Quantum Computers and Cryptography"
* Starttime: 10:30
* Endtime: 12:45
* Keyframe Times:
	+ Quantum computers posing a threat to traditional cryptography (10:30 - 11:15)
	+ Researchers working on quantum-safe cryptographic methods (11:15 - 12:00)

**Section 4: Action Required**

* Title: "Action Required"
* Starttime: 12:45
* Endtime: 13:50
* Keyframe Times:
	+ Organizations need to start working on quantum-safe cryptography (12:45 - 14:30)
	+ Developing projects to convert to new quantum-safe cryptography (14:30 - 15:20)

**Section 5: Additional Resources**

* Title: "Additional Resources"
* Starttime: 16:00
* Endtime: 17:10
* Keyframe Times:
	+ Watching videos on the IBM Technology Channel for deeper dives into these topics (16:00 - 17:10)