In [2]:
!pip install pocketflow>=0.0.1
!pip install requests>=2.28.0
!pip install beautifulsoup4>=4.11.0
!pip install youtube-transcript-api>=0.6.0
!pip install openai>=1.0.0
!pip install pyyaml>=6.0
!pip install anthropic>=0.5.0

In [4]:
# utils/call_llm.py
from anthropic import Anthropic

def call_llm(prompt):
    """Send a single user message to Claude and return the reply text.

    The API key is taken from the ``ANTHROPIC_API_KEY`` environment variable so
    that no credential has to be written into the notebook. When the variable
    is unset or empty, the original placeholder string is used, which keeps the
    previous behaviour for anyone who edits the key in place.

    :param prompt: Text sent as the content of one "user" message.
    :return: The ``text`` attribute of the first content item of the response.
    """
    import os  # local import so this cell does not depend on another cell's imports

    api_key = os.environ.get("ANTHROPIC_API_KEY") or "your_api_key"
    client = Anthropic(api_key=api_key)
    response = client.messages.create(
        model="claude-3-7-sonnet-20250219",
        max_tokens=3000,
        messages=[
            {"role": "user", "content": prompt}
        ]
    )
    return response.content[0].text

if __name__ == "__main__":
    # Smoke test: send one fixed prompt through call_llm and print the reply.
    test_prompt = "Hello, how are you?"
    response = call_llm(test_prompt)
    print(f"Test successful. Response: {response}")

Test successful. Response: I'm doing well, thank you for asking! I'm here and ready to help with any questions or topics you'd like to discuss. How are you today?


In [3]:
# utils/youtube_processor.py
import re
import requests
from bs4 import BeautifulSoup
from youtube_transcript_api import YouTubeTranscriptApi

def extract_video_id(url):
    """Extract the 11-character YouTube video ID from a URL.

    Recognised forms are a ``v=`` query parameter, a ``youtu.be/<id>`` short
    link, and the path prefixes ``/embed/``, ``/shorts/``, ``/live/``, ``/v/``,
    ``/vi/`` and ``/e/``. The ID must be exactly 11 characters from
    ``[0-9A-Za-z_-]``; a longer run of those characters is rejected rather
    than truncated.

    :param url: URL string to inspect.
    :return: The video ID, or None if ``url`` is not a string or no ID is found.
    """
    if not isinstance(url, str):
        return None

    # Anchoring on known prefixes (instead of any "/") avoids treating channel
    # IDs or arbitrary path segments as video IDs; the negative lookahead
    # enforces the 11-character length.
    pattern = (
        r'(?:[?&]v=|youtu\.be/|/(?:embed|shorts|live|v|vi|e)/)'
        r'([0-9A-Za-z_-]{11})(?![0-9A-Za-z_-])'
    )
    match = re.search(pattern, url)
    return match.group(1) if match else None

def get_video_info(url, timeout=10):
    """Get video title, transcript and thumbnail.

    :param url: YouTube video URL.
    :param timeout: Seconds passed to ``requests.get`` so a stalled connection
        cannot block indefinitely.
    :return: On success, a dict with keys "title", "transcript",
        "thumbnail_url" and "video_id". On failure (no video ID found, or any
        exception raised while fetching), a dict ``{"error": <message>}``.
    """
    video_id = extract_video_id(url)
    if not video_id:
        return {"error": "Invalid YouTube URL"}

    try:
        # Get title using BeautifulSoup
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.text, 'html.parser')
        title_tag = soup.find('title')
        if title_tag is None:
            # Report a readable error instead of the AttributeError text.
            raise ValueError("Could not find a <title> tag in the video page")
        title = title_tag.text.strip()
        # Remove only a trailing " - YouTube"; an occurrence in the middle of
        # the title is part of the title itself.
        suffix = " - YouTube"
        if title.endswith(suffix):
            title = title[:-len(suffix)]

        # Get thumbnail
        thumbnail_url = f"https://img.youtube.com/vi/{video_id}/maxresdefault.jpg"

        # Get transcript
        if hasattr(YouTubeTranscriptApi, "get_transcript"):
            transcript_list = YouTubeTranscriptApi.get_transcript(video_id)
        else:
            # NOTE(review): newer youtube-transcript-api releases appear to have
            # removed the static get_transcript in favour of an instance-level
            # fetch(); confirm against the installed version.
            transcript_list = YouTubeTranscriptApi().fetch(video_id).to_raw_data()
        transcript = " ".join([entry["text"] for entry in transcript_list])

        return {
            "title": title,
            "transcript": transcript,
            "thumbnail_url": thumbnail_url,
            "video_id": video_id
        }
    except Exception as e:
        # Any failure is reported through the "error" key rather than raised.
        return {"error": str(e)}

if __name__ == "__main__":
    # Manual check: fetch one known video and print a short summary of each field.
    test_url = "https://www.youtube.com/watch?v=_1f-o0nqpEI&t"
    result = get_video_info(test_url)
    # .get() is used for every field, so an {"error": ...} result prints
    # None / an empty transcript instead of raising KeyError.
    print(f"Title: {result.get('title')}")
    # Only the first 150 characters of the transcript are shown.
    print(f"Transcript: {result.get('transcript', '')[:150]}...")
    print(f"Thumbnail URL: {result.get('thumbnail_url')}")
    print(f"Video ID: {result.get('video_id')}")

Title: DeepSeek, China, OpenAI, NVIDIA, xAI, TSMC, Stargate, and AI Megaclusters | Lex Fridman Podcast #459
Transcript: - The following is a conversation with Dylan Patel and Nathan Lambert. Dylan runs SemiAnalysis,
a well-respected research and analysis company that
sp...
Thumbnail URL: https://img.youtube.com/vi/_1f-o0nqpEI/maxresdefault.jpg
Video ID: _1f-o0nqpEI


In [10]:
# utils/html_generator.py
def html_generator(title, image_url, sections):
    """
    Generates an HTML string with a handwriting style using Tailwind CSS.

    The page title, image URL, section titles and the bold text of each bullet
    are HTML-escaped before insertion. The regular text of each bullet is
    inserted as-is so that it may carry markup such as <b> or <ol>.

    :param title: Main title for the page ("Title 1").
    :param image_url: URL of the image to be placed below the main title.
    :param sections: A list of dictionaries (None is treated as empty), each containing:
        {
            "title": str (Title for the section e.g. "Title 2"),
            "bullets": [
                ("bold_text", "regular_text"),
                ("bold_text_2", "regular_text_2"),
                ...
            ]
        }
    :return: A string of HTML content.
    """
    from html import escape  # local import so this cell stands on its own

    safe_title = escape(str(title))
    # quote=True also escapes quotes, since this value sits inside an attribute.
    safe_image_url = escape(str(image_url), quote=True)

    # Start building the HTML; fragments are collected and joined once at the end.
    parts = [f"""<!DOCTYPE html>
<html lang=\"en\">
<head>
  <meta charset=\"UTF-8\" />
  <meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\" />
  <title>Youtube Made Simple</title>
  <!-- Using Tailwind CSS CDN -->
  <link
    rel=\"stylesheet\"
    href=\"https://unpkg.com/tailwindcss@2.2.19/dist/tailwind.min.css\"
  />
  <!-- Google Font for a handwriting style -->
  <link rel=\"preconnect\" href=\"https://fonts.gstatic.com\" />
  <link
    href=\"https://fonts.googleapis.com/css2?family=Patrick+Hand&display=swap\"
    rel=\"stylesheet\"
  />
  <style>
    body {{
      background-color: #f7fafc;
      font-family: 'Patrick Hand', sans-serif;
    }}
    h1, h2 {{
      font-weight: 700;
      margin-bottom: 0.5rem;
    }}
    ul {{
      list-style-type: disc;
      margin-left: 1.5rem;
      margin-bottom: 1.5rem;
    }}
    li {{
      margin-bottom: 1rem;
    }}
    ol {{
      list-style-type: decimal;
      margin-left: 2rem;
      margin-top: 0.5rem;
    }}
    ol li {{
      margin-bottom: 0.2rem;
    }}
    .bullet-content ol {{
      margin-top: 0.3rem;
      margin-bottom: 0.3rem;
    }}
  </style>
</head>
<body class=\"min-h-screen flex items-center justify-center p-4\">
  <div class=\"max-w-2xl w-full bg-white rounded-2xl shadow-lg p-6\">
    <!-- Attribution header -->
    <div class="mb-6 text-right text-gray-500 text-sm">
      Generated by
      <a href="https://github.com/The-Pocket/Tutorial-Youtube-Made-Simple"
         class="underline hover:text-gray-700">
        Youtube Made Simple
      </a>
    </div>

    <!-- Title 1 -->
    <h1 class=\"text-4xl text-gray-800 mb-4\">{safe_title}</h1>
    <!-- Image below Title 1 -->
    <img
      src=\"{safe_image_url}\"
      alt=\"Placeholder image\"
      class=\"rounded-xl mb-6\"
    />"""]

    # For each section, add a sub-title (Title 2, etc.) and bullet points.
    for section in sections or []:
        section_title = escape(str(section.get("title", "")))
        bullets = section.get("bullets") or []

        # Add the section's title (Title 2, Title 3, etc.)
        parts.append(f"""
    <h2 class=\"text-2xl text-gray-800 mb-4\">{section_title}</h2>
    <ul class=\"text-gray-600\">""")

        # Create list items for each bullet pair
        for bold_text, normal_text in bullets:
            # SECURITY NOTE: normal_text is deliberately inserted unescaped so
            # it can contain formatting tags; callers must only pass trusted or
            # sanitised HTML here.
            parts.append(f"""
      <li>
        <strong>{escape(str(bold_text))}</strong><br />
        <div class="bullet-content">{normal_text}</div>
      </li>""")

        parts.append("\n    </ul>")

    # Close the main container and body
    parts.append("""
  </div>
</body>
</html>""")

    return "".join(parts)

if __name__ == "__main__":
    # Demo: render a two-section sample page and save it as output.html.
    sections_data = [
        {
            "title": "Title 2",
            "bullets": [
                ("First line of bullet 1", "Additional normal text."),
                ("First line of bullet 2", "Another detail in normal weight."),
            ]
        },
        {
            "title": "Title 3",
            "bullets": [
                # The regular text may contain HTML, here an ordered list.
                ("First line of bullet 3", "More text in normal weight for bullet 3. <ol><li>1</li><li>2</li><li>3</li></ol>"),
            ]
        }
    ]
    html_content = html_generator("Title 1", "https://picsum.photos/600/300?grayscale", sections_data)
    # The page declares charset UTF-8, so write it as UTF-8 instead of relying
    # on the platform's default encoding.
    with open("output.html", "w", encoding="utf-8") as file:
        file.write(html_content)

In [8]:
# flow.py

from typing import List, Dict, Any, Tuple
import yaml
import logging
from pocketflow import Node, BatchNode, Flow

# Set up logging: INFO and above, each record formatted as
# "<timestamp> - <logger name> - <level> - <message>"
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
# Module-level logger used by the node classes defined below
logger = logging.getLogger(__name__)

# Define the specific nodes for the YouTube Content Processor

class ProcessYouTubeURL(Node):
    """Node that turns the URL held in the shared store into video information."""

    def prep(self, shared):
        """Read the "url" entry from shared (empty string when it is absent)."""
        return shared.get("url", "")

    def exec(self, url):
        """Look up the video's information.

        Raises ValueError when the URL is empty or when get_video_info
        reports an error.
        """
        if not url:
            raise ValueError("No YouTube URL provided")

        logger.info(f"Processing YouTube URL: {url}")
        info = get_video_info(url)

        # Success path first; an "error" key means the lookup failed.
        if "error" not in info:
            return info
        raise ValueError(f"Error processing video: {info['error']}")

    def post(self, shared, prep_res, exec_res):
        """Save the result under shared["video_info"] and log a short summary."""
        shared["video_info"] = exec_res

        video_title = exec_res.get('title')
        logger.info(f"Video title: {video_title}")

        transcript_text = exec_res.get('transcript', '')
        logger.info(f"Transcript length: {len(transcript_text)}")

        return "default"

class ExtractTopicsAndQuestions(Node):
    """Extract interesting topics and generate questions from the video transcript.

    One LLM call returns YAML listing topics and their questions. The parsed
    structure is validated inside exec so that a malformed reply fails here
    with a clear message instead of surfacing later in another node.
    """
    def prep(self, shared):
        """Get transcript and title from video_info"""
        video_info = shared.get("video_info", {})
        transcript = video_info.get("transcript", "")
        title = video_info.get("title", "")
        return {"transcript": transcript, "title": title}

    def exec(self, data):
        """Extract topics and generate questions using LLM.

        :param data: dict with "transcript" and "title" as built by prep.
        :return: list of at most 5 dicts, each
            {"title": ..., "questions": [{"original": str, "rephrased": "", "answer": ""}, ...]}.
        :raises ValueError: if the reply is not a YAML mapping, "topics" is not
            a list, a topic is not a mapping, or a topic's questions are not a
            list of strings.
        """
        transcript = data["transcript"]
        title = data["title"]

        # Single prompt to extract topics and questions together
        prompt = f"""
You are an expert content analyzer. Given a YouTube video transcript, identify at most 5 most interesting topics discussed and generate at most 3 most thought-provoking questions for each topic.
These questions don't need to be directly asked in the video. It's good to have clarification questions.

VIDEO TITLE: {title}

TRANSCRIPT:
{transcript}

Format your response in YAML:

```yaml
topics:
  - title: |
        First Topic Title
    questions:
      - |
        Question 1 about first topic?
      - |
        Question 2 ...
  - title: |
        Second Topic Title
    questions:
        ...
```
        """

        response = call_llm(prompt)

        # Extract YAML content: text inside the first ```yaml fence, or the
        # whole reply when no fence is present.
        yaml_content = response.split("```yaml")[1].split("```")[0].strip() if "```yaml" in response else response

        parsed = yaml.safe_load(yaml_content)
        # safe_load returns None for empty input and may return a scalar or
        # list; only a mapping can carry the "topics" key.
        if not isinstance(parsed, dict):
            raise ValueError("LLM response is not a YAML mapping with a 'topics' key")

        raw_topics = parsed.get("topics", [])
        if not isinstance(raw_topics, list):
            raise ValueError("'topics' in the LLM response is not a list")

        # Ensure we have at most 5 topics
        raw_topics = raw_topics[:5]

        # Format the topics and questions for our data structure
        result_topics = []
        for topic in raw_topics:
            if not isinstance(topic, dict):
                raise ValueError("A topic in the LLM response is not a mapping")

            topic_title = topic.get("title", "")
            # A "questions:" key with no value parses as None; treat it like a
            # missing key, i.e. no questions.
            raw_questions = topic.get("questions") or []
            # Later stages use each question as a dict key and as prompt text,
            # so anything other than a string (e.g. an unquoted question with
            # a colon parsed as a mapping) is rejected here.
            if not isinstance(raw_questions, list) or not all(isinstance(q, str) for q in raw_questions):
                raise ValueError(f"Questions for topic {topic_title!r} are not a list of strings")

            # Create a complete topic with questions
            result_topics.append({
                "title": topic_title,
                "questions": [
                    {
                        "original": q,
                        "rephrased": "",
                        "answer": ""
                    }
                    for q in raw_questions
                ]
            })

        return result_topics

    def post(self, shared, prep_res, exec_res):
        """Store topics with questions in shared"""
        shared["topics"] = exec_res

        # Count total questions
        total_questions = sum(len(topic.get("questions", [])) for topic in exec_res)

        logger.info(f"Extracted {len(exec_res)} topics with {total_questions} questions")
        return "default"

class ProcessContent(BatchNode):
    """Process each topic for rephrasing and answering.

    prep builds one batch item per topic, exec asks the LLM for a rephrased
    title plus rephrased questions and answers, and post merges those results
    back into shared["topics"].
    """
    def prep(self, shared):
        """Return list of topics for batch processing"""
        topics = shared.get("topics", [])
        video_info = shared.get("video_info", {})
        transcript = video_info.get("transcript", "")

        # Every item carries the full transcript alongside its topic.
        return [{"topic": topic, "transcript": transcript} for topic in topics]

    def exec(self, item):
        """Process a topic using LLM.

        :param item: dict with "topic" and "transcript" as built by prep.
        :return: {"title": original title, "rephrased_title": ..., "questions": [dict, ...]}.
        :raises ValueError: if the reply is not a YAML mapping or its
            "questions" entry is not a list of mappings.
        """
        topic = item["topic"]
        transcript = item["transcript"]

        topic_title = topic["title"]
        questions = [q["original"] for q in topic["questions"]]

        prompt = f"""You are a content simplifier for children. Given a topic and questions from a YouTube video, rephrase the topic title and questions to be clearer, and provide simple ELI5 (Explain Like I'm 5) answers.

TOPIC: {topic_title}

QUESTIONS:
{chr(10).join([f"- {q}" for q in questions])}

TRANSCRIPT EXCERPT:
{transcript}

For topic title and questions:
1. Keep them catchy and interesting, but short

For your answers:
1. Format them using HTML with <b> and <i> tags for highlighting.
2. Prefer lists with <ol> and <li> tags. Ideally, <li> followed by <b> for the key points.
3. Quote important keywords but explain them in easy-to-understand language (e.g., "<b>Quantum computing</b> is like having a super-fast magical calculator")
4. Keep answers interesting but short

Format your response in YAML:

```yaml
rephrased_title: |
    Interesting topic title in 10 words
questions:
  - original: |
        {questions[0] if len(questions) > 0 else ''}
    rephrased: |
        Interesting question in 15 words
    answer: |
        Simple answer that a 5-year-old could understand in 100 words
  - original: |
        {questions[1] if len(questions) > 1 else ''}
    ...
```
        """

        response = call_llm(prompt)

        # Extract YAML content: text inside the first ```yaml fence, or the
        # whole reply when no fence is present.
        yaml_content = response.split("```yaml")[1].split("```")[0].strip() if "```yaml" in response else response

        parsed = yaml.safe_load(yaml_content)
        # Validate here rather than in post, so a malformed reply fails inside
        # exec. NOTE(review): presumably pocketflow retries exec on exceptions
        # (the nodes are built with max_retries=2) - confirm.
        if not isinstance(parsed, dict):
            raise ValueError("LLM response is not a YAML mapping")

        # `or` also covers keys that are present but empty (parsed as None).
        rephrased_title = parsed.get("rephrased_title") or topic_title
        processed_questions = parsed.get("questions") or []
        if not isinstance(processed_questions, list) or not all(
            isinstance(q, dict) for q in processed_questions
        ):
            raise ValueError("'questions' in the LLM response is not a list of mappings")

        return {
            "title": topic_title,
            "rephrased_title": rephrased_title,
            "questions": processed_questions
        }

    def post(self, shared, prep_res, exec_res_list):
        """Update topics with processed content in shared.

        Questions are matched between the stored topics and the LLM output by
        their original text with whitespace normalised, so differences in
        trailing newlines or spacing do not discard an answer. Questions that
        still cannot be matched are logged and left unchanged.
        """
        topics = shared.get("topics", [])

        def _norm(text):
            # Collapse every run of whitespace and trim both ends.
            return " ".join(str(text).split())

        # Map of original topic title to processed content
        title_to_processed = {
            result["title"]: result
            for result in exec_res_list
        }

        # Update the topics with processed content
        for topic in topics:
            processed = title_to_processed.get(topic["title"])
            if processed is None:
                continue

            # Update topic with rephrased title
            topic["rephrased_title"] = processed["rephrased_title"]

            # Map of normalised original question to processed question;
            # entries without usable "original" text are skipped.
            orig_to_processed = {}
            for processed_q in processed["questions"] or []:
                if not isinstance(processed_q, dict):
                    continue
                key = _norm(processed_q.get("original") or "")
                if key:
                    orig_to_processed[key] = processed_q

            # Update each question
            for q in topic["questions"]:
                original = q["original"]
                processed_q = orig_to_processed.get(_norm(original))
                if processed_q is None:
                    logger.warning(f"No processed answer matched question: {_norm(original)}")
                    continue
                # `or` guards against keys that are present but empty (None).
                q["rephrased"] = processed_q.get("rephrased") or original
                q["answer"] = processed_q.get("answer") or ""

        # Update shared with modified topics
        shared["topics"] = topics

        logger.info(f"Processed content for {len(exec_res_list)} topics")
        return "default"

class GenerateHTML(Node):
    """Generate HTML output from processed content.

    Builds one HTML section per topic that has at least one answered question,
    renders the page with html_generator and writes it to output.html.
    """
    def prep(self, shared):
        """Get video info and topics from shared"""
        video_info = shared.get("video_info", {})
        topics = shared.get("topics", [])

        return {
            "video_info": video_info,
            "topics": topics
        }

    def exec(self, data):
        """Generate HTML using html_generator.

        :param data: dict with "video_info" and "topics" as built by prep.
        :return: The rendered HTML string.
        """
        video_info = data["video_info"]
        topics = data["topics"]

        title = video_info.get("title", "YouTube Video Summary")
        thumbnail_url = video_info.get("thumbnail_url", "")

        # Prepare sections for HTML
        sections = []
        for topic in topics:
            # Skip topics without questions
            if not topic.get("questions"):
                continue

            # Use rephrased_title if available, otherwise use original title.
            # `or` (not a .get default) so an empty or None value also falls back.
            section_title = topic.get("rephrased_title") or topic.get("title", "")

            # Prepare bullets for this section
            bullets = []
            for question in topic.get("questions", []):
                # Use rephrased question if available, otherwise use original.
                # The "rephrased" key is created as "" upstream, so a .get
                # default alone would never reach the original text.
                q = question.get("rephrased") or question.get("original", "")
                a = question.get("answer") or ""

                # Only add bullets if both question and answer have content
                if str(q).strip() and str(a).strip():
                    bullets.append((q, a))

            # Only include section if it has bullets
            if bullets:
                sections.append({
                    "title": section_title,
                    "bullets": bullets
                })

        # Generate HTML
        html_content = html_generator(title, thumbnail_url, sections)
        return html_content

    def post(self, shared, prep_res, exec_res):
        """Store HTML output in shared and write it to output.html"""
        shared["html_output"] = exec_res

        # Write HTML to file. The page declares charset UTF-8, so encode it as
        # UTF-8 instead of relying on the platform's default encoding.
        with open("output.html", "w", encoding="utf-8") as f:
            f.write(exec_res)

        logger.info("Generated HTML output and saved to output.html")
        return "default"

# Create the flow
def create_youtube_processor_flow():
    """Build the four processing nodes, chain them in order, and return the Flow.

    The chain is: ProcessYouTubeURL, ExtractTopicsAndQuestions,
    ProcessContent, GenerateHTML. The Flow starts at the first of these.
    """
    # Every node is constructed with the same retry settings.
    retry_settings = {"max_retries": 2, "wait": 10}

    process_url = ProcessYouTubeURL(**retry_settings)
    extract_topics_and_questions = ExtractTopicsAndQuestions(**retry_settings)
    process_content = ProcessContent(**retry_settings)
    generate_html = GenerateHTML(**retry_settings)

    # Wire the nodes together in pipeline order.
    process_url >> extract_topics_and_questions >> process_content >> generate_html

    return Flow(start=process_url)

In [11]:

# Enter YouTube URL to process:
url = "https://www.youtube.com/watch?v=JN3KPFbWCy8"

logger.info(f"Starting YouTube content processor for URL: {url}")

# Create flow
flow = create_youtube_processor_flow()

# Initialize shared memory; ProcessYouTubeURL.prep reads the "url" key from it
shared = {
    "url": url
}

# Run the flow; the nodes add "video_info", "topics" and "html_output" to
# `shared`, and GenerateHTML.post writes output.html
flow.run(shared)

# Report success and output file location
# (these lines are only reached if flow.run did not raise)
print("\n" + "=" * 50)
print("Processing completed successfully!")
print(f"Output HTML file: 'output.html'")
print("=" * 50 + "\n")


Processing completed successfully!
Output HTML file: 'output.html'

