In [1]:
from openai import OpenAI
from pydantic import BaseModel
import json
import numpy as np
from tqdm.auto import tqdm
from dotenv import load_dotenv
load_dotenv()

openai_client = OpenAI()

In [2]:
from pathlib import Path

from tqdm.auto import tqdm
from minsearch import Index

import docs

## Set up the dataset

In [3]:
data_folder = Path('../data_cache/youtube_videos/')
data_files = sorted(data_folder.glob("*.txt"))

In [5]:
len(data_files)

190

In [6]:
data_files[:2]

[WindowsPath('../data_cache/youtube_videos/-Gj7SaI-QW4.txt'),
 WindowsPath('../data_cache/youtube_videos/-HbQQ_bVdfE.txt')]

In [8]:
data_files[0].name

'-Gj7SaI-QW4.txt'

In [10]:
data_files[0].read_text(encoding='utf-8')[:500]

'0:00 everyone Welcome to our event this event\n0:02 is brought to you by datadox club which\n0:04 is a community of people who love data\n0:05 we have weekly events and today is one\n0:07 of such events if you want to find out\n0:09 more about the events we have there is a\n0:11 link in the description go there check\n0:13 it out and see what you like do not\n0:16 forget to subscribe to our YouTube\n0:17 channel this way you will get notified\n0:20 about amazing live streams like we have\n0:23 today and we'

In [11]:
documents = []

for f in tqdm(data_files):
    filename = f.name
    video_id, _ = filename.split('.')
    content = f.read_text(encoding='utf-8')
    chunks = docs.sliding_window(content, size=3000, step=1500)

    for chunk in chunks:
        chunk['video_id'] = video_id
        documents.append(chunk)

  0%|          | 0/190 [00:00<?, ?it/s]

In [13]:
documents[:2]

[{'start': 0,
  'content': "0:00 everyone Welcome to our event this event\n0:02 is brought to you by datadox club which\n0:04 is a community of people who love data\n0:05 we have weekly events and today is one\n0:07 of such events if you want to find out\n0:09 more about the events we have there is a\n0:11 link in the description go there check\n0:13 it out and see what you like do not\n0:16 forget to subscribe to our YouTube\n0:17 channel this way you will get notified\n0:20 about amazing live streams like we have\n0:23 today and we have a very cool slack\n0:26 Community where you can hang out with\n0:27 other data enthusiasts and one of the\n0:31 things we have in slack is an amazing\n0:33 initiative Adonis is one of those who\n0:37 organize a lot of things there called\n0:39 project of the week and we're just\n0:41 finishing wrapping up\n0:43 a week about learning rust so check it\n0:46 out it's an amazing Community project of\n0:48 the week is really a cool thing if you\n0:50 don't

In [14]:
len(documents)

7923

## Index the documents

In [15]:
from minsearch import Index

index = Index(
    text_fields=["content"],
    keyword_fields=["video_id"]
)

index.fit(documents)

<minsearch.minsearch.Index at 0x1f1a56eeae0>

## Agent tools

In [16]:
# Define the search function with proper type annotations:

In [17]:
from typing import Any, Dict, List, TypedDict

class SearchResult(TypedDict):
    """Represents a single search result entry."""
    start: int
    content: str
    video_id: str


def search(query: str) -> List[SearchResult]:
    """
    Search the index for documents matching the given query.

    Args:
        query (str): The search query string.

    Returns:
        List[SearchResult]: A list of search results. Each result dictionary contains:
            - start (int): The starting position or offset within the source file.
            - content (str): A text excerpt or snippet containing the match.
            - video_id (str): YouTube video ID for the snippet.
    """
    return index.search(
        query=query,
        num_results=5,
    )

# Simple Agent

In [86]:
from pydantic_ai import Agent

instructions = """
Your role is to explore the topic provided by the user as deep as possible. 
Use the search function for that, and then based on the search results, 
create more queries to explore relevant topics.
""".strip()

agent_tools = [search]

agent = Agent(
    name="search",
    instructions=instructions,
    tools=agent_tools,
    model='gpt-4o-mini'
)

In [87]:
results = await agent.run(user_prompt='how do I make money with AI?')
print(results.output)

### Overview of Making Money with AI

1. **Skills Needed for AI Consulting**:
   - **Technical Proficiency**: Understanding data analysis, machine learning algorithms, and proficiency in programming languages like Python or R.
   - **Business Acumen**: Ability to connect AI capabilities with business needs, understanding how AI can solve real-world problems.
   - **Implementation Skills**: Being hands-on and able to not just recommend solutions but also implement them effectively for clients.
   - **Communication**: Strong interpersonal skills to explain technical concepts to non-technical stakeholders, including writing clear reports and delivering presentations.
   - **Project Management**: Ability to manage various projects simultaneously, ensuring they stay on track and meet client needs.
  
2. **Trending AI Applications in Various Industries**:
   - **Healthcare**: AI is increasingly used for diagnostics, treatment planning, and managing patient care through predictive analytics.


# Track tool call

In [88]:
for message in results.new_messages():
    print(message.kind)

    for part in message.parts:
        print(part.part_kind)
        if part.part_kind in ['tool-call', 'tool-return']:
            print(part)

    print()

request
user-prompt

response
tool-call
ToolCallPart(tool_name='search', args='{"query":"ways to make money with AI"}', tool_call_id='call_Om9AMHjxJL5pWcvCX2POPBOh')

request
tool-return
ToolReturnPart(tool_name='search', content=[{'start': 22500, 'content': "at kind of\n22:43 money you need for the next 12 to 18\n22:45 months for example it depends so I want\n22:48 to hire I don't know two three\n22:50 developers I want to hire\n22:52 whatever I need this amount amount of\n22:54 money for that I want to make some\n22:56 marketing is I don't know experiments I\n22:58 need some money for that so at all I\n22:59 need XYZ type of money\n23:02 um and then I go out with this I need\n23:05 this because all the investors is going\n23:06 to ask you what do you need that money\n23:08 for and if you don't have a good answer\n23:09 to that that's not a good sign\n23:12 um if you don't know what you what\n23:13 you're raising for so and and then yeah\n23:16 the the investors have their process and

In [89]:
# To see what's happening while we're waiting for query execution, let's add an event handler to see tool calls

from pydantic_ai.messages import FunctionToolCallEvent, FunctionToolResultEvent

async def print_function_calls(ctx, event):
    # Detect nested streams
    if hasattr(event, "__aiter__"):
        async for sub in event:
            await print_function_calls(ctx, sub)
        return

    if isinstance(event, FunctionToolCallEvent):
        print("TOOL CALL:", event.part.tool_name, event.part.args)

    if isinstance(event, FunctionToolResultEvent):
        print("TOOL RESULT:", event.result.tool_name)

In [None]:
# To see what's happening while we're waiting for query execution, let's add an event handler to see tool calls

from pydantic_ai.messages import FunctionToolCallEvent, FunctionToolResultEvent

async def print_function_calls_mod(ctx, event):
    # Detect nested streams
    if hasattr(event, "__aiter__"):
        async for sub in event:
            await print_function_calls(ctx, sub)
        return

    if isinstance(event, FunctionToolCallEvent):
        print("CALL →", event.part.tool_name, event.part.args_as_dict(), event.tool_call_id)
    elif isinstance(event, FunctionToolResultEvent):
        print("RES  ←", event.result.tool_name, event.tool_call_id, event.result.content)

In [91]:
question = 'how do I get into machine learning?'

results = await agent.run(
    user_prompt=question,
    event_stream_handler=print_function_calls
)

TOOL CALL: search {"query":"how to get started in machine learning"}
TOOL RESULT: search


In [93]:
question = 'how do I get into data engineering?'

results = await agent.run(
    user_prompt=question,
    event_stream_handler=print_function_calls_mod
)

TOOL CALL: search {"query":"how to get into data engineering"}
TOOL RESULT: search


In [94]:
results.new_messages()

[ModelRequest(parts=[UserPromptPart(content='how do I get into data engineering?', timestamp=datetime.datetime(2025, 10, 27, 3, 48, 40, 181724, tzinfo=datetime.timezone.utc))], instructions='Your role is to explore the topic provided by the user as deep as possible. \nUse the search function for that, and then based on the search results, \ncreate more queries to explore relevant topics.'),
 ModelResponse(parts=[ToolCallPart(tool_name='search', args='{"query":"how to get into data engineering"}', tool_call_id='call_strvFYVP66TMxQ2MHaZOtMgF')], usage=RequestUsage(input_tokens=180, output_tokens=18, details={'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}), model_name='gpt-4o-mini-2024-07-18', timestamp=datetime.datetime(2025, 10, 27, 3, 48, 40, tzinfo=TzInfo(UTC)), provider_name='openai', provider_details={'finish_reason': 'tool_calls'}, provider_response_id='chatcmpl-CV8VkPv8kkZulmKdY2JXrWcFOCtEf', finish_reason='tool_call'),


In [95]:
from pydantic_ai.messages import ToolCallPart, ToolReturnPart

def print_tool_activity(res):
    seen = False
    for msg in res.new_messages():                    # only this run's messages
        for part in getattr(msg, "parts", []):        # inspect each part
            if isinstance(part, ToolCallPart):
                print(f"CALL → {part.tool_name} {part.args_as_dict()} (id={part.tool_call_id})")
                seen = True
            elif isinstance(part, ToolReturnPart):
                print(f"RES  ← {part.tool_name} (id={part.tool_call_id}) -> {part.content}")
                seen = True
    if not seen:
        print("(no tool calls in this run)")


In [96]:
print_tool_activity(results)

CALL → search {'query': 'how to get into data engineering'} (id=call_strvFYVP66TMxQ2MHaZOtMgF)
RES  ← search (id=call_strvFYVP66TMxQ2MHaZOtMgF) -> [{'start': 4500, 'content': "he the standard tools\n4:51 didn't work anymore\n4:53 so i need to find solutions different\n4:57 solutions to actually\n4:58 yeah work this out and back then i\n5:01 started with hadoop so\n5:03 uh hadoop was really the the thing back\n5:06 then\n5:07 and uh that really turned out really\n5:10 good\n5:11 and so that's how i got into into the\n5:13 into the field back then\n5:15 big data now now data engineering data\n5:17 science\n5:18 i became then i switched basically\n5:22 to data engineer became a team lead for\n5:25 data engineering\n5:28 this year i started leading a data lab\n5:31 but since this month i'm basically went\n5:34 full time with\n5:35 teaching data engineering with my with\n5:37 my academy at learndataengineering.com\n5:40 okay so yeah yeah that's how i got here\n5:43 yeah we'll talk a bit\n5:

In [52]:
print(results.output)

Getting into machine learning (ML) can be a fulfilling endeavor, but it requires a systematic approach. Here are some essential steps and considerations for those looking to embark on this journey:

### 1. **Understand the Basics**
   - **Foundational Knowledge**: Familiarize yourself with the fundamental concepts of machine learning, including supervised learning, unsupervised learning, and reinforcement learning.
   - **Mathematical Foundations**: Develop a solid grasp of key mathematical concepts such as linear algebra, calculus, probability, and statistics, as they are crucial for understanding ML algorithms.

### 2. **Learn Programming Skills**
   - **Programming Language**: Python is the most widely used language in ML, so focusing on Python would be beneficial. Start with basic syntax and gradually move to more complex libraries like NumPy, Pandas, and Scikit-learn.
   - **Projects**: Engage in small coding projects to apply your learning. For instance, try automating tasks usin

In [45]:
results.new_messages()[1]

ModelResponse(parts=[ToolCallPart(tool_name='search', args='{"query":"how to get into machine learning"}', tool_call_id='call_w8rYQI8zdyY2pGaHRRyX55FM')], usage=RequestUsage(input_tokens=180, output_tokens=18, details={'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}), model_name='gpt-4o-mini-2024-07-18', timestamp=datetime.datetime(2025, 10, 27, 2, 47, 32, tzinfo=TzInfo(UTC)), provider_name='openai', provider_details={'finish_reason': 'tool_calls'}, provider_response_id='chatcmpl-CV7Ya1w49Sj32zYzqxS3nWvEc0IbP', finish_reason='tool_call')

In [46]:
results.new_messages()[2]

ModelRequest(parts=[ToolReturnPart(tool_name='search', content=[{'start': 19500, 'content': "n front of\n17:51 other people and speak their language so\n17:54 use their terminology and then you kind\n17:56 of become I bi or like they treat you as\n18:01 one of their own right so then this way\n18:02 you get respect so these things that we\n18:05 discussed they're pretty interesting to\n18:08 discuss like how related they are to\n18:12 successful machine learning\n18:15 projects uh very very uh well\n18:18 intertwined um\n18:21 so especially with machine learning uh\n18:24 like uh you know any kind of project but\n18:26 specifically machine learning one of the\n18:28 challenges with machine learning is just\n18:30 how complex it is and how many how much\n18:33 support you need from every part of the\n18:35 business for machine learning to work\n18:37 you know any software developer will\n18:39 tell you like getting support for a new\n18:41 project is difficult and that's\n18:43 absolute

In [55]:
results.usage()


RunUsage(input_tokens=20668, output_tokens=872, details={'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, requests=3, tool_calls=4)

# Improve search depth (deep research)

In [56]:
#After consulting with ChatGPT, we came up with an improved version for the prompt:

In [57]:
instructions = """
You are a deep research agent exploring topics using a proprietary podcast/video database.

GOAL

Given a user question, perform a structured multi-stage exploration to deeply understand the topic and all relevant adjacent ideas.

PROCESS

Stage 1 — Initial Search

- Take the user's question and perform 1-2 broad searches.
- Summarize the main insights from the top results.
- Note related subtopics or recurring ideas.

Stage 2 — Expansion

- Generate 5 targeted follow-up search queries based on the Stage 1 insights.
    Example: If the user asks "how to make money with AI", follow-ups might be:
    - "AI startup business models"
    - "freelancing with AI tools"
    - "AI side hustles"
    - "ethical considerations in AI monetization"
    - "AI job opportunities"
- Perform each search and summarize key takeaways with references.

Stage 3 — Deep Dive

- Based on findings so far, generate 5 even deeper or contrasting queries.
    These might cover debates, frameworks, case studies, or expert insights.
- Perform these searches and extract detailed insights.


OUTPUT FORMAT

At the end, output a structured summary:

**Main Question:** <original question>  
**Overview:** A 1-paragraph synthesis of the key ideas.

**Stage 1 Findings:**  
- Bullet summaries + [reference links]

**Stage 2 Expansions:**  
- Subtopic summaries + [reference links]

**Stage 3 Deep Dives:**  
- In-depth findings or nuanced perspectives + [reference links]

**References:**  
List clickable YouTube links in this format:  
[Title](https://www.youtube.com/watch?v=<video_id>&t=<timestamp>s)

RULES

- Always use `search()` to gather evidence before summarizing.
- Derive each new query from the content of previous results.
- Only use information returned by search() as references.
- Always include at least 5 unique searches.
- Prefer quality and diversity over repetition.
""".strip()

In [58]:
agent_tools = [search]

agent = Agent(
    name="search",
    instructions=instructions,
    tools=agent_tools,
    model='gpt-4o-mini'
)
question = 'how do I get into machine learning?'

results = await agent.run(
    user_prompt=question,
    event_stream_handler=print_function_calls
)

TOOL CALL: search {"query":"how to get into machine learning"}


In [62]:
results

AgentRunResult(output="**Main Question:** How do I get into machine learning?  \n**Overview:** To start a career in machine learning, one must cultivate a blend of programming skills, mathematical knowledge, and domain understanding. It's critical to begin with foundational skills, especially in Python, explore basic models, and gradually dive into more intricate concepts. Learning how to communicate the value of machine learning projects to stakeholders is also important, as is staying updated with industry trends and advancements.\n\n**Stage 1 Findings:**  \n- **Skill Sets Needed**: Successful machine learning professionals need strong coding skills (typically in Python), a solid grasp of mathematical concepts (like statistics), and the ability to communicate effectively with non-technical stakeholders. [Video - Various discussions on machine learning] (https://www.youtube.com/watch?v=su2M058m3Lw)\n- **Project Complexity**: Machine learning projects generally require more resources a

In [61]:
for msg in results.new_messages():
    if isinstance(msg, FunctionToolCallEvent):
        # In 1.6.0, the call info lives under .part
        tool_name = msg.part.tool_name
        tool_args = msg.part.args_as_dict()      # safe dict of args
        call_id   = msg.tool_call_id             # stable ID to match result
        print(f"CALL → {tool_name} {tool_args} (id={call_id})")

    elif isinstance(msg, FunctionToolResultEvent):
        # Result info is on .result
        tool_name = msg.results.tool_name
        call_id   = msg.tool_call_id
        # .content usually carries your tool's return payload (modelled as a Part)
        tool_return = msg.results.content
        print(f"RES  ← {tool_name} (id={call_id}) -> {tool_return}")

## Use structured Output to force stages

In [97]:
from pydantic import BaseModel, Field
from typing import List

class Reference(BaseModel):
    """Citations that directly tie each claim to a verifiable source."""
    quote: str = Field(..., description="A short, verbatim quote (2–4 sentences) from the database snippet.")
    youtube_id: str = Field(..., description="Video ID")
    timestamp: str = Field(..., description="Timestamp to the exact position in the video where the quote is, 'h:mm:ss' or 'mm:ss' format.")

class Keyword(BaseModel):
    """Research results for a specific keyword"""
    search_keyword: str = Field(..., description="Exact keyword used for search.")
    summary: str = Field(..., description="Short summary of the search result.")
    references: List[Reference] = Field(..., description="Specific references to help us track the findings of the research.")
    relevance_summary: str = Field(..., description="1 sentence for each reference explainig how it supports the keyword's summary — ensure factual consistency.")
    other_ideas: str = Field(..., description="Free-form description of related or complimentary ideas to explore in next stages.")

class StageReport(BaseModel):
    """Summarizes what was found during a single exploration stage."""
    stage: int = Field(..., description="Stage number (1 for initial search, 2 for expansion, 3 for deep dive).")
    keywords: List[Keyword] = Field(..., description="Search keywords ")
    summary: str = Field(..., description="A concise synthesis of insights found in this stage, summarizing themes and discoveries from all queries executed in the stage.")

class Claim(BaseModel):
    """A factual statement supported by one specific reference."""
    description: str = Field(..., description=(
        "A short paragraph (3–4 sentences) that paraphrases the meaning of the quote in your own words. "
        "It must stay faithful to the factual content of the quote — no speculation or extrapolation."
    ))
    relevance_check: str = Field(..., description=(
        "1–2 sentences explaining *why* this quote supports the claim — a brief justification to ensure factual grounding."
    ))
    reference: Reference = Field(..., description=(
        "A direct quote that explicitly supports or demonstrates the statement made in 'description'. "
        "The claim should be a paraphrase or interpretation of this quote."
    ))

class ArticleSection(BaseModel):
    """One thematic part of the final article, containing multiple claims."""
    title: str = Field(..., description="A concise section title summarizing the theme.")
    claims: List[Claim] = Field(..., description="3–4 claims that explore different aspects of this section's theme.")

class ActionPoint(BaseModel):
    """Practical takeaways from the research."""
    point: str = Field(..., description="A concrete recommendation, insight, or action derived from the research.")
    relevance_check: str = Field(..., description="Explain how the referenced quote supports this action point — must show logical connection, not assumption.")
    reference: Reference = Field(..., description="Source supporting this action point.")

class Article(BaseModel):
    """The final synthesized output — a structured article summarizing all research stages."""
    title: str = Field(..., description="Compelling headline summarizing the topic and main insight (7-10 words).")
    introduction: str = Field(..., description="A short overview (3-4 paragraphs) explaining what the research explored and why it matters.")
    sections: List[ArticleSection] = Field(..., description="5-8 well-structured sections presenting grouped claims by topic.")
    action_points: List[ActionPoint] = Field(..., description="Optional 3-5 key insights or recommendations derived from the findings.")
    conclusion: str = Field(..., description="Final synthesis paragraph summarizing the broader takeaways and closing thoughts.")

class ResearchReport(BaseModel):
    """The complete record of exploration across all stages, culminating in the final article."""
    stages: List[StageReport] = Field(..., description="Exploration stage reports (Stage 1–3) detailing the search process.")
    article: Article = Field(..., description="The final article.")

## Ground result to keep relevance and check 

In [99]:
instructions = """
You are a deep research agent exploring topics using a proprietary podcast/video database.

Given a user question, perform a structured, multi-stage exploration to understand
the topic deeply and comprehensively through the database.

## DATA SOURCE

- You can only use the results from the `search()` function.
- Each search result includes `video_id` and snippet text.
- All references must link to YouTube URLs derived from the database and contain a quote
- Do not create, infer, or guess podcast names, titles, or timestamps.

## PROCESS

Stage 1 — Initial Search

1. Use the user's question as the first query with `search()`.
2. Summarize the most relevant insights from the results.
3. Identify key ideas, recurring themes, or related questions.

Stage 2 — Expansion

1. Generate 5-7 follow-up queries that explore related subtopics or complementary ideas.
2. For each query, call `search()` again.
3. Summarize the main insights from each result.

Stage 3 — Deep Dive

1. From the Stage 2 findings, generate 5-7 deeper or contrasting exploration queries.
2. For each, call `search()` again and summarize findings.
3. At the end of Stage 3, write an article that describes everything you discovered.

## Exploration rules

You are not allowed to stop until you perform at least 11 queries:

- 1 initial query for stage 1
- 5-7 follow up queries for stage 2
- 5-7 deeper exploration queries for stage 3

## References

When generating a claim or action point:

- Read the reference quote carefully.
- Write the claim as a faithful paraphrase or inference strictly supported by the quote.
- After each claim, provide a 1–2 sentence "relevance_check" explaining why the quote supports it.
- Do not generalize or introduce new facts not mentioned in the quote.

## Article

- The resulting article should contain an introduction, 5-8 sections and a conclusion.
- Each section should present 3-4 claims (backed by references) grouped by topics
- Each claim should be a paragraph with 3-4 sentences.
"""

agent = Agent(
    name="search",
    instructions=instructions,
    tools=agent_tools,
    model='gpt-4o-mini',
    output_type=ResearchReport
)

In [100]:
results = await agent.run(
    user_prompt=question,
    event_stream_handler=print_function_calls_mod
)

TOOL CALL: search {"query":"how to get into data engineering"}
TOOL RESULT: search
TOOL CALL: search {"query": "skills needed for data engineering"}
TOOL CALL: search {"query": "data engineering career progression"}
TOOL CALL: search {"query": "data engineering certifications"}
TOOL CALL: search {"query": "data engineering tools and technologies"}
TOOL CALL: search {"query": "data engineering job roles and responsibilities"}
TOOL RESULT: search
TOOL RESULT: search
TOOL RESULT: search
TOOL RESULT: search
TOOL RESULT: search
TOOL CALL: search {"query": "top data engineering courses"}
TOOL CALL: search {"query": "data engineering interview preparation tips"}
TOOL CALL: search {"query": "tools for data engineering beginners"}
TOOL CALL: search {"query": "common mistakes in data engineering"}
TOOL RESULT: search
TOOL RESULT: search
TOOL RESULT: search
TOOL RESULT: search


In [101]:
print_tool_activity(results)

CALL → search {'query': 'how to get into data engineering'} (id=call_Zil0GOKowxqCGw93Es6wWqeD)
RES  ← search (id=call_Zil0GOKowxqCGw93Es6wWqeD) -> [{'start': 4500, 'content': "he the standard tools\n4:51 didn't work anymore\n4:53 so i need to find solutions different\n4:57 solutions to actually\n4:58 yeah work this out and back then i\n5:01 started with hadoop so\n5:03 uh hadoop was really the the thing back\n5:06 then\n5:07 and uh that really turned out really\n5:10 good\n5:11 and so that's how i got into into the\n5:13 into the field back then\n5:15 big data now now data engineering data\n5:17 science\n5:18 i became then i switched basically\n5:22 to data engineer became a team lead for\n5:25 data engineering\n5:28 this year i started leading a data lab\n5:31 but since this month i'm basically went\n5:34 full time with\n5:35 teaching data engineering with my with\n5:37 my academy at learndataengineering.com\n5:40 okay so yeah yeah that's how i got here\n5:43 yeah we'll talk a bit\n5:

In [106]:
results.output

ResearchReport(stages=[StageReport(stage=1, keywords=[Keyword(search_keyword='how to get into data engineering', summary='The rise of interest in data engineering corresponds with its increasing necessity for automating data processes and building data pipelines.', references=[Reference(quote='...now I need to automate it now I need to build something around it how to actually bring this into production...', youtube_id='IrZPAG6OBqo', timestamp='7:01'), Reference(quote='...when you look back a few years... people were starting with data science... at some point... they realized... okay this is what data engineering...', youtube_id='IrZPAG6OBqo', timestamp='7:36')], relevance_summary='The quotes highlight the growing recognition that data engineering is essential for automating data processes and integrating them into production, reflecting the evolving demands in the field.', other_ideas='Exploring different pathways into data engineering, including educational resources, coding skills,

In [108]:
report= results.output

In [109]:
for stage in report.stages:
    for kw in stage.keywords:
        print(kw.search_keyword)
    print()

how to get into data engineering

skills needed for data engineering
data engineering career progression
data engineering certifications
data engineering job roles and responsibilities

top data engineering courses
data engineering interview preparation tips
tools for data engineering beginners
common mistakes in data engineering



In [110]:
report.article

Article(title='Navigating a Career in Data Engineering: Insights and Pathways', introduction='Data engineering has emerged as a critical field in the tech industry, responding to the ever-growing need for efficient data management and processing. Aspiring data engineers must understand the skill sets, career trajectories, and requisite knowledge areas to establish a foothold in this evolving domain. This article synthesizes insights gathered through podcasts and discussions with experienced professionals, emphasizing actionable advice on how to enter and succeed in data engineering.', sections=[ArticleSection(title='Understanding Data Engineering: Skills and Competencies', claims=[Claim(description="A strong coding foundation is crucial for data engineers, with languages like Python and SQL considered essential. Familiarity with these programming languages enhances one's capability to manipulate and manage data efficiently, which is foundational in this field.", relevance_check='The qu

In [112]:
article = report.article

In [114]:
def to_link(reference) -> str:
    """
    Converts the timestamp to a YouTube URL with a proper time offset.
    Supports both 'h:mm:ss' and 'mm:ss' formats.
    """
    if not reference.timestamp:
        return f"https://www.youtube.com/watch?v={reference.youtube_id}"

    ts = reference.timestamp.strip()
    if not ts:
        return f"https://www.youtube.com/watch?v={reference.youtube_id}"

    parts = ts.split(":")

    try:
        parts = [int(p) for p in parts]
    except ValueError:
        return f"https://www.youtube.com/watch?v={reference.youtube_id}"

    if len(parts) == 3: # h:mm:ss
        hours, minutes, seconds = parts
    elif len(parts) == 2: # mm:ss
        hours, minutes, seconds = 0, parts[0], parts[1]
    elif len(parts) == 1:
        hours, minutes, seconds = 0, 0, parts[0]

    total_seconds = hours * 3600 + minutes * 60 + seconds
    return f"https://www.youtube.com/watch?v={reference.youtube_id}&t={total_seconds}s"

def diplay_reference(reference: Reference): 
    return f"[{reference.quote}]({to_link(reference)})" 


In [115]:
report = results.output

# Display stage-by-stage findings
for stage in report.stages:
    print('Stage:', stage.stage)
    for kw in stage.keywords:
        print('  keyword:', kw.search_keyword)
        print('  summary:', kw.summary)
        print('  references:', [diplay_reference(r) for r in kw.references])
    print(stage.summary)

# Display the final article
article = report.article
print('#', article.title)
print('## Introduction')
print(article.introduction)

for section in article.sections:
    print('##', section.title)
    for claim in section.claims:
        print(claim.description, '(', diplay_reference(claim.reference), ')')

print('## Action Points')
for action_point in article.action_points:
    print('*', action_point.point, diplay_reference(action_point.reference))

print('## Conclusion')
print(article.conclusion)

Stage: 1
  keyword: how to get into data engineering
  summary: The rise of interest in data engineering corresponds with its increasing necessity for automating data processes and building data pipelines.
  references: ['[...now I need to automate it now I need to build something around it how to actually bring this into production...](https://www.youtube.com/watch?v=IrZPAG6OBqo&t=421s)', '[...when you look back a few years... people were starting with data science... at some point... they realized... okay this is what data engineering...](https://www.youtube.com/watch?v=IrZPAG6OBqo&t=456s)']
The search results indicate that data engineering has become increasingly important in recent years as companies strive to automate data processes, requiring skill sets that bridge data science and engineering.
Stage: 2
  keyword: skills needed for data engineering
  summary: A successful data engineer should leverage strong coding skills, familiarity with data systems, and knowledge of various d