# Update Blog Data

This notebook demonstrates how to update the blog data and vector store when new blog posts are published. It uses the blog utility functions from `lets_talk.utils.blog`, then builds a retrieval agent on top of the refreshed vector store to check the results.

In [None]:
import sys
import os
from pathlib import Path
from dotenv import load_dotenv

load_dotenv()
import sys
import os

# Make the parent of the working directory importable (added once only).
package_root = os.path.abspath(os.path.join(os.getcwd(), "../"))
print(f"Adding package root to sys.path: {package_root}")
if sys.path.count(package_root) == 0:
    sys.path.append(package_root)


In [None]:
# Capture the starting directory only on the first run. Re-running this cell
# after the chdir below would otherwise read the already-changed working
# directory and climb two more levels.
if "notebook_dir" not in globals():
    notebook_dir = os.getcwd()
print(f"Current notebook directory: {notebook_dir}")
# Change the working directory to the project root, two levels above the notebook.
project_root = os.path.abspath(os.path.join(notebook_dir, "../../"))
print(f"Project root: {project_root}")
os.chdir(project_root)

## Update Blog Data Process

This process will:
1. Load existing blog posts
2. Process and update metadata
3. Create or update vector embeddings

In [None]:
from lets_talk.config import DATA_DIR,WEB_URLS, BASE_URL, BLOG_BASE_URL,DATA_DIR_PATTERN, QDRANT_COLLECTION,QDRANT_URL
# Lower-case aliases for the imported configuration constants.
data_dir, pattern = DATA_DIR, DATA_DIR_PATTERN
base_url, blog_base_url = BASE_URL, BLOG_BASE_URL
web_urls = WEB_URLS
#index_only_published_posts = True

# Echo the effective configuration so the run is self-describing.
print(f"Data directory: {data_dir}")
print(f"Data directory pattern: {pattern}")
print(f"Base URL: {base_url}")
print(f"Blog Base URL: {blog_base_url}")
print(f"Web URLs: {web_urls}")
# Ensure the data directory exists


In [None]:
from langchain_community.document_loaders import WebBaseLoader
import lets_talk.utils.blog as  blog_utils
# Load the blog posts that match the glob pattern inside the data directory.
docs = blog_utils.load_blog_posts(data_dir=data_dir, glob_pattern=pattern)
#TODO: implement `index_only_published_posts` to filter out unpublished posts
# Update each post's metadata using the data-dir prefix, base URLs and suffix.
docs_with_data = blog_utils.update_document_metadata(
    docs,
    data_dir_prefix=data_dir + '/',
    base_url=base_url,
    blog_base_url=blog_base_url,
    remove_suffix=pattern,
)

# Fetch the configured web pages and combine them with the blog posts.
loader = WebBaseLoader(web_urls)
web_docs = loader.load()
all_docs = docs_with_data + web_docs
print(f"Total documents loaded: {len(all_docs)}")

In [None]:
# Number of blog posts after the metadata update.
len(docs_with_data)

In [None]:
# Inspect the metadata of the first web page (raises IndexError if no web pages were loaded).
web_docs[0].metadata

In [None]:
# Print one line per document: its 'url' metadata when present, otherwise its 'source'.
for doc in all_docs:
    meta = doc.metadata
    link = meta['url'] if 'url' in meta else meta['source']
    print(f"URL: {link}")
   

In [None]:
from collections import Counter
import pandas as pd

# Install matplotlib with the %pip magic (quiet mode); note the version is not pinned.
%pip install matplotlib -q

# Report how many documents each loader produced.
print(f"Blog posts loaded: {len(docs_with_data)}")
print(f"Web pages loaded: {len(web_docs)}")
print(f"Total documents: {len(all_docs)}")

import matplotlib.pyplot as plt

# Label every document: sources under the data directory are blog posts,
# everything else (including documents without a 'source') is a web page.
source_types = [
    'Blog Post' if doc.metadata.get('source', '').startswith(data_dir) else 'Web Page'
    for doc in all_docs
]

# Tally the labels and put them in a two-column frame for plotting.
source_counter = Counter(source_types)
df_sources = pd.DataFrame(source_counter.items(), columns=['Source Type', 'Count'])

# Bar chart of the document counts per source type.
plt.figure(figsize=(10, 6))
plt.bar(df_sources['Source Type'], df_sources['Count'], color=['#1f77b4', '#ff7f0e'])
plt.title('Document Source Distribution')
plt.ylabel('Count')
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Write each count just above its bar.
for position, total in enumerate(df_sources['Count']):
    plt.text(position, total + 0.1, str(total), ha='center')

plt.tight_layout()
plt.show()

In [None]:
#split_docs = blog_utils.split_documents(all_docs, chunk_size=1000, chunk_overlap=200)
#split_docs[0]

In [None]:
from langchain.storage import InMemoryStore


In [None]:
# get list of sub subclass of InMemoryStore


In [None]:
from langchain.embeddings import init_embeddings
from langchain_experimental.text_splitter import SemanticChunker
from langchain_qdrant import QdrantVectorStore
# Embedding model addressed through the Ollama provider at the Docker host URL.
embedding_model = init_embeddings(
    "ollama:snowflake-arctic-embed2:latest",
    base_url="http://host.docker.internal:11434",
)

# Split all loaded documents with the semantic chunker (percentile breakpoints).
semantic_chunker = SemanticChunker(
    embedding_model,  # type: ignore
    breakpoint_threshold_type="percentile",
)
semantic_documents = semantic_chunker.split_documents(all_docs)

# Build the Qdrant vector store from the chunks, using the configured collection and URL.
vector_store = QdrantVectorStore.from_documents(
    semantic_documents,
    embedding=embedding_model,  # type: ignore
    collection_name=QDRANT_COLLECTION,
    url=QDRANT_URL,
    prefer_grpc=True,
)

In [None]:
# Inspect the second semantic chunk as a spot check.
semantic_documents[1]

In [None]:

from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.retrievers import EnsembleRetriever
from langchain_community.retrievers import BM25Retriever
from lets_talk.config import (LLM_MODEL, LLM_TEMPERATURE)
from langchain.chat_models import init_chat_model
from langchain_core.tools import tool
from lets_talk.utils import format_docs

# Chat model configured from the project settings.
model = init_chat_model(LLM_MODEL, temperature=LLM_TEMPERATURE)

# Vector-store retriever returning three hits per query, plus an LLM-backed
# multi-query wrapper around it.
retriever = vector_store.as_retriever(search_kwargs={"k": 3})
multi_query_retriever = MultiQueryRetriever.from_llm(llm=model, retriever=retriever)

# BM25 retriever built from the unsplit documents.
bm25_retriever = BM25Retriever.from_documents(all_docs)  # type: ignore

retriever_list = [bm25_retriever, multi_query_retriever]

# NOTE: computed for reference only; the ensemble below uses fixed weights.
equal_weighting = [1 / len(retriever_list) for _ in retriever_list]

ensemble_retriever = EnsembleRetriever(
    weights=[0.4, 0.6],  # Adjust weights as needed (same order as retriever_list)
    retrievers=retriever_list,
)

# The docstring below is what the `@tool` decorator exposes as the tool
# description, so its wording is model-facing text rather than a plain comment.
@tool
def retrive_documents(query: str) -> str:
    """Retrieve relevant documents from the knowledge base to answer user questions.
    
    Always use this tool to search for specific information, facts, or content
    that may be in the document collection. Provide a clear search query related to
    what information to find.
    
    Args:
        query: The search query to find relevant documents
        
    Returns:
        Formatted text containing the retrieved document content
    """
    # Query the BM25 + multi-query ensemble, then flatten the hits into one string.
    docs = ensemble_retriever.invoke(query) # type: ignore
    return format_docs(docs)

In [None]:
import logging
from lets_talk.config import RSS_URL
from lets_talk.tools.rss_feed_tool import RSSFeedTool
from lets_talk.agent_v2 import prompt
from langgraph.prebuilt import create_react_agent
# Tools offered to the agent: document retrieval always, the RSS feed only when a URL is configured.
tools = [retrive_documents]

#RSS_URL = 'https://thedataguy.pro/rss.xml'  # Replace with your actual RSS feed URL
if RSS_URL:
    logging.info(f"RSS URL is set to: {RSS_URL}")
    tools.append(RSSFeedTool(rss_url=RSS_URL))

# Rebinds `model` to a fixed OpenAI model for the agent.
model = init_chat_model(model="openai:gpt-4o-mini", temperature=LLM_TEMPERATURE)

agent = create_react_agent(model=model, tools=tools, prompt=prompt, version="v2")

 

In [None]:
from langchain_core.messages import HumanMessage
# Send a single question to the agent and keep the full response state.
response = agent.invoke({"messages":[HumanMessage(content="What is the latest blog post about?")]})

In [None]:
# Show the content of the last message in the response.
print(response['messages'][-1].content)

In [None]:
from langchain_core.messages import HumanMessage
from lets_talk.utils import get_message_text
# Questions sent to the agent one by one. Every entry needs its own trailing
# comma: adjacent string literals without one are silently joined into a
# single query.
queries = [
    "What is the latest blog post about?",
    "Can you summarize the latest blog post?",
    "What are the key points from the latest blog post?",
    "How does TheDataGuy standout in AI arena?",
    "what is the dataguy all about?",
    "whats his current job?",
    "Tell me about ai Makerspace",
    "Tell me about build ship and share",
    "give me linkedin profile url",
    "give me the latest blog post url",
    "How many industries served by TheDataGuy?",
    "What is relationship between TheDataGuy and AI Makerspace?",
    "What is relationship between TheDataGuy and Sunrise Technologies?",
]

# Ask the agent each query in turn, echoing the exchange and collecting the reply text.
replies = []
for query in queries:
    print(f"Query: {query}")
    response = agent.invoke({"messages": [HumanMessage(content=query)]})
    final_message = response["messages"][-1]
    reply = get_message_text(final_message)
    print(f"Reply: {reply}")
    replies.append(reply)
    print("-" * 80)


# Pair each query with its reply in a table (displayed only; nothing is written to disk).
import pandas as pd
df = pd.DataFrame(list(zip(queries, replies)), columns=["Query", "Reply"])
df
   


In [None]:
# Display the query/reply table.
df

In [None]:
# Rebind `vector_store` to the collection that already exists in Qdrant.
# NOTE: `retriever` was created from the earlier `vector_store` object and is not rebuilt here.
vector_store = QdrantVectorStore.from_existing_collection(
    embedding=embedding_model,  # type: ignore
    collection_name=QDRANT_COLLECTION,
    url=QDRANT_URL,
    prefer_grpc=True,
)


In [None]:
# Query the vector store directly for the three closest matches.
vector_store.similarity_search("What is the best way to learn data science?", k=3)

In [None]:
import datetime
from typing import cast
from lets_talk.config import Configuration
from langchain_core.runnables import RunnableConfig
from lets_talk.utils import format_docs, get_message_text
from pydantic  import BaseModel
from langchain.chat_models import init_chat_model
from lets_talk.state import InputState
from langchain_core.messages import HumanMessage, AIMessage, SystemMessage
from langchain_core.prompts import (ChatPromptTemplate,
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate,
    MessagesPlaceholder,
)


# Thread identifier reused by the RunnableConfig objects built in the cells below.
thread_config = {"thread_id": "thread-1"}


class SearchQuery(BaseModel):
    """Search the indexed documents for a query."""

    # Generated query text; generate_query reads it back as `generated.query`.
    query: str


async def generate_query(
    state: InputState, *, config: RunnableConfig
) -> dict[str, list[str]]:
    """Generate a refined search query from the latest user message.

    The text of the last message, which must be a ``HumanMessage``, is used to
    fetch documents from the notebook-level ``retriever``. Those documents are
    appended to the conversation as an ``AIMessage``, and the configured query
    model is asked, through structured output, for a single ``SearchQuery``.
    The model is always consulted, including for a one-message conversation.

    Args:
        state: Conversation state. ``state["messages"]`` supplies the
            conversation and ``state.get("queries", [])`` the previously
            generated queries.
        config: Runnable configuration. It is used to build the
            ``Configuration`` (system prompt and query model) and is forwarded
            to both the retriever and the model chain.

    Returns:
        A dictionary with a single ``"queries"`` key holding a one-element list
        that contains the generated query.

    Raises:
        ValueError: If there are no messages or the last one is not a
            ``HumanMessage``.
    """
    messages = state["messages"]
    configuration = Configuration.from_runnable_config(config)

    # One guard covers both the empty conversation and a non-human last message.
    if not messages or not isinstance(messages[-1], HumanMessage):
        raise ValueError("No HumanMessage found in the messages.")

    # The last message is the user query regardless of conversation length.
    user_query = get_message_text(messages[-1])

    docs = retriever.invoke(user_query, config=config)  # type: ignore
    docs_str = format_docs(docs)

    # Present the retrieved documents to the model as an extra AI message.
    context_message = AIMessage(
        content=f"Here are the relevant documents:\n{docs_str}",
        additional_kwargs={"tool_calls": []},
    )

    # Feel free to customize the prompt, model, and other logic!
    prompt = ChatPromptTemplate.from_messages(
        [
            SystemMessagePromptTemplate.from_template(configuration.query_system_prompt),
            MessagesPlaceholder(variable_name="messages"),
        ]
    )
    model = init_chat_model(configuration.query_model, temperature=0).with_structured_output(
        SearchQuery
    )

    # Values for the prompt: the conversation plus the context message, the
    # earlier queries as a "\n- " joined string, and the current UTC time.
    message_value = {
        "messages": [*messages, context_message],
        "queries": "\n- ".join(state.get("queries", [])),
        "system_time": datetime.datetime.now(tz=datetime.timezone.utc).isoformat(),
    }

    chain = prompt | model

    generated = cast(SearchQuery, await chain.ainvoke(message_value, config))
    return {
        "queries": [generated.query],
    }


result = await generate_query(
    InputState(
        messages=[
            HumanMessage(
                content="How does TheDataGuy standout in AI arena?",
            ),

        ],

    ),
    config=RunnableConfig(config=thread_config), # type: ignore
)

print(result)

In [None]:
# Retrieve with the generated query; `thread_id` is passed under RunnableConfig's `configurable` key.
retriever.invoke(result['queries'][0], config=RunnableConfig(configurable=thread_config))

In [None]:
result_1 = await generate_query(InputState(
        messages=[
            HumanMessage(
                content="Give me a rundown in number about his career and blog",
            ),
        ],
        queries=result["queries"],  # type: ignore
    ),
    config=RunnableConfig(config=thread_config), # type: ignore
)
print(result_1)

In [None]:
# Retrieve with the second generated query; `thread_id` is passed under RunnableConfig's `configurable` key.
retriever.invoke(result_1['queries'][0], config=RunnableConfig(configurable=thread_config))

In [None]:
# Hand-written query for comparison; `thread_id` is passed under RunnableConfig's `configurable` key.
retriever.invoke("Profession experience TheDataGuy", config=RunnableConfig(configurable=thread_config))

In [None]:
# Ask the agent directly; `thread_id` is passed under RunnableConfig's `configurable` key.
response = agent.invoke(
    {"messages": [HumanMessage(content="How does TheDataGuy standout in AI arena?")]},
    config=RunnableConfig(configurable=thread_config),
)
print(get_message_text(response['messages'][-1]))

In [None]:
# Same question, preceded by an AI message stating which page the user is on.
# `thread_id` is passed under RunnableConfig's `configurable` key.
response = agent.invoke(
    {
        "messages": [
            AIMessage(content="User is currently visiting `https://thedataguy.pro/` page"),
            HumanMessage(content="How does TheDataGuy standout in AI arena?"),
        ]
    },
    config=RunnableConfig(configurable=thread_config),
)

print(get_message_text(response['messages'][-1]))


In [None]:
from langchain_core.messages import ToolMessage 
# Page context supplied as a ToolMessage placed before the user question.
# NOTE(review): this ToolMessage is not paired with an AI tool call; some chat providers may reject that — confirm.
# `thread_id` is passed under RunnableConfig's `configurable` key.
response = agent.invoke(
    {
        "messages": [
            ToolMessage(content="User is currently visiting https://thedataguy.pro/about/", tool_call_id="current_visiting_page"),
            HumanMessage(content="Give me a rundown in number about his career and blog?"),
        ]
    },
    config=RunnableConfig(configurable=thread_config),
)

print(get_message_text(response['messages'][-1]))

In [None]:
from langchain_core.messages import ToolMessage 
# Page context supplied as a ToolMessage placed after the user question.
# NOTE(review): this ToolMessage is not paired with an AI tool call; some chat providers may reject that — confirm.
# `thread_id` is passed under RunnableConfig's `configurable` key.
response = agent.invoke(
    {
        "messages": [
            HumanMessage(content="What is TheDataGuy currently employeed?"),
            ToolMessage(content="User is currently visiting https://thedataguy.pro/about/", tool_call_id="current_visiting_page"),
        ]
    },
    config=RunnableConfig(configurable=thread_config),
)

print(get_message_text(response['messages'][-1]))

In [None]:
# Retrieve and format the documents for one question, for inspection in the next cell.
# `thread_id` is passed under RunnableConfig's `configurable` key.
docs = retriever.invoke(
    "What is relationship between Sunrise Technologies and TheDataGuy?",
    config=RunnableConfig(configurable=thread_config),
)
docs_str = format_docs(docs)

In [None]:
# Display the formatted retrieval result.
docs_str

In [None]:
# Ask the agent about current employment; `thread_id` is passed under RunnableConfig's `configurable` key.
response = agent.invoke(
    {"messages": [HumanMessage(content="Where is TheDataGuy currently working?")]},
    config=RunnableConfig(configurable=thread_config),
)
print(get_message_text(response['messages'][-1]))

In [None]:
# Ask the agent the same relationship question used for the raw retrieval above.
# `thread_id` is passed under RunnableConfig's `configurable` key.
response = agent.invoke(
    {"messages": [HumanMessage(content="What is relationship between Sunrise Technologies and TheDataGuy?")]},
    config=RunnableConfig(configurable=thread_config),
)

print(get_message_text(response['messages'][-1]))

In [None]:
result_2 = await generate_query(InputState(
        messages=[
            HumanMessage(
                content="Give me his X handle",
            ),
        ],
        queries=[*result["queries"],*result_1['queries']],  # type: ignore
    ),
    config=RunnableConfig(config=thread_config), # type: ignore
)
print(result_2)

In [None]:
# Retrieve with the third generated query; `thread_id` is passed under RunnableConfig's `configurable` key.
retriever.invoke(result_2['queries'][0], config=RunnableConfig(configurable=thread_config))

In [None]:
result_3 = await generate_query(InputState(
        messages=[
            HumanMessage(
                content="And youtube channel",
            ),
        ],
        queries=[*result["queries"],*result_1['queries'],*result_2['queries']],  # type: ignore
    ),
    config=RunnableConfig(config=thread_config), # type: ignore
)
print(result_3)

In [None]:
# Retrieve with the fourth generated query; `thread_id` is passed under RunnableConfig's `configurable` key.
retriever.invoke(result_3['queries'][0], config=RunnableConfig(configurable=thread_config))

In [None]:
# Retrieve with the raw user question (no generated query, no config) for comparison.
retriever.invoke("Give me a rundown in number about his career and blog")

## Testing the Vector Store

Let's test the vector store with a few queries to make sure it's working correctly.

In [None]:
# Smoke-test retrieval using the `retriever` defined earlier in this notebook.
test_queries = [
    "Give me projects list?",
    "What is RAGAS?",
    "How to build research agents?",
    "What is metric driven development?",
    "Who is TheDataGuy?"
]

for query in test_queries:
    print(f"\nQuery: {query}")
    docs = retriever.invoke(query)
    print(f"Retrieved {len(docs)} documents:")
    # Numbered list of hits with their title and URL metadata (with fallbacks).
    for rank, hit in enumerate(docs, start=1):
        title = hit.metadata.get("post_title", "Unknown")
        url = hit.metadata.get("url", "No URL")
        print(f"{rank}. {title} ({url})")

In [None]:
# Close the vector store's client connection now that the notebook is finished with it.
vector_store.client.close()