# Update Blog Data

This notebook demonstrates how to update the blog data and vector store when new blog posts are published. It uses the utility functions from the `lets_talk.utils.blog` module (imported below as `blog_utils`).

In [None]:
# Imports for the setup cells.
import sys
import os
from pathlib import Path
from dotenv import load_dotenv

# Load environment variables from a .env file, if one is present (python-dotenv).
load_dotenv()
# NOTE(review): `sys` and `os` are already imported above; these repeats are
# redundant but harmless.
import sys
import os

# Make the parent of the current working directory importable.
# The path is resolved to an absolute form, announced, and appended to
# sys.path only when it is not already listed there.
parent_of_cwd = os.path.join(os.getcwd(), "../")
package_root = os.path.abspath(parent_of_cwd)
print(f"Adding package root to sys.path: {package_root}")
already_importable = package_root in sys.path
if not already_importable:
    sys.path.append(package_root)


In [None]:
# Resolve the project root (two levels above the notebook directory) and make
# it the working directory.
#
# The notebook directory is captured only on the first run: os.getcwd() changes
# after the chdir below, so recomputing it on a re-run would climb two more
# levels every time this cell is executed.
if "notebook_dir" not in globals():
    notebook_dir = os.getcwd()
print(f"Current notebook directory: {notebook_dir}")
project_root = os.path.abspath(os.path.join(notebook_dir, "../../"))
print(f"Project root: {project_root}")
os.chdir(project_root)

## Update Blog Data Process

This process will:
1. Load existing blog posts
2. Process and update metadata
3. Create or update vector embeddings

In [None]:
import lets_talk.utils.blog as blog_utils

# Load blog posts from the posts directory, using "index.md" as the glob pattern.
docs = blog_utils.load_blog_posts(
    data_dir="/home/mafzaal/source/mafzaal.github.io/posts",
    glob_pattern="index.md",
)




In [None]:
# Index the loaded documents by their "source" metadata value.
# Documents without a "source" key are collected under "unknown".
docs_by_source = {}
for doc in docs:
    source = doc.metadata.get("source", "unknown")
    # setdefault creates the list the first time a source is seen.
    docs_by_source.setdefault(source, []).append(doc)

In [None]:
# Show the documents grouped under the "introduction-to-ragas" post's source path.
ragas_post_source = '/home/mafzaal/source/mafzaal.github.io/posts/introduction-to-ragas/index.md'
docs_by_source[ragas_post_source]

In [None]:
# Run the project's metadata-update helper over all loaded posts, passing the
# posts directory (with trailing slash) as the data-dir prefix.
docs_with_data = blog_utils.update_document_metadata(
    docs,
    data_dir_prefix="/home/mafzaal/source/mafzaal.github.io/posts/",
)

In [None]:
# Display the value returned by update_document_metadata for inspection.
docs_with_data

In [None]:
# Select the documents of one specific post by its source path.
# get source = /home/mafzaal/source/mafzaal.github.io/posts/2025/05/it-depends-on-the-context/index.md

source = "/home/mafzaal/source/mafzaal.github.io/posts/2025/05/it-depends-on-the-context/index.md"
# Raises KeyError if no loaded document has exactly this source path.
# NOTE(review): these are the same Document objects held in `docs`; whether they
# carry changes made by update_document_metadata depends on whether that helper
# mutates its input — confirm.
new_docs = docs_by_source[source]


In [None]:
# Pass the selected post's documents through the project's split_documents helper
# (presumably chunking them for embedding — confirm against the helper).
split_docs = blog_utils.split_documents(new_docs)

In [None]:
# Peek at the first element of the split result.
split_docs[0]

In [None]:
from langchain.embeddings import init_embeddings

# Create the Ollama-backed embedding model, pointing it at the
# host.docker.internal endpoint on port 11434.
embedding_model = init_embeddings(
    "ollama:snowflake-arctic-embed2:latest",
    base_url="http://host.docker.internal:11434",
)
# Quick smoke test, left disabled:
#embedding_model.embed_query("Hello, how are you?")

In [None]:
#vector_store = blog_utils = blog_utils.create_vector_store(docs,'./db/vector_store_tdg_3')

from langchain.embeddings import init_embeddings
from langchain_qdrant import QdrantVectorStore

# NOTE(review): this rebinds `embedding_model` without the `base_url` passed in
# the earlier cell, so whatever endpoint the provider defaults to is used from
# here on — confirm that is intended.
embedding_model = init_embeddings("ollama:snowflake-arctic-embed2:latest")

# Connection settings for the Qdrant collection that receives the documents.
# NOTE(review): 6334 is Qdrant's default gRPC port (REST defaults to 6333);
# confirm `url` is meant to point at it alongside prefer_grpc=True.
qdrant_target = {
    "collection_name": "the_data_guy_dev",
    "url": "http://127.0.0.1:6334",
    "prefer_grpc": True,
}

# Build the vector store from the split documents of the selected post.
vector_store = QdrantVectorStore.from_documents(
    split_docs,
    embedding=embedding_model,  # type: ignore
    **qdrant_target,
)




In [None]:
#vector_store.add_documents(split_docs)

In [None]:
# Connect to the existing "the_data_guy_dev" collection; this rebinds
# `vector_store` to the store returned by from_existing_collection.
existing_collection = {
    "collection_name": "the_data_guy_dev",
    "url": "http://127.0.0.1:6334",
    "prefer_grpc": True,
}
vector_store = QdrantVectorStore.from_existing_collection(
    embedding=embedding_model,  # type: ignore
    **existing_collection,
)


In [None]:
from langchain_community.document_loaders import WebBaseLoader

# Site pages to fetch as documents.
site_page_urls = [
    "https://thedataguy.pro/analytics/",
    "https://thedataguy.pro/projects/",
    "https://thedataguy.pro/about/",
]
loader = WebBaseLoader(site_page_urls)
web_docs = loader.load()


In [None]:
# Inspect the text of the third loaded web document
# (presumably the /about/ page, if the loader preserves URL order — confirm).
web_docs[2].page_content

In [None]:
from langchain.schema.document import Document
# Placeholder text for a hand-written document; it currently holds only whitespace.
page_content = """ 

"""
# NOTE(review): this rebinds `new_docs` (previously the selected blog post's
# documents), and no later cell visible here adds this list to the vector
# store — confirm intent.
new_docs = [Document(page_content=page_content, metadata={'url':'https://thedataguy.pro/contact'})]

In [None]:
# Add the loaded web pages to the vector store.
# NOTE(review): the pages are added as loaded, without going through
# blog_utils.split_documents like the blog post above — confirm that is intended.
vector_store.add_documents(web_docs)

In [None]:
# Sanity check: request the 3 closest matches for the query "analytics".
vector_store.similarity_search("analytics", k=3)

## Testing the Vector Store

Let's test the vector store with a few queries to make sure it's working correctly.

In [None]:
# Create a retriever from the vector store that returns 3 results per query.
retriever = vector_store.as_retriever(search_kwargs={"k": 3})

# Sample questions used to eyeball retrieval results.
test_queries = [
    "Give me projects list?",
    "What is RAGAS?",
    "How to build research agents?",
    "What is metric driven development?",
    "Who is TheDataGuy?"
]

# Results are bound to `retrieved_docs` / `hit` rather than `docs` / `doc`, so
# running this cell does not overwrite the blog posts loaded into `docs` earlier
# in the notebook (which would break re-running the earlier cells afterwards).
for query in test_queries:
    print(f"\nQuery: {query}")
    retrieved_docs = retriever.invoke(query)
    print(f"Retrieved {len(retrieved_docs)} documents:")
    for rank, hit in enumerate(retrieved_docs, start=1):
        # Fall back to placeholders when a document lacks these metadata keys.
        title = hit.metadata.get("post_title", "Unknown")
        url = hit.metadata.get("url", "No URL")
        print(f"{rank}. {title} ({url})")

In [None]:
# Close the client held by the vector store now that the notebook is done with it.
vector_store.client.close()