# Update Blog Data

This notebook demonstrates how to update the blog data and vector store when new blog posts are published. It uses the utility functions from `utils_data_loading.ipynb`.

In [None]:
import sys
import os
from pathlib import Path
from dotenv import load_dotenv
import importlib.util

# Load variables from a .env file into the process environment; the
# Configuration cell further down reads its settings through os.environ.
load_dotenv()

# The utility functions live in utils_data_loading.ipynb, which is a notebook rather than a .py module.
# The next cell defines a helper that executes that notebook's code cells inside a module object.

In [None]:
# Function to import the utility module
def import_notebook_as_module(notebook_path, module_name="utils_module"):
    """
    Import a Jupyter notebook as a Python module.

    Every code cell is passed through IPython's input transformer and then
    executed, in notebook order, inside the namespace of a fresh module
    object. Any cell whose source contains the text
    ``if __name__ == "__main__":`` is skipped entirely.

    Args:
        notebook_path: Path to the notebook (read as UTF-8)
        module_name: Name to give the module; it is also the key under
            which the module is registered in ``sys.modules``

    Returns:
        The imported module

    Raises:
        Whatever exception is raised while reading the notebook or while
        executing one of its cells. If a cell fails, the ``sys.modules``
        entry for ``module_name`` is put back to its state before the call.
    """
    import nbformat
    from importlib.util import spec_from_loader, module_from_spec
    from IPython.core.interactiveshell import InteractiveShell

    shell = InteractiveShell.instance()

    # Notebook files are UTF-8 encoded JSON; do not rely on the platform default encoding.
    with open(notebook_path, encoding="utf-8") as f:
        nb = nbformat.read(f, as_version=4)

    # Create an empty module to act as the namespace for the notebook's code
    spec = spec_from_loader(module_name, loader=None)
    module = module_from_spec(spec)

    # Remember what (if anything) was registered under this name so a failed
    # import does not leave a half-initialised module behind in sys.modules.
    not_registered = object()
    previous = sys.modules.get(module_name, not_registered)
    sys.modules[module_name] = module

    try:
        # Execute only the code cells in the notebook
        for cell in nb.cells:
            if cell.cell_type != 'code':
                continue

            # Skip any cell containing a script entry-point guard
            if 'if __name__ == "__main__":' in cell.source:
                continue

            # Translate IPython-specific syntax to plain Python, then run the
            # cell with the module's __dict__ as its global namespace.
            code = shell.input_transformer_manager.transform_cell(cell.source)
            exec(code, module.__dict__)
    except BaseException:
        if previous is not_registered:
            sys.modules.pop(module_name, None)
        else:
            sys.modules[module_name] = previous
        raise

    return module

In [None]:
# Execute utils_data_loading.ipynb and bind the resulting module object to `utils`
utils = import_notebook_as_module(notebook_path='utils_data_loading.ipynb')

# Names defined by the executed cells are now reachable as utils.<name>
print("Successfully imported utility functions.")

## Configuration

Set up the configuration for data processing.

In [None]:
# Runtime settings. Each one is read from the process environment (so it can be
# supplied through the .env file) and falls back to the default written here.
FORCE_RECREATE_EMBEDDINGS = "true" == os.getenv("FORCE_RECREATE_EMBEDDINGS", "false").lower()
BLOG_BASE_URL = os.getenv("BLOG_BASE_URL", "https://thedataguy.pro/blog/")
VECTOR_STORAGE_PATH = os.getenv("VECTOR_STORAGE_PATH", "./db/vectorstore_v3")
DATA_DIR = os.getenv("DATA_DIR", "data/")

# Echo the effective values so each run records what it was configured with
print(f"Data Directory: {DATA_DIR}")
print(f"Vector Storage Path: {VECTOR_STORAGE_PATH}")
print(f"Blog Base URL: {BLOG_BASE_URL}")
print(f"Force Recreate Embeddings: {FORCE_RECREATE_EMBEDDINGS}")

## Update Blog Data Process

This process will:
1. Load existing blog posts
2. Process and update metadata
3. Create or update vector embeddings

In [None]:
# Run utils.process_blog_posts over DATA_DIR with embedding creation enabled;
# FORCE_RECREATE_EMBEDDINGS is forwarded from the configuration cell.
result = utils.process_blog_posts(
    force_recreate_embeddings=FORCE_RECREATE_EMBEDDINGS,
    create_embeddings=True,
    data_dir=DATA_DIR,
)

# Pull the three entries used by the rest of the notebook out of the result
documents, stats, vector_store = (
    result["documents"],
    result["stats"],
    result["vector_store"],
)

print(f"\nProcessed {len(documents)} blog posts")
print(f"Vector store created/updated at: {VECTOR_STORAGE_PATH}")

## Testing the Vector Store

Let's test the vector store with a few queries to make sure it's working correctly.

In [None]:
# Wrap the vector store in a retriever configured with k=2
retriever = vector_store.as_retriever(search_kwargs={"k": 2})

# Sample questions used as a smoke test of retrieval
test_queries = [
    "What is RAGAS?",
    "How to build research agents?",
    "What is metric driven development?",
    "Who is TheDataGuy?",
]

# For every question, list the title and URL stored in each hit's metadata
for query in test_queries:
    print(f"\nQuery: {query}")
    docs = retriever.invoke(query)
    print(f"Retrieved {len(docs)} documents:")
    for rank, doc in enumerate(docs, start=1):
        metadata = doc.metadata
        title = metadata.get("post_title", "Unknown")
        url = metadata.get("url", "No URL")
        print(f"{rank}. {title} ({url})")

## Schedule This Notebook

To keep the blog data up-to-date, you can schedule this notebook to run periodically. 
Here are some options:

1. Use a cron job to run this notebook with papermill
2. Set up a GitHub Action to run this notebook on a schedule
3. Use Airflow or another workflow management system

Example of running with papermill:
```bash
papermill update_blog_data.ipynb output_$(date +%Y%m%d).ipynb
```

In [None]:
# Save stats to a file for tracking changes over time
import json
from datetime import datetime

# Snapshots are collected in a "stats" directory relative to the working directory
stats_dir = Path("stats")
stats_dir.mkdir(exist_ok=True)

# Read the clock once so the timestamp stored inside the file and the one
# embedded in the filename always describe the same instant.
run_time = datetime.now()

# Add timestamp to stats
stats["timestamp"] = run_time.isoformat()

# Save stats to a file named after the run time
stats_path = stats_dir / f"blog_stats_{run_time.strftime('%Y%m%d_%H%M%S')}.json"
with open(stats_path, "w", encoding="utf-8") as f:
    json.dump(stats, f, indent=2)

print(f"Saved stats to {stats_path}")