### Setup & Installations

In [1]:
import sys
# Optional dependency bootstrap. Every line below is commented out, so running this
# cell only imports `sys`; nothing is installed.
# NOTE(review): if re-enabled as written, the trailing `or True` makes the guard always
# pass, so the install would run in every environment, not only when "google.colab"
# is present in sys.modules.
# NOTE(review): the requirement specifiers use an unquoted `>=`; if %pip passes the line
# through a shell, `>` may be treated as output redirection. Quote each requirement
# (e.g. "playwright>=1.40.0") before re-enabling; TODO confirm.
# if "google.colab" in sys.modules or True:
#     print(" Installing required packages...")
#     %pip install -q playwright>=1.40.0 python-dotenv>=1.0.0 beautifulsoup4>=4.12.0 markdownify>=0.11.6 nest-asyncio>=1.5.0
    
#     # Install Playwright browsers
#     print(" Installing Playwright browsers...")
#     import subprocess
#     subprocess.run([sys.executable, "-m", "playwright", "install", "chromium"], check=True, capture_output=True)

# print(" Packages ready")

### Imports & Environment Setup

In [3]:
import os
import sys
import json
import time
from pathlib import Path
from urllib.parse import urlparse
from dotenv import load_dotenv
import nest_asyncio

# Patch asyncio through nest_asyncio so nested event loops are allowed.
nest_asyncio.apply()

# The project root is taken to be the parent of the current working directory;
# its "src" folder is placed first on the import path.
project_root = Path.cwd().parent
sys.path.insert(0, str(project_root.joinpath("src")))

# Pull variables from the project's .env file into the process environment.
load_dotenv(project_root.joinpath(".env"))

# Either key is sufficient; OpenRouter takes precedence when both are present.
openrouter_key = os.environ.get("OPENROUTER_API_KEY")
openai_key = os.environ.get("OPENAI_API_KEY")

# Stop here when neither key holds a non-empty value.
if not (openrouter_key or openai_key):
    raise EnvironmentError(
        " No API key found!\n"
        " Add OPENROUTER_API_KEY (recommended) or OPENAI_API_KEY to .env"
    )

if openrouter_key:
    provider = "OpenRouter"
else:
    provider = "OpenAI"

# Summarise what was resolved.
print(" Environment loaded")
print(f" Provider: {provider}")
print(f" Project root: {project_root}")

 Environment loaded
 Provider: OpenRouter
 Project root: c:\Development\real-estate-intelligence-platform


### Load Configuration

In [5]:
from context_engineering.config import (
    validate, dump, CRAWL_OUT_DIR, MARKDOWN_DIR
)

# Validate and display the configuration. This step is best-effort: any exception
# raised by validate() or dump() is reported as a note and the cell keeps going.
try:
    validate()
    dump()
except Exception as e:
    print(f"  Config note: {e}")

# Create every directory that the summary below reports as ready. mkdir with
# parents=True / exist_ok=True is safe to re-run and also creates missing parents.
# Path(...) accepts either a str or a Path value.
# NOTE(review): assumes CRAWL_OUT_DIR names a directory, not a file — confirm in config.
for output_dir in (MARKDOWN_DIR, CRAWL_OUT_DIR):
    Path(output_dir).mkdir(parents=True, exist_ok=True)

print("\n Output directories ready:")
print(f"   - Markdown: {MARKDOWN_DIR}")
print(f"   - JSONL: {CRAWL_OUT_DIR}")


CONFIGURATION (NON-SECRETS ONLY)

 Provider:
   Provider: openrouter
   Model Tier: general
   Chat Model: openai/gpt-4o-mini
   Embedding Model: openai/text-embedding-3-large

 Directories:
   Data Root: c:\Development\real-estate-intelligence-platform\data
   Vector Store: c:\Development\real-estate-intelligence-platform\data\vectorstore
   Markdown: c:\Development\real-estate-intelligence-platform\data\primelands_markdown
   Cache: c:\Development\real-estate-intelligence-platform\data\cag_cache

 Chunking:
   Fixed Size: 800 tokens
   Fixed Overlap: 100 tokens
   Sliding Window: 512 tokens
   Sliding Stride: 256 tokens
   Parent-Child: 250 → 1200 tokens
   Late Chunk: 1000 → 300 tokens

 Retrieval:
   Top-K Results: 4
   Similarity Threshold: 0.7

 CAG:
   Cache TTL: 86400s
   Max Cache Size: 1000

 CRAG:
   Confidence Threshold: 0.6
   Expanded K: 8



 Output directories ready:
   - Markdown: c:\Development\real-estate-intelligence-platform\data\primelands_markdown
   - JSONL: c: