### Setup & Installations

In [1]:
import sys
# if "google.colab" in sys.modules or True:
#     print(" Installing required packages...")
#     %pip install -q playwright>=1.40.0 python-dotenv>=1.0.0 beautifulsoup4>=4.12.0 markdownify>=0.11.6 nest-asyncio>=1.5.0
    
#     # Install Playwright browsers
#     print(" Installing Playwright browsers...")
#     import subprocess
#     subprocess.run([sys.executable, "-m", "playwright", "install", "chromium"], check=True, capture_output=True)

# print(" Packages ready")

### Imports & Environment Setup

In [2]:
import os
import sys
import json
import time
from pathlib import Path
from urllib.parse import urlparse
from dotenv import load_dotenv
import nest_asyncio

# Enable nested asyncio
nest_asyncio.apply()

# Add project root to path
project_root = Path.cwd().parent
sys.path.insert(0, str(project_root / "src"))

# Load environment
load_dotenv(project_root / ".env")

# Check for API key (OpenRouter preferred, OpenAI as fallback)
openrouter_key = os.getenv("OPENROUTER_API_KEY")
groq_key = os.getenv("GROQ_API_KEY")

if not openrouter_key and not groq_key:
    raise EnvironmentError(
        " No API key found!\n"
        " Add OPENROUTER_API_KEY (recommended) or GROQ_API_KEY to .env"
    )

provider = "OpenRouter" if openrouter_key else "OpenAI"
print(" Environment loaded")
print(f" Provider: {provider}")
print(f" Project root: {project_root}")

 Environment loaded
 Provider: OpenRouter
 Project root: c:\Development\real-estate-intelligence-platform


### Load Configuration

In [4]:
from context_engineering.config import (
    validate, dump, CRAWL_OUT_DIR, MARKDOWN_DIR
)

# Validate and display config
try:
    validate()
    dump()
except Exception as e:
    print(f"  Config note: {e}")

# Ensure directories exist
MARKDOWN_DIR.mkdir(parents=True, exist_ok=True)

print(f"\n Output directories ready:")
print(f"   - Markdown: {MARKDOWN_DIR}")
print(f"   - JSONL: {CRAWL_OUT_DIR}")


CONFIGURATION (NON-SECRETS ONLY)

 Provider:
   Provider: openrouter
   Model Tier: general
   Chat Model: openai/gpt-4o-mini
   Embedding Model: openai/text-embedding-3-large

 Directories:
   Data Root: c:\Development\real-estate-intelligence-platform\data
   Vector Store: c:\Development\real-estate-intelligence-platform\data\vectorstore
   Markdown: c:\Development\real-estate-intelligence-platform\data\primelands_markdown
   Cache: c:\Development\real-estate-intelligence-platform\data\cag_cache

 Chunking:
   Fixed Size: 800 tokens
   Fixed Overlap: 100 tokens
   Sliding Window: 512 tokens
   Sliding Stride: 256 tokens
   Parent-Child: 250 → 1200 tokens
   Late Chunk: 1000 → 300 tokens

 Retrieval:
   Top-K Results: 4
   Similarity Threshold: 0.7

 CAG:
   Cache TTL: 86400s
   Max Cache Size: 1000

 CRAG:
   Confidence Threshold: 0.6
   Expanded K: 8



 Output directories ready:
   - Markdown: c:\Development\real-estate-intelligence-platform\data\primelands_markdown
   - JSONL: c:

In [None]:
# Crawl Configuration
BASE_URL = "https://www.primelands.lk"

START_PATHS = [
    "/", "/portfolio-property", "/land", "/house",
    "/apartment/ongoing", "/upcoming-projects", "/about-us", "sell-your-land",
    "/contact-us", "/services", "/testimonial",
    "/careers", "/news",
    "/online-publications", "/blog", "/virtual-tour", "/kyc", "/terms-and-conditions"
    "/privacy-policy"
]

START_URLS = [BASE_URL + path for path in START_PATHS]

EXCLUDE_PATTERNS = [
    "/login", "/terms", "/privacy", "/admin",
    "/images/", "/downloads/", "/media/", "/sin" , "/tam", "/uploads/", "/downloads/", "/pdf/",
    ".webp", ".mp4", ".jpg", ".jpeg", ".png", ".css", ".js"
]

MAX_DEPTH = 3
REQUEST_DELAY = 2.0
JSONL_PATH = CRAWL_OUT_DIR / "primelands_docs.jsonl"

print(f" Crawl config:")
print(f"   - Start URLs: {len(START_URLS)}")
print(f"   - Max depth: {MAX_DEPTH}")
print(f"   - Request delay: {REQUEST_DELAY}s")

 Crawl config:
   - Start URLs: 18
   - Max depth: 3
   - Request delay: 2.0s


### Import Crawler Service

Using `PrimeLandCrawler` from application layer 

In [6]:
# Import Web Crawler Service
from context_engineering.application.ingest_documents_service import PrimeLandWebCrawler

print(" PrimeLandWebCrawler loaded from service layer")
print(" Location: context_engineering.application.ingest_documents_service.web_crawler")

 PrimeLandWebCrawler loaded from service layer
 Location: context_engineering.application.ingest_documents_service.web_crawler


In [7]:
# Run Crawl with Service
import asyncio
from concurrent.futures import ThreadPoolExecutor

start_time = time.time()

# Initialize crawler service
crawler = PrimeLandWebCrawler(
    base_url=BASE_URL,
    max_depth=MAX_DEPTH,
    exclude_patterns=EXCLUDE_PATTERNS
)

def run_crawler_thread():
    # 1. Set Proactor policy (required for Playwright on Windows)
    asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())
    
    # 2. Create a new event loop for this thread
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    
    try:
        # 3. Process the async crawl directly on this loop
        return loop.run_until_complete(crawler.crawl_async(START_URLS, request_delay=REQUEST_DELAY))
    finally:
        loop.close()

print(f"\n Starting crawl at {time.strftime('%H:%M:%S')}\n")

# Run in a separate thread to avoid conflict with Jupyter's running loop
with ThreadPoolExecutor(max_workers=1) as executor:
    future = executor.submit(run_crawler_thread)
    documents = future.result()

elapsed = time.time() - start_time
print(f"\n Crawl complete in {elapsed:.1f}s")
print(f" Documents collected: {len(documents)}")
print(f" URLs visited: {len(crawler.visited)}")


 Starting crawl at 22:20:38

 [0] https://www.primelands.lk/
    Saved (554 chars, 45 links found)
    Added 45 new URLs to queue (depth 1)
   Progress: 1 docs saved, 1 visited, 62 in queue
 [0] https://www.primelands.lk/portfolio-property
    Saved (124 chars, 0 links found)
   Progress: 2 docs saved, 2 visited, 61 in queue
 [0] https://www.primelands.lk/land
    Saved (564 chars, 70 links found)
    Added 56 new URLs to queue (depth 1)
   Progress: 3 docs saved, 3 visited, 116 in queue
 [0] https://www.primelands.lk/house
    Saved (566 chars, 30 links found)
    Added 16 new URLs to queue (depth 1)
   Progress: 4 docs saved, 4 visited, 131 in queue
 [0] https://www.primelands.lk/apartment/ongoing
    Saved (124 chars, 0 links found)
   Progress: 5 docs saved, 5 visited, 130 in queue
 [0] https://www.primelands.lk/upcoming-projects
    Saved (124 chars, 0 links found)
   Progress: 6 docs saved, 6 visited, 129 in queue
 [0] https://www.primelands.lk/about-us
    Saved (124 chars, 0 l

### Save Outputs

In [8]:

# Save markdown files
for i, doc in enumerate(documents):
    url_path = urlparse(doc['url']).path.strip('/').replace('/', '_')
    if not url_path:
        url_path = "homepage"
    filename = f"{i:03d}_{url_path}.md"
    
    md_file = MARKDOWN_DIR / filename
    with open(md_file, 'w', encoding='utf-8') as f:
        f.write(f"# {doc['title']}\n\n")
        f.write(f"**URL**: {doc['url']}\n\n")
        f.write(f"**Depth**: {doc['depth_level']}\n\n")
        f.write("---\n\n")
        f.write(doc['content'])

print(f" Saved {len(documents)} markdown files to {MARKDOWN_DIR}")

# Save JSONL
with open(JSONL_PATH, 'w', encoding='utf-8') as f:
    for doc in documents:
        f.write(json.dumps(doc, ensure_ascii=False) + '\n')

print(f" Saved JSONL corpus to {JSONL_PATH}")

 Saved 714 markdown files to c:\Development\real-estate-intelligence-platform\data\primelands_markdown
 Saved JSONL corpus to c:\Development\real-estate-intelligence-platform\data\primelands_docs.jsonl


### Quality Checks

In [9]:
import random

print(" Quality Checks:\n")

# Check markdown files
md_files = list(MARKDOWN_DIR.glob("*.md"))
print(f"1  Markdown files: {len(md_files)}")

if len(md_files) >= 20:
    print(f"    Good! Got {len(md_files)} pages")
elif len(md_files) >= 10:
    print(f"     Only {len(md_files)} pages (site may be small)")
else:
    raise AssertionError(f" Too few pages: {len(md_files)}")

# Check JSONL
assert JSONL_PATH.exists(), f" JSONL not found"
print(f"\n2  JSONL file: {JSONL_PATH.stat().st_size:,} bytes")

# Sample inspection
with open(JSONL_PATH, 'r', encoding='utf-8') as f:
    all_docs = [json.loads(line) for line in f]

samples = random.sample(all_docs, min(3, len(all_docs)))
print(f"\n3  Random samples:\n")

for i, doc in enumerate(samples, 1):
    print(f"   Sample {i}:")
    print(f"   - URL: {doc['url']}")
    print(f"   - Title: {doc['title']}")
    print(f"   - Words: {len(doc['content'].split())}")
    print()

print(" All quality checks passed!")

 Quality Checks:

1  Markdown files: 714
    Good! Got 714 pages

2  JSONL file: 2,724,992 bytes

3  Random samples:

   Sample 1:
   - URL: https://www.primelands.lk/portfolio-property/city/Anuradhapura/tam
   - Title: Land Sales In Sri Lanka | Prime Lands
   - Words: 14

   Sample 2:
   - URL: https://www.primelands.lk/land/district/Matara/sin
   - Title: Land for sale in Matara | Prime Lands
   - Words: 18

   Sample 3:
   - URL: https://www.primelands.lk/land/city/Minuwangoda/en
   - Title: Minuwangoda Lands in Sri Lanka
   - Words: 14

 All quality checks passed!
