# Week 01 · Crawl Nawaloka (Depth 3) → Markdown + JSONL

**Objective**: Crawl https://www.nawaloka.com to a maximum depth of 3, save each page as a Markdown file, and write a consolidated JSONL corpus.

**Architecture**: Uses `NawalokaWebCrawler` service from `context_engineering.application.ingest_documents_service`

**Provider Support**: Works with OpenRouter (multi-provider) or the OpenAI API directly, configured via `.env`

In [None]:
# Cell 1: Setup & Installations
import sys

if "google.colab" in sys.modules or True:
    print(" Installing required packages...")
    %pip install -q playwright>=1.40.0 python-dotenv>=1.0.0 beautifulsoup4>=4.12.0 markdownify>=0.11.6 nest-asyncio>=1.5.0
    
    # Install Playwright browsers
    print(" Installing Playwright browsers...")
    import subprocess
    subprocess.run([sys.executable, "-m", "playwright", "install", "chromium"], check=True, capture_output=True)

print(" Packages ready")

In [None]:
# Cell 2: Imports & Environment Setup
import os
import sys
import json
import time
from pathlib import Path
from urllib.parse import urlparse
from dotenv import load_dotenv
import nest_asyncio

# Allow asyncio event loops to be nested (the notebook kernel already runs one)
nest_asyncio.apply()

# The project root is taken to be the parent of the current working
# directory; its src/ folder is put first on the import path.
project_root = Path.cwd().parent
sys.path.insert(0, str(project_root / "src"))

# Pull variables from the project's .env file into the environment
load_dotenv(project_root / ".env")

# Either key is sufficient; OpenRouter wins when both are set
openrouter_key = os.environ.get("OPENROUTER_API_KEY")
openai_key = os.environ.get("OPENAI_API_KEY")

if not (openrouter_key or openai_key):
    raise EnvironmentError(
        " No API key found!\n"
        " Add OPENROUTER_API_KEY (recommended) or OPENAI_API_KEY to .env"
    )

if openrouter_key:
    provider = "OpenRouter"
else:
    provider = "OpenAI"

print(" Environment loaded")
print(f" Provider: {provider}")
print(f" Project root: {project_root}")

In [None]:
# Cell 3: Load Configuration
from context_engineering.config import (
    validate, dump, CRAWL_OUT_DIR, MARKDOWN_DIR
)

# Validate and display the configuration using the helpers imported from
# context_engineering.config. Any exception is reported as a note instead of
# stopping the notebook (deliberate best-effort).
try:
    validate()
    dump()
except Exception as e:
    print(f"  Config note: {e}")

# Ensure BOTH output directories exist. The JSONL corpus is later written
# inside CRAWL_OUT_DIR, so creating only MARKDOWN_DIR could leave that write
# failing with FileNotFoundError on a fresh checkout.
MARKDOWN_DIR.mkdir(parents=True, exist_ok=True)
CRAWL_OUT_DIR.mkdir(parents=True, exist_ok=True)

print(f"\n Output directories ready:")
print(f"   - Markdown: {MARKDOWN_DIR}")
print(f"   - JSONL: {CRAWL_OUT_DIR}")

## Import Crawler Service

This notebook uses `NawalokaWebCrawler` from the application layer; the class is imported, not defined here.

In [None]:
# Cell 4: Import Web Crawler Service
from context_engineering.application.ingest_documents_service import NawalokaWebCrawler

# Confirm the import succeeded and report where the class is expected to live
print(
    " NawalokaWebCrawler loaded from service layer",
    " Location: context_engineering.application.ingest_documents_service.web_crawler",
    sep="\n",
)

## Crawl Configuration

In [None]:
# Cell 5: Crawl Configuration
# Site being crawled; every start path below is appended to this origin
BASE_URL = "https://www.nawaloka.com"

# Entry points for the crawl, as site-relative paths
START_PATHS = [
    "/",
    "/our-centres",
    "/healthchecks",
    "/channeling",
    "/aboutus",
    "/contactus",
    "/blogs-and-news",
    "/emergency",
    "/international",
    "/our-centres/cardiology",
    "/our-centres/oncology",
    "/our-centres/neurology",
    "/our-centres/orthopedics",
    "/our-centres/pediatrics",
    "/healthchecks/executive",
]

# Absolute URLs, in the same order as START_PATHS
START_URLS = [f"{BASE_URL}{start_path}" for start_path in START_PATHS]

# Patterns handed to the crawler as exclusions
EXCLUDE_PATTERNS = [
    "/login",
    "/terms",
    "/privacy",
    "/admin",
    "/images/",
    "/downloads/",
    "/media/",
]

MAX_DEPTH = 3
REQUEST_DELAY = 2.0  # reported in seconds by the config summary
# Consolidated corpus file, located inside the crawl output directory
JSONL_PATH = CRAWL_OUT_DIR / "nawaloka_docs.jsonl"

# Echo the key settings so the run is self-describing
print(
    " Crawl config:",
    f"   - Start URLs: {len(START_URLS)}",
    f"   - Max depth: {MAX_DEPTH}",
    f"   - Request delay: {REQUEST_DELAY}s",
    sep="\n",
)

## Execute Crawl

In [None]:
# Cell 6: Run Crawl with Service
# Start the clock before the crawler is constructed so setup time is included
start_time = time.time()

# Build the crawler from the constants defined in the configuration cell
crawler = NawalokaWebCrawler(
    base_url=BASE_URL,
    max_depth=MAX_DEPTH,
    exclude_patterns=EXCLUDE_PATTERNS,
)

started_at = time.strftime('%H:%M:%S')
print(f"\n Starting crawl at {started_at}\n")

# Run the crawl from every start URL; the result is kept for the save step
documents = crawler.crawl(START_URLS, request_delay=REQUEST_DELAY)

elapsed = time.time() - start_time
print(
    f"\n Crawl complete in {elapsed:.1f}s",
    f" Documents collected: {len(documents)}",
    f" URLs visited: {len(crawler.visited)}",
    sep="\n",
)

In [None]:
# Cell 7: Save Outputs
# Write one Markdown file per crawled document
for index, doc in enumerate(documents):
    # Flatten the URL path into a single name component; the site root has an
    # empty path and is stored as "homepage".
    slug = urlparse(doc['url']).path.strip('/').replace('/', '_') or "homepage"
    md_file = MARKDOWN_DIR / f"{index:03d}_{slug}.md"

    with md_file.open('w', encoding='utf-8') as handle:
        # Header (title, URL, depth), a rule, then the page content
        handle.writelines((
            f"# {doc['title']}\n\n",
            f"**URL**: {doc['url']}\n\n",
            f"**Depth**: {doc['depth_level']}\n\n",
            "---\n\n",
            doc['content'],
        ))

print(f" Saved {len(documents)} markdown files to {MARKDOWN_DIR}")

# Write the consolidated corpus: one JSON object per line, non-ASCII kept as-is
with JSONL_PATH.open('w', encoding='utf-8') as handle:
    handle.writelines(
        json.dumps(doc, ensure_ascii=False) + '\n' for doc in documents
    )

print(f" Saved JSONL corpus to {JSONL_PATH}")

## Quality Checks

In [None]:
# Cell 8: Quality Checks
import random

print(" Quality Checks:\n")

# Check 1: count the Markdown files present in the output directory.
# This counts every *.md file there, including any left from earlier runs.
md_files = list(MARKDOWN_DIR.glob("*.md"))
print(f"1️  Markdown files: {len(md_files)}")

if len(md_files) >= 20:
    print(f"    Good! Got {len(md_files)} pages")
elif len(md_files) >= 10:
    print(f"     Only {len(md_files)} pages (site may be small)")
else:
    raise AssertionError(f" Too few pages: {len(md_files)}")

# Check 2: the JSONL corpus must exist. An explicit raise is used instead of
# an `assert` statement so the check still runs under `python -O` and matches
# the explicit raise used for the page-count check above.
if not JSONL_PATH.exists():
    raise AssertionError(" JSONL not found")
print(f"\n2️  JSONL file: {JSONL_PATH.stat().st_size:,} bytes")

# Check 3: load the corpus and show a few random records for eyeballing.
# Blank lines are skipped so a trailing empty line does not break parsing.
with open(JSONL_PATH, 'r', encoding='utf-8') as f:
    all_docs = [json.loads(line) for line in f if line.strip()]

# Sample at most three records; fewer if the corpus is smaller than that
samples = random.sample(all_docs, min(3, len(all_docs)))
print(f"\n3️  Random samples:\n")

for i, doc in enumerate(samples, 1):
    print(f"   Sample {i}:")
    print(f"   - URL: {doc['url']}")
    print(f"   - Title: {doc['title']}")
    # Word count is a whitespace split of the stored content
    print(f"   - Words: {len(doc['content'].split())}")
    print()

print(" All quality checks passed!")