In [1]:
# Setup & Installations
# Package installation is currently disabled (the block below is commented out),
# so this cell only imports `sys` and prints a status line. The required packages
# are assumed to be present in the kernel environment already.
import sys

# To re-enable installation, uncomment the block below.
# NOTE(review): `or True` makes the Colab check always pass — confirm this is intended.
# NOTE(review): the version specifiers are quoted because `%pip` arguments are
# presumably handed to the system shell, where an unquoted `>=` would be read as
# output redirection instead of a version constraint — confirm.
# if "google.colab" in sys.modules or True:
#     print(" Installing required packages...")
#     %pip install -q "langchain>=0.1.0" "langchain-openai>=0.0.5" "langchain-community>=0.0.20" "langchain-text-splitters>=0.2.0" "chromadb>=0.4.0" "tiktoken>=0.5.0" "python-dotenv>=1.0.0"

print(" Packages ready")

 Packages ready


In [3]:
#  Imports & Environment Setup
import os
import sys
import json
import random
from pathlib import Path
from dotenv import load_dotenv

# The notebook is expected to run from a sub-folder of the project, so the
# project root is the parent of the current working directory. Its `src`
# folder goes first on sys.path so the local package can be imported below.
project_root = Path.cwd().parent
sys.path.insert(0, str(project_root.joinpath("src")))

# Read environment variables from the project-level .env file.
load_dotenv(project_root.joinpath(".env"))

# Credentials: OpenRouter is preferred, OpenAI is the fallback.
openrouter_key = os.environ.get("OPENROUTER_API_KEY")
openai_key = os.environ.get("OPENAI_API_KEY")

# Fail fast when neither key is configured.
if not (openrouter_key or openai_key):
    raise EnvironmentError(
        "   No API key found!\n"
        "   Add OPENROUTER_API_KEY (recommended) or OPENAI_API_KEY to .env"
    )

# Project configuration; importable only after sys.path was extended above.
from context_engineering.config import CRAWL_OUT_DIR, VECTOR_DIR, EMBEDDING_MODEL, PROVIDER

# Fixed seed for the stdlib random module.
random.seed(42)

# Label shown in the summary, derived from which key is present.
if openrouter_key:
    provider = "OpenRouter"
else:
    provider = "OpenAI"

print(
    " Environment loaded",
    f" Provider: {provider}",
    f" Project root: {project_root}",
    sep="\n",
)

 Environment loaded
 Provider: OpenRouter
 Project root: c:\Development\real-estate-intelligence-platform


### Import Chunking Services

The chunking functions are imported from the application layer (`context_engineering.application.ingest_documents_service`).

In [4]:
#  Import Chunking Services
from context_engineering.application.ingest_documents_service import (
    semantic_chunk,
    fixed_chunk,
    sliding_chunk, 
    parent_child_chunk,
    late_chunk_index
)

# Report where the chunkers were loaded from and list the available strategies.
# Emitted as one print call; the output matches printing each line separately.
print(
    "\n".join(
        [
            " Chunking services loaded from service layer",
            " Location: context_engineering.application.ingest_documents_service.chunkers",
            "\n Available strategies:",
            "   1. semantic_chunk  - Split by heading structure",
            "   2. fixed_chunk     - Uniform 800-token chunks with overlap",
            "   3. sliding_chunk   - Overlapping windows for better recall",
            "   4. parent_child_chunk - Chunk with parent-child relationships",
            "   5. late_chunk_index - Chunk with late indexing",
        ]
    )
)

 Chunking services loaded from service layer
 Location: context_engineering.application.ingest_documents_service.chunkers

 Available strategies:
   1. semantic_chunk  - Split by heading structure
   2. fixed_chunk     - Uniform 800-token chunks with overlap
   3. sliding_chunk   - Overlapping windows for better recall
   4. parent_child_chunk - Chunk with parent-child relationships
   5. late_chunk_index - Chunk with late indexing


###  Load Corpus

In [5]:
# Load the crawled corpus: a JSONL file holding one JSON document per line.
jsonl_path = CRAWL_OUT_DIR / "primelands_docs.jsonl"

if not jsonl_path.exists():
    # Report the path that was checked so a wrong CRAWL_OUT_DIR is easy to spot.
    raise FileNotFoundError(
        f" Corpus not found at {jsonl_path}. Run 01_crawl_primelands.ipynb first."
    )

with open(jsonl_path, 'r', encoding='utf-8') as corpus_file:
    # Skip blank lines (for example an extra empty line at the end of the file):
    # json.loads() raises JSONDecodeError on them and would abort the whole load.
    documents = [json.loads(line) for line in corpus_file if line.strip()]

print(f" Loaded {len(documents)} documents")
print(f" Total content size: {sum(len(d['content']) for d in documents):,} chars")

 Loaded 424 documents
 Total content size: 256,340 chars


### Apply Chunking Strategies

In [6]:
# Cleanup Vector Store (prevents corruption)
import shutil
import os
import stat
import time

def on_rm_error(func, path, exc_info):
    """Error handler for ``shutil.rmtree``: make ``path`` writable and retry.

    Args:
        func: The operation that failed; it is retried as ``func(path)``.
        path: The filesystem path the operation failed on.
        exc_info: Exception details supplied by ``shutil.rmtree`` (unused).

    Raises:
        OSError: If the mode change or the retried operation fails. The error
            is deliberately not swallowed: otherwise ``shutil.rmtree`` returns
            normally and the caller cannot tell that deletion did not happen.
    """
    # Removal commonly fails on read-only entries; set the write bit, then retry.
    os.chmod(path, stat.S_IWRITE)
    func(path)

# Remove any existing vector store so the index is rebuilt from scratch.
# Fallback chain: delete -> rename out of the way -> switch to another directory.
LOCK_DETECTED = False
if VECTOR_DIR.exists():
    print(f" Attempting to clean: {VECTOR_DIR}")
    try:
        shutil.rmtree(VECTOR_DIR, onerror=on_rm_error)
        # rmtree returns normally whenever its error handler absorbs a failure,
        # so confirm the directory is really gone before reporting success.
        if VECTOR_DIR.exists():
            raise OSError(f"{VECTOR_DIR} still exists after rmtree")
        print("    Cleaned up successfully")
    except Exception as e:
        print(f"    Cleanup failed ({e})")
        # Try renaming as last resort cleanup
        try:
            backup = VECTOR_DIR.with_name(f"vectorstore_locked_{int(time.time())}")
            os.rename(VECTOR_DIR, backup)
            print(f"    Renamed locked dir to: {backup}")
        except Exception as e2:
            LOCK_DETECTED = True
            print(f"    CRITICAL LOCK: Could not delete or rename ({e2})")

if LOCK_DETECTED:
    # OVERRIDE VECTOR_DIR to use a fresh path
    print("\n  FILE LOCK DETECTED (likely opened in editor)")
    print("    Switching to a new directory to bypass lock...")
    VECTOR_DIR = VECTOR_DIR.with_name("vectorstore_v2")
    print(f"    NEW TARGET: {VECTOR_DIR}")

# Create the target directory in both cases, so the fallback path also exists
# before later cells write to it.
# NOTE: an existing fallback directory is reused as-is; its contents are not cleared.
VECTOR_DIR.mkdir(parents=True, exist_ok=True)
if not LOCK_DETECTED:
    print(f" Fresh vector directory ready: {VECTOR_DIR}")

 Attempting to clean: c:\Development\real-estate-intelligence-platform\data\vectorstore
    Cleaned up successfully
 Fresh vector directory ready: c:\Development\real-estate-intelligence-platform\data\vectorstore


#### 01. Semantic Chunking

In [7]:
# Run the semantic chunker over the whole corpus.
print(" Running semantic chunking...")
semantic_chunks = semantic_chunk(documents)

# Persist as JSONL: one chunk per line, non-ASCII characters written as-is.
semantic_path = CRAWL_OUT_DIR.joinpath("chunks_semantic.jsonl")
with semantic_path.open('w', encoding='utf-8') as out_file:
    for chunk in semantic_chunks:
        print(json.dumps(chunk, ensure_ascii=False), file=out_file)

# Summary: number of chunks and where they were written.
print(
    f" Semantic chunking complete: {len(semantic_chunks)} chunks",
    f" Saved to: {semantic_path}",
    sep="\n",
)

 Running semantic chunking...
 Semantic chunking complete: 414 chunks
 Saved to: c:\Development\real-estate-intelligence-platform\data\chunks_semantic.jsonl


#### 02. Fixed-Window Chunking

In [8]:
# Run the fixed-window chunker over the whole corpus.
print(" Running fixed-window chunking...")
fixed_chunks = fixed_chunk(documents)

# Persist as JSONL: one chunk per line, non-ASCII characters written as-is.
fixed_path = CRAWL_OUT_DIR.joinpath("chunks_fixed.jsonl")
with fixed_path.open('w', encoding='utf-8') as out_file:
    for chunk in fixed_chunks:
        print(json.dumps(chunk, ensure_ascii=False), file=out_file)

# Mean of each chunk's 'token_count'; 0 when no chunks were produced, which
# avoids dividing by zero.
if fixed_chunks:
    avg_tokens = sum(item['token_count'] for item in fixed_chunks) / len(fixed_chunks)
else:
    avg_tokens = 0

# Summary: chunk count, average size and output location.
print(
    f" Fixed chunking complete: {len(fixed_chunks)} chunks",
    f" Avg token count: {avg_tokens:.1f}",
    f" Saved to: {fixed_path}",
    sep="\n",
)

 Running fixed-window chunking...
 Fixed chunking complete: 424 chunks
 Avg token count: 213.5
 Saved to: c:\Development\real-estate-intelligence-platform\data\chunks_fixed.jsonl


#### 03. Sliding-Window Chunking

In [10]:
# Run the sliding-window chunker over the whole corpus.
print(" Running sliding-window chunking...")
sliding_chunks = sliding_chunk(documents)

# Persist as JSONL: one chunk per line, non-ASCII characters written as-is.
sliding_path = CRAWL_OUT_DIR.joinpath("chunks_sliding.jsonl")
with sliding_path.open('w', encoding='utf-8') as out_file:
    for chunk in sliding_chunks:
        out_file.write(f"{json.dumps(chunk, ensure_ascii=False)}\n")

# Summary: number of chunks and where they were written.
print(
    f" Sliding chunking complete: {len(sliding_chunks)} chunks",
    f" Saved to: {sliding_path}",
    sep="\n",
)

 Running sliding-window chunking...
 Sliding chunking complete: 424 chunks
 Saved to: c:\Development\real-estate-intelligence-platform\data\chunks_sliding.jsonl


#### 04. Parent-Child Chunking

#### 05. Late Chunking