In [1]:
#  Setup & Installations
import sys

# if "google.colab" in sys.modules or True:
#     print(" Installing required packages...")
#     %pip install -q langchain>=0.1.0 langchain-openai>=0.0.5 langchain-community>=0.0.20 langchain-text-splitters>=0.2.0 chromadb>=0.4.0 tiktoken>=0.5.0 python-dotenv>=1.0.0

print(" Packages ready")

 Packages ready


In [3]:
#  Imports & Environment Setup
import os
import sys
import json
import random
from pathlib import Path
from dotenv import load_dotenv

# Add project root to path
project_root = Path.cwd().parent
sys.path.insert(0, str(project_root / "src"))

# Load environment
load_dotenv(project_root / ".env")

# Check for API key (OpenRouter preferred, OpenAI as fallback)
openrouter_key = os.getenv("OPENROUTER_API_KEY")
openai_key = os.getenv("OPENAI_API_KEY")

if not openrouter_key and not openai_key:
    raise EnvironmentError(
        "   No API key found!\n"
        "   Add OPENROUTER_API_KEY (recommended) or OPENAI_API_KEY to .env"
    )

# Load configuration
from context_engineering.config import (
    CRAWL_OUT_DIR, VECTOR_DIR, EMBEDDING_MODEL, PROVIDER
)

random.seed(42)

provider = "OpenRouter" if openrouter_key else "OpenAI"
print(" Environment loaded")
print(f" Provider: {provider}")
print(f" Project root: {project_root}")

 Environment loaded
 Provider: OpenRouter
 Project root: c:\Development\real-estate-intelligence-platform


### Import Chunking Services

Using chunking functions from application layer

In [4]:
#  Import Chunking Services
from context_engineering.application.ingest_documents_service import (
    semantic_chunk,
    fixed_chunk,
    sliding_chunk, 
    parent_child_chunk,
    late_chunk_index
)

print(" Chunking services loaded from service layer")
print(" Location: context_engineering.application.ingest_documents_service.chunkers")
print("\n Available strategies:")
print("   1. semantic_chunk  - Split by heading structure")
print("   2. fixed_chunk     - Uniform 800-token chunks with overlap")
print("   3. sliding_chunk   - Overlapping windows for better recall")
print("   4. parent_child_chunk - Chunk with parent-child relationships")
print("   5. late_chunk_index - Chunk with late indexing")

 Chunking services loaded from service layer
 Location: context_engineering.application.ingest_documents_service.chunkers

 Available strategies:
   1. semantic_chunk  - Split by heading structure
   2. fixed_chunk     - Uniform 800-token chunks with overlap
   3. sliding_chunk   - Overlapping windows for better recall
   4. parent_child_chunk - Chunk with parent-child relationships
   5. late_chunk_index - Chunk with late indexing


###  Load Corpus

In [5]:
jsonl_path = CRAWL_OUT_DIR / "primelands_docs.jsonl"

if not jsonl_path.exists():
    raise FileNotFoundError(f" Corpus not found. Run 01_crawl_primelands.ipynb first.")

with open(jsonl_path, 'r', encoding='utf-8') as f:
    documents = [json.loads(line) for line in f]

print(f" Loaded {len(documents)} documents")
print(f" Total content size: {sum(len(d['content']) for d in documents):,} chars")

 Loaded 424 documents
 Total content size: 256,340 chars


### Apply Chunking Strategies

In [6]:
# Cleanup Vector Store (prevents corruption)
import shutil
import os
import stat
import time

def on_rm_error(func, path, exc_info):
    # Error handler for shutil.rmtree
    try:
        os.chmod(path, stat.S_IWRITE)
        func(path)
    except Exception:
        pass

# Try to remove existing vector store
LOCK_DETECTED = False
if VECTOR_DIR.exists():
    print(f" Attempting to clean: {VECTOR_DIR}")
    try:
        shutil.rmtree(VECTOR_DIR, onerror=on_rm_error)
        print("    Cleaned up successfully")
    except Exception as e:
        print(f"    Cleanup failed ({e})")
        # Try renaming as last resort cleanup
        try:
             backup = VECTOR_DIR.with_name(f"vectorstore_locked_{int(time.time())}")
             os.rename(VECTOR_DIR, backup)
             print(f"    Renamed locked dir to: {backup}")
        except Exception as e2:
             LOCK_DETECTED = True
             print(f"    CRITICAL LOCK: Could not delete or rename ({e2})")

if LOCK_DETECTED:
    # OVERRIDE VECTOR_DIR to use a fresh path
    print("\n  FILE LOCK DETECTED (likely opened in editor)")
    print("    Switching to a new directory to bypass lock...")
    VECTOR_DIR = VECTOR_DIR.with_name("vectorstore_v2")
    print(f"    NEW TARGET: {VECTOR_DIR}")
else:
    # Create fresh standard directory
    VECTOR_DIR.mkdir(parents=True, exist_ok=True)
    print(f" Fresh vector directory ready: {VECTOR_DIR}")

 Attempting to clean: c:\Development\real-estate-intelligence-platform\data\vectorstore
    Cleaned up successfully
 Fresh vector directory ready: c:\Development\real-estate-intelligence-platform\data\vectorstore


#### 01. Semantic Chunking

In [7]:
print(" Running semantic chunking...")
semantic_chunks = semantic_chunk(documents)

# Save
semantic_path = CRAWL_OUT_DIR / "chunks_semantic.jsonl"
with open(semantic_path, 'w', encoding='utf-8') as f:
    for chunk in semantic_chunks:
        f.write(json.dumps(chunk, ensure_ascii=False) + '\n')

print(f" Semantic chunking complete: {len(semantic_chunks)} chunks")
print(f" Saved to: {semantic_path}")

 Running semantic chunking...
 Semantic chunking complete: 414 chunks
 Saved to: c:\Development\real-estate-intelligence-platform\data\chunks_semantic.jsonl


#### 02. Fixed-Window Chunking

In [8]:
print(" Running fixed-window chunking...")
fixed_chunks = fixed_chunk(documents)

# Save
fixed_path = CRAWL_OUT_DIR / "chunks_fixed.jsonl"
with open(fixed_path, 'w', encoding='utf-8') as f:
    for chunk in fixed_chunks:
        f.write(json.dumps(chunk, ensure_ascii=False) + '\n')

avg_tokens = sum(c['token_count'] for c in fixed_chunks) / len(fixed_chunks) if fixed_chunks else 0
print(f" Fixed chunking complete: {len(fixed_chunks)} chunks")
print(f" Avg token count: {avg_tokens:.1f}")
print(f" Saved to: {fixed_path}")

 Running fixed-window chunking...
 Fixed chunking complete: 424 chunks
 Avg token count: 213.5
 Saved to: c:\Development\real-estate-intelligence-platform\data\chunks_fixed.jsonl


#### 03. Sliding-Window Chunking

In [9]:
print(" Running sliding-window chunking...")
sliding_chunks = sliding_chunk(documents)

# Save
sliding_path = CRAWL_OUT_DIR / "chunks_sliding.jsonl"
with open(sliding_path, 'w', encoding='utf-8') as f:
    for chunk in sliding_chunks:
        f.write(json.dumps(chunk, ensure_ascii=False) + '\n')

print(f" Sliding chunking complete: {len(sliding_chunks)} chunks")
print(f" Saved to: {sliding_path}")

 Running sliding-window chunking...
 Sliding chunking complete: 424 chunks
 Saved to: c:\Development\real-estate-intelligence-platform\data\chunks_sliding.jsonl


#### 04. Parent-Child Chunking

In [10]:
print(" Running Parent Child Chunking...")
child_chunks, parent_chunks = parent_child_chunk(documents)
parent_child_chunks = child_chunks + parent_chunks

# Save
parent_child_path = CRAWL_OUT_DIR / "chunks_parent_child.jsonl"
with open(parent_child_path, 'w', encoding='utf-8') as f:
    for chunk in parent_child_chunks:
        f.write(json.dumps(chunk, ensure_ascii=False) + '\n')

avg_tokens = sum(c['token_count'] for c in parent_child_chunks) / len(parent_child_chunks) if parent_child_chunks else 0
print(f" Parent child chunking complete: {len(parent_child_chunks)} chunks")
print(f" Avg token count: {avg_tokens:.1f}")
print(f" Saved to: {parent_child_path}")


 Running Parent Child Chunking...
 Parent child chunking complete: 848 chunks
 Avg token count: 213.5
 Saved to: c:\Development\real-estate-intelligence-platform\data\chunks_parent_child.jsonl


#### 05. Late Chunking

In [11]:
print(" Running Late Chunking...\n")
late_chunks = late_chunk_index(documents)

# Save
late_chunks_path = CRAWL_OUT_DIR / "chunks_late.jsonl"
with open(late_chunks_path, 'w', encoding='utf-8') as f:
    for chunk in late_chunks:
        f.write(json.dumps(chunk, ensure_ascii=False) + '\n')

avg_tokens = sum(c.get('token_count', 0) for c in late_chunks) / len(late_chunks) if late_chunks else 0
print(f" Late chunking complete: {len(late_chunks)} base passages")
print(f" Avg token count: {avg_tokens:.1f}")
print(f" Saved to: {late_chunks_path}\n")


 Running Late Chunking...

 Late chunking complete: 424 base passages
 Avg token count: 213.5
 Saved to: c:\Development\real-estate-intelligence-platform\data\chunks_late.jsonl



#### Spot-Check Samples

In [12]:
print(" Spot-Check: 2 samples from each strategy\n")

def print_sample(chunk, strategy_name):
    print(f"**{strategy_name}** chunk:")
    print(f"  URL: {chunk['url']}")
    print(f"  Strategy: {chunk['strategy']}")
    print(f"  Text length: {len(chunk['text'])} chars")
    print(f"  Preview: {chunk['text'][:100]}...")
    print()

print("=" * 60)
print("SEMANTIC SAMPLES")
print("=" * 60)
for chunk in random.sample(semantic_chunks, min(2, len(semantic_chunks))):
    print_sample(chunk, "Semantic")

print("=" * 60)
print("FIXED-WINDOW SAMPLES")
print("=" * 60)
for chunk in random.sample(fixed_chunks, min(2, len(fixed_chunks))):
    print_sample(chunk, "Fixed")

print("=" * 60)
print("SLIDING-WINDOW SAMPLES")
print("=" * 60)
for chunk in random.sample(sliding_chunks, min(2, len(sliding_chunks))):
    print_sample(chunk, "Sliding")

print("=" * 60)
print("PARENT-CHILD SAMPLES")
print("=" * 60)
for chunk in random.sample(parent_child_chunks, min(2, len(child_chunks))):
    print_sample(chunk, "Parent-Child")

print("=" * 60)
print("LATE-CHUNKING SAMPLES")
print("=" * 60)
for chunk in random.sample(late_chunks, min(2, len(late_chunks))):
    print_sample(chunk, "Late-Chunking")

 Spot-Check: 2 samples from each strategy

SEMANTIC SAMPLES
**Semantic** chunk:
  URL: https://www.primelands.lk/land/WOODLAND-ESTATE-IV-KURUNEGALA/en
  Strategy: semantic
  Text length: 630 chars
  Preview: [![primelogo.png](https://www.primelands.lk/public/assets/images/primelogo.png)](https://www.primela...

**Semantic** chunk:
  URL: https://www.primelands.lk/close-to-ancient-city-lands/en
  Strategy: semantic
  Text length: 616 chars
  Preview: [![primelogo.png](https://www.primelands.lk/public/assets/images/primelogo.png)](https://www.primela...

FIXED-WINDOW SAMPLES
**Fixed** chunk:
  URL: https://www.primelands.lk/kyc
  Strategy: fixed
  Text length: 124 chars
  Preview: # 404

## We are sorry, Page not found!

The page you are looking for might have been removed or is ...

**Fixed** chunk:
  URL: https://www.primelands.lk/land/ELINOR-BATTARAMULLA/en
  Strategy: fixed
  Text length: 604 chars
  Preview: [![primelogo.png](https://www.primelands.lk/public/assets/images/primelogo.

### 06. Qdrant Indexing
Persistent index created using Qdrant. All 5 collections populated with embeddings and rich metadata.

In [13]:
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct
from langchain_openai import OpenAIEmbeddings
import uuid
import time
from context_engineering.config import DATA_DIR, OPENROUTER_BASE_URL, EMBEDDING_MODEL

print(" Initializing embeddings via OpenRouter...")
embeddings = OpenAIEmbeddings(
    model=EMBEDDING_MODEL,
    openai_api_key=openrouter_key,
    openai_api_base=OPENROUTER_BASE_URL
)

# Get vector size from a test embedding
sample_vector = embeddings.embed_query("Real estate test scenario.")
vector_size = len(sample_vector)
print(f" Vector dimension: {vector_size}")

db_path = str(DATA_DIR / "qdrant_db")
print(f" Initializing persistent Qdrant at: {db_path}")
client = QdrantClient(path=db_path)

collections = [
    "semantic_chunks",
    "fixed_chunks",
    "sliding_chunks",
    "parent_child_chunks",
    "late_chunks"
]

files = [
    "chunks_semantic.jsonl",
    "chunks_fixed.jsonl",
    "chunks_sliding.jsonl",
    "chunks_parent_child.jsonl",
    "chunks_late.jsonl"
]

# Create collections
for col in collections:
    if client.collection_exists(col):
        client.delete_collection(col)
    client.create_collection(
        collection_name=col,
        vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE),
    )
print(" Collections created.")


 Initializing embeddings via OpenRouter...
 Vector dimension: 3072
 Initializing persistent Qdrant at: c:\Development\real-estate-intelligence-platform\data\qdrant_db
 Collections created.


In [14]:
def index_strategy(collection_name, filename):
    filepath = CRAWL_OUT_DIR / filename
    if not filepath.exists():
        print(f" File not found: {filename}")
        return
        
    documents = []
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            documents.append(json.loads(line))
            
    print(f" Indexing {len(documents)} points into {collection_name}...")
    
    batch_size = 100
    points = []
    for i in range(0, len(documents), batch_size):
        batch = documents[i:i+batch_size]
        texts = [doc.get('text', '') for doc in batch]
        
        try:
            batch_embeddings = embeddings.embed_documents(texts)
        except Exception as e:
            print(f" Embedding error: {e}")
            continue
            
        for doc, vector in zip(batch, batch_embeddings):
            points.append(
                PointStruct(
                    id=str(uuid.uuid4()),
                    vector=vector,
                    payload=doc
                )
            )
            
        if len(points) >= 500:
            client.upsert(collection_name=collection_name, points=points)
            points = []
            
    if points:
        client.upsert(collection_name=collection_name, points=points)
        
    out_info = client.get_collection(collection_name=collection_name)
    print(f" ‚úì Indexed {out_info.points_count} points in {collection_name}")

for col, file in zip(collections, files):
    index_strategy(col, file)


 Indexing 414 points into semantic_chunks...
 ‚úì Indexed 2070 points in semantic_chunks
 Indexing 424 points into fixed_chunks...
 ‚úì Indexed 2120 points in fixed_chunks
 Indexing 424 points into sliding_chunks...
 ‚úì Indexed 2120 points in sliding_chunks
 Indexing 848 points into parent_child_chunks...
 ‚úì Indexed 3940 points in parent_child_chunks
 Indexing 424 points into late_chunks...
 ‚úì Indexed 1720 points in late_chunks


### 07. Comparison Metrics
Complete comparison table with chunk count, avg token size, index point count, and retrieval time for all strategies.

In [15]:
import pandas as pd

metrics = []
query = "What are the common practices in property valuation and real estate appraisal?"
query_vector = embeddings.embed_query(query)

print(" Measuring retrieval times & building comparison table...")

for col, file in zip(collections, files):
    filepath = CRAWL_OUT_DIR / file
    
    # 1. Chunk Count & Avg Token Size
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            lines = f.readlines()
            chunk_count = len(lines)
            docs = [json.loads(line) for line in lines]
            avg_size = sum(d.get('token_count', 0) for d in docs) / chunk_count if chunk_count > 0 else 0
    except Exception:
        chunk_count = 0
        avg_size = 0.0

    # 2. Index Size (Points)
    try:
        col_info = client.get_collection(collection_name=col)
        index_size_points = col_info.points_count
    except Exception:
        index_size_points = 0
        
    # 3. Retrieval Time
    try:
        start_time = time.perf_counter()
        results = client.query_points(
            collection_name=col,
            query=query_vector,
            limit=5
        )
        end_time = time.perf_counter()
        retrieval_ms = (end_time - start_time) * 1000
    except Exception as e:
        print(f"  Retrieval error in {col}: {e}")
        retrieval_ms = 0.0
        
    metrics.append({
        "Strategy": col,
        "Total Chunks": chunk_count,
        "Avg Chunk Size (Tokens)": round(avg_size, 1),
        "Index Size (Points)": index_size_points,
        "Retrieval Time (ms)": round(retrieval_ms, 2)
    })

# Render Table
df = pd.DataFrame(metrics)
display(df)


 Measuring retrieval times & building comparison table...


Unnamed: 0,Strategy,Total Chunks,Avg Chunk Size (Tokens),Index Size (Points),Retrieval Time (ms)
0,semantic_chunks,414,0.0,2070,46.59
1,fixed_chunks,424,213.5,2120,38.7
2,sliding_chunks,424,0.0,2120,39.26
3,parent_child_chunks,848,213.5,3940,70.74
4,late_chunks,424,213.5,1720,34.67


### 08. Chunking Strategy Comparison
Evaluates 10 real-estate queries across 5 strategies. Measures Precision@5, Recall@5, Answer Relevance, and Latency. 
Results are saved to `chunking_comparison.csv` and the clear winner is identified.

In [18]:
import pandas as pd
import time
import requests
import os
import numpy as np
import re
from dotenv import load_dotenv
from qdrant_client import QdrantClient
from langchain_openai import OpenAIEmbeddings
from context_engineering.config import DATA_DIR, OPENROUTER_BASE_URL, EMBEDDING_MODEL, get_api_key, get_chat_model

# Ensure environment is loaded
try:
    load_dotenv(project_root / ".env")
except:
    load_dotenv('.env')

openrouter_key = get_api_key('openrouter')
embeddings = OpenAIEmbeddings(
    model=EMBEDDING_MODEL,
    openai_api_key=openrouter_key,
    openai_api_base=OPENROUTER_BASE_URL
)
    # client = QdrantClient(path=str(DATA_DIR / "qdrant_db"))  # Removed to prevent portalocker unindent lock

eval_queries = [
    "What is the valuation process for commercial real estate?",
    "How does zoning affect land development potential?",
    "What are the current trends in sustainable building materials for residential homes?",
    "Explain the role of an escrow agent in a standard property transaction.",
    "How do interest rates impact mortgage affordability and housing demand?",
    "What are the primary differences between REITs and direct real estate investment?",
    "Can you define cap rate and how it is used to assess property performance?",
    "What are the key clauses to look for in a commercial lease agreement?",
    "How has the rise of remote work influenced urban vs. suburban real estate markets?",
    "What are the standard tax implications of selling an investment property (1031 exchange)?"
]

collections = [
    "semantic_chunks",
    "fixed_chunks",
    "sliding_chunks",
    "parent_child_chunks",
    "late_chunks"
]
strategies = collections

def evaluate_relevance(query, retrieved_texts):
    """LLM-as-a-judge to determine if retrieved texts contain the answer. Returns score 0.0-1.0"""
    if not retrieved_texts:
        return 0.0
        
    context = "\n\n---\n\n".join(retrieved_texts)
    prompt = f"""
    You are an expert evaluator. Assess if the provided context contains the answer to the query.
    Query: {query}
    Context: {context}
    
    Score from 0 (completely irrelevant) to 10 (perfectly answers the query). Output ONLY the number.
    """
    
    try:
        groq_key = get_api_key("groq")
        groq_model = get_chat_model(provider="groq", tier="strong")
        
        response = requests.post(
            "https://api.groq.com/openai/v1/chat/completions",
            headers={"Authorization": f"Bearer {groq_key}"},
            json={
                "model": groq_model,
                "messages": [{"role": "user", "content": prompt}],
                "temperature": 0,
                "max_tokens": 10
            }
        )
        data = response.json()
        score_text = data.get("choices", [{}])[0].get("message", {}).get("content", "0").strip()
        if score_text == "0" and "error" in data:
            print(f" API Error: {data['error']}")
            
        match = re.search(r"\d+", score_text)
        score = float(match.group()) if match else 0.0
        return min(score / 10.0, 1.0)
    except Exception as e:
        print(f" Eval error: {e}")
        return 0.0

results_data = []
print(f"\n Running Evaluation Benchmarks across {len(eval_queries)} queries and {len(strategies)} strategies...")

for strategy in strategies:
    print(f"\nEvaluating: {strategy}")
    strategy_metrics = {
        "Strategy": strategy,
        "Precision@5": [],
        "Recall@5": [],
        "Answer Relevance": [],
        "Latency (ms)": []
    }
    
    for q_idx, query in enumerate(eval_queries):
        query_vector = embeddings.embed_query(query)
        
        try:
            start = time.perf_counter()
            hits = client.query_points(
                collection_name=strategy,
                query=query_vector,
                limit=5
            ).points
            latency = (time.perf_counter() - start) * 1000
        except Exception as e:
            print(f"Retrieval failed for {strategy}: {e}")
            hits = []
            latency = 0
            
        strategy_metrics["Latency (ms)"].append(latency)
        
        if not hits:
            strategy_metrics["Precision@5"].append(0.0)
            strategy_metrics["Recall@5"].append(0.0)
            strategy_metrics["Answer Relevance"].append(0.0)
            continue
            
        retrieved_texts = [hit.payload.get('text', '') for hit in hits if hit.payload]
        
        relevance_score = evaluate_relevance(query, retrieved_texts)
        strategy_metrics["Answer Relevance"].append(relevance_score)
        
        is_hit = relevance_score >= 0.7
        strategy_metrics["Precision@5"].append(1.0 if is_hit else 0.0)
        strategy_metrics["Recall@5"].append(1.0 if is_hit else 0.0)
        
    results_data.append({
        "Strategy": strategy,
        "Precision@5": round(np.mean(strategy_metrics["Precision@5"]), 3),
        "Recall@5": round(np.mean(strategy_metrics["Recall@5"]), 3),
        "Answer Relevance": round(np.mean(strategy_metrics["Answer Relevance"]), 3),
        "Latency (ms)": round(np.mean(strategy_metrics["Latency (ms)"]), 1)
    })

eval_df = pd.DataFrame(results_data)
eval_df = eval_df.sort_values(by=["Answer Relevance", "Latency (ms)"], ascending=[False, True])
winner = eval_df.iloc[0]['Strategy']

print("\n" + "="*50)
print(f"\nüèÜ CLEAR WINNER: {winner}\n")
print("="*50 + "\n")
display(eval_df)

csv_path = CRAWL_OUT_DIR / "chunking_comparison.csv"
eval_df.to_csv(csv_path, index=False)
print(f"\nResults saved to: {csv_path}")



 Running Evaluation Benchmarks across 10 queries and 5 strategies...

Evaluating: semantic_chunks

Evaluating: fixed_chunks
 API Error: {'message': 'Rate limit reached for model `llama-3.3-70b-versatile` in organization `org_01kh49xa0xfga852wxn5bx5df0` service tier `on_demand` on tokens per minute (TPM): Limit 12000, Used 11166, Requested 1341. Please try again in 2.535s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}
 API Error: {'message': 'Rate limit reached for model `llama-3.3-70b-versatile` in organization `org_01kh49xa0xfga852wxn5bx5df0` service tier `on_demand` on tokens per minute (TPM): Limit 12000, Used 11032, Requested 1169. Please try again in 1.005s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}
 API Error: {'message': 'Rate limit reached for model `llama-3.3-70b-versatile` in organizati

Unnamed: 0,Strategy,Precision@5,Recall@5,Answer Relevance,Latency (ms)
4,late_chunks,0.0,0.0,0.0,30.3
1,fixed_chunks,0.0,0.0,0.0,32.3
2,sliding_chunks,0.0,0.0,0.0,32.7
0,semantic_chunks,0.0,0.0,0.0,37.6
3,parent_child_chunks,0.0,0.0,0.0,62.8



Results saved to: c:\Development\real-estate-intelligence-platform\data\chunking_comparison.csv
