In [23]:
# 02-preprocess-chunk.ipynb
# This file contains the notebook-equivalent cells for preprocessing and chunking
# of cleaned Wikipedia text into retrieval-ready chunks.
# Paste into a Jupyter cell or save as a .py and run sections sequentially.


# ----------------------------
# Cell 0 - Header / Notes
# ----------------------------
# Notebook: 02 - Preprocess & Chunk
# Goal: clean the Wikipedia text extracts in data/raw/text/*.txt and chunk them
# into retrieval-sized pieces.
# Outputs: data/processed/chunks.jsonl and data/processed/chunks_preview.csv

In [12]:
# ----------------------------
# Cell 1 - Setup folders & paths
# ----------------------------
from pathlib import Path
import os
# Project root. By default this is the parent of the kernel's working
# directory, i.e. the notebook is expected to run from a sub-folder of the
# project. Set the RAG_PROJECT_ROOT environment variable to override it when
# the kernel is started from somewhere else; otherwise data/ would be looked up
# (and data/processed created) in the wrong place.
ROOT = Path(os.environ.get("RAG_PROJECT_ROOT") or Path.cwd().parent).expanduser()
RAW_TEXT_DIR = ROOT / "data" / "raw" / "text" # or use data/processed if you saved cleaned text there
PROCESSED_DIR = ROOT / "data" / "processed"
# Output folder is created eagerly so later cells can write without checks.
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
print("ROOT:", ROOT)
print("RAW_TEXT_DIR:", RAW_TEXT_DIR)
print("PROCESSED_DIR:", PROCESSED_DIR)

ROOT: /Users/jaydobariya/Desktop/RAG Project
RAW_TEXT_DIR: /Users/jaydobariya/Desktop/RAG Project/data/raw/text
PROCESSED_DIR: /Users/jaydobariya/Desktop/RAG Project/data/processed


In [13]:
# ----------------------------
# Cell 2 - Cleaning & chunking helpers
# ----------------------------
import re, json

# Short bracketed markers such as [1], [12], [a] or [note 3]. Because the
# character class allows letters and spaces, "[citation needed]" is removed too.
_CITATION_RE = re.compile(r'\[\s*[0-9a-zA-Z.,:;–—#\s-]{1,30}\s*\]')
_CITATION_NEEDED_RE = re.compile(r'\(citation needed\)', re.IGNORECASE)
# Headings that start the non-content tail of an article. Matching is tolerant
# of spacing and case ("==References==", "== further reading ==", ...).
_TRAILING_SECTION_RE = re.compile(
    r'={2,}[ \t]*(?:References|External links|See also|Notes|Further reading)[ \t]*={2,}',
    re.IGNORECASE,
)
# A wiki heading: the same line wrapped in runs of two or more "=".
_HEADING_RE = re.compile(r'={2,}[ \t]*([^=\n]+?)[ \t]*={2,}')


def clean_wikipedia_text(text):
    """
    Light cleaning for Wikipedia extracts.

    Steps, in order:
      1. remove short bracketed citation markers and "(citation needed)";
      2. cut the text at the first References / External links / See also /
         Notes / Further reading heading;
      3. strip the "==" markup around remaining headings, keeping the title;
      4. decode "&nbsp;" and "&amp;", then collapse blank lines and runs of
         spaces/tabs.

    Only paired heading markup is removed, so a literal "=" inside body text
    (for example "E = mc2") is preserved.

    Args:
        text: raw article text; None or "" is accepted.

    Returns:
        The cleaned text, stripped of leading/trailing whitespace
        ("" for empty input).
    """
    if not text:
        return ""
    text = _CITATION_RE.sub('', text)
    text = _CITATION_NEEDED_RE.sub('', text)
    # The leftmost match is the earliest trailing section, whichever it is.
    trailing = _TRAILING_SECTION_RE.search(text)
    if trailing:
        text = text[:trailing.start()]
    # Pad the title with spaces so it never fuses with neighbouring words.
    text = _HEADING_RE.sub(r' \1 ', text)
    text = text.replace("&nbsp;", " ").replace("&amp;", "&")
    text = re.sub(r'\n\s*\n+', '\n\n', text)
    text = re.sub(r'[ \t]+', ' ', text)
    return text.strip()


def chunk_text_by_word_count(text, max_words=220, min_words=60):
    """
    Chunk text into pieces of at most max_words words, splitting at line
    (paragraph) boundaries when possible.

    Paragraphs are accumulated until adding the next one would exceed
    max_words. A single paragraph longer than max_words is always split into
    consecutive windows of max_words words, whether or not other paragraphs
    are already buffered.

    A buffered group shorter than min_words is appended to the previous chunk
    instead of being emitted on its own, so such a merged chunk may exceed
    max_words. The last window of a split paragraph may be shorter than
    min_words.

    Args:
        text: cleaned text; paragraphs are separated by newlines.
        max_words: upper bound on words per chunk (must be >= 1).
        min_words: groups below this size are merged into the previous chunk.

    Returns:
        List of chunk strings (empty list for empty input).

    Raises:
        ValueError: if max_words is less than 1 for non-empty text.
    """
    if not text:
        return []
    if max_words < 1:
        # A non-positive window size would never advance through the words.
        raise ValueError("max_words must be a positive integer")

    paragraphs = [p.strip() for p in text.split('\n') if p.strip()]
    chunks = []
    cur = []
    cur_count = 0

    def flush():
        """Emit the buffered paragraphs, merging a too-short group backwards."""
        nonlocal cur, cur_count
        if cur:
            chunk_text = '\n\n'.join(cur).strip()
            if chunks and len(chunk_text.split()) < min_words:
                chunks[-1] += '\n\n' + chunk_text
            else:
                chunks.append(chunk_text)
        cur = []
        cur_count = 0

    for p in paragraphs:
        words = p.split()
        wcount = len(words)
        if wcount > max_words:
            # Oversized paragraph: close the current group first so ordering
            # is preserved, then emit fixed-size windows.
            flush()
            for start in range(0, wcount, max_words):
                chunks.append(' '.join(words[start:start + max_words]))
            continue
        if cur_count + wcount > max_words:
            flush()
        cur.append(p)
        cur_count += wcount

    # finalize remainder
    flush()
    return chunks

In [17]:
# ----------------------------
# Cell 3 - Build image map from metadata (optional)
# ----------------------------
# Build {title: [image records]} from data/meta/<title>.jsonl files.
# The metadata folder is optional: when it is missing, image_map stays empty.
meta_dir = ROOT / "data" / "meta"
image_map = {}
skipped_lines = 0  # malformed JSON lines, reported below instead of hidden
if meta_dir.exists():
    for mf in sorted(meta_dir.glob("*.jsonl")):
        title = mf.stem
        recs = []
        with open(mf, encoding="utf8") as f:
            for line in f:
                if not line.strip():
                    continue
                try:
                    r = json.loads(line)
                except json.JSONDecodeError:
                    # Best effort: skip the bad line but keep count of it.
                    skipped_lines += 1
                    continue
                # Only JSON objects tagged as images are kept.
                if isinstance(r, dict) and r.get("type") == "image":
                    recs.append(r)
        image_map[title] = recs
print("Loaded image metadata for", len(image_map), "monuments")
if skipped_lines:
    print("WARNING: skipped", skipped_lines, "malformed metadata line(s)")

Loaded image metadata for 3 monuments


In [19]:
# ----------------------------
# Cell 4 - Create chunks.jsonl and preview CSV
# ----------------------------
from pathlib import Path
import csv

RAW_TEXT_SRC = ROOT / "data" / "raw" / "text"
OUT_JSONL = ROOT / "data" / "processed" / "chunks.jsonl"
OUT_CSV = ROOT / "data" / "processed" / "chunks_preview.csv"

max_words = 220
min_words = 60
PREVIEW_LIMIT = 200   # max rows written to the preview CSV
SNIPPET_CHARS = 300   # max characters of chunk text shown per preview row

# Resolve the inputs BEFORE opening the output for writing: opening with "w"
# truncates an existing chunks.jsonl, so a wrong ROOT must not get that far.
txt_files = sorted(RAW_TEXT_SRC.glob("*.txt"))
if not txt_files:
    raise FileNotFoundError(f"No .txt files found in {RAW_TEXT_SRC}; check ROOT before re-running")

count = 0
preview = []
OUT_JSONL.parent.mkdir(parents=True, exist_ok=True)

with open(OUT_JSONL, "w", encoding="utf8") as outf:
    for txt in txt_files:
        title = txt.stem
        raw = txt.read_text(encoding="utf8")
        cleaned = clean_wikipedia_text(raw)
        chunks = chunk_text_by_word_count(cleaned, max_words=max_words, min_words=min_words)
        # Every chunk of an article carries the article's full image list.
        imgs = image_map.get(title, [])
        img_refs = [{"local_path": r.get("local_path"), "caption": r.get("caption"), "image_url": r.get("image_url")} for r in imgs]
        for i, c in enumerate(chunks):
            rec = {
                "id": f"{title}__{i}",
                "title": title,
                "text": c,
                "word_count": len(c.split()),
                "source_file": str(txt),
                "images": img_refs,
                "chunk_index": i
            }
            outf.write(json.dumps(rec, ensure_ascii=False) + "\n")
            count += 1
            if len(preview) < PREVIEW_LIMIT:
                snippet = c[:SNIPPET_CHARS].replace("\n", " ") + ("..." if len(c) > SNIPPET_CHARS else "")
                preview.append({"id": rec["id"], "title": title, "word_count": rec["word_count"], "snippet": snippet})
print("Wrote", count, "chunks to", OUT_JSONL)

with open(OUT_CSV, "w", newline="", encoding="utf8") as cf:
    writer = csv.DictWriter(cf, fieldnames=["id","title","word_count","snippet"])
    writer.writeheader()
    writer.writerows(preview)
print("Preview saved to", OUT_CSV)

Wrote 69 chunks to /Users/jaydobariya/Desktop/RAG Project/data/processed/chunks.jsonl
Preview saved to /Users/jaydobariya/Desktop/RAG Project/data/processed/chunks_preview.csv


In [20]:
# ----------------------------
# Cell 5 - Sanity checks / stats
# ----------------------------
import statistics

# Read the chunk file once, collecting word counts, per-title counts and the
# first record. Blank lines are skipped so they cannot break json.loads.
wcs = []
titles = {}
first = None
with open(OUT_JSONL, encoding="utf8") as f:
    for line in f:
        if not line.strip():
            continue
        rec = json.loads(line)
        if first is None:
            first = rec
        wcs.append(rec.get("word_count", 0))
        titles[rec["title"]] = titles.get(rec["title"], 0) + 1
total = len(wcs)

print("Total chunks:", total)
print("Chunks per title:", titles)
if wcs:
    print("Mean words:", statistics.mean(wcs), "Median:", statistics.median(wcs))

# show first chunk example (guarded: an empty file has no first record)
if first is not None:
    print('\nExample chunk id:', first['id'])
    print(first['text'][:800], "...\n")
else:
    print("\nNo chunks found in", OUT_JSONL)

Total chunks: 69
Chunks per title: {'Qutb Minar': 14, 'Red Fort': 21, 'Taj Mahal': 34}
Mean words: 164.53623188405797 Median: 162

Example chunk id: Qutb Minar__0
The Qutb Minar, also spelled Qutub Minar and Qutab Minar, is a minaret and victory tower, built during the Delhi sultanate, and comprising the Qutb complex, a UNESCO World Heritage Site in Mehrauli, South Delhi, India. It was mostly built between 1199 and 1220, contains 399 steps, and is one of the most-frequented heritage spots in the city. After defeating Prithviraj Chauhan, the last Hindu ruler of Delhi before the Ghurid conquest of the region, Qutab-ud-din Aibak initiated the construction of the victory tower, but only managed to finish the first level. It was to mark the beginning of Islamic rule in the region. Successive dynasties of the Delhi Sultanate continued the construction, and, in 1368, Firuz Shah Tughlaq rebuilt the top parts and added a cupola.

It can be compared to the 62 ...



In [21]:
# ----------------------------
# Cell 6 - (Optional) Link best image per chunk using caption overlap
# ----------------------------
import re

# Function words that appear in nearly every English sentence and caption.
# Counting them makes almost any caption "overlap" with almost any chunk.
_CAPTION_STOPWORDS = frozenset({
    "a", "an", "and", "are", "as", "at", "be", "by", "for", "from", "in",
    "into", "is", "it", "its", "of", "on", "or", "s", "that", "the", "this",
    "to", "was", "were", "which", "with",
})


def best_image_for_chunk(chunk_text, images, stopwords=None):
    """
    Pick the image whose caption shares the most content words with a chunk.

    Text and captions are lower-cased and split on non-word characters; words
    listed in `stopwords` are ignored on both sides. The score is the number
    of distinct shared words. On a tie, the earliest image in `images` wins.

    Args:
        chunk_text: text of the chunk.
        images: list of dicts; the "caption" key is used (missing/None is
            treated as an empty caption).
        stopwords: iterable of lower-case words to ignore. None uses a small
            built-in English list; pass an empty iterable to count every word.

    Returns:
        The best-matching image dict, or None when `images` is empty or no
        caption shares any counted word with the chunk.
    """
    if not images:
        return None
    ignored = _CAPTION_STOPWORDS if stopwords is None else frozenset(stopwords)

    def content_words(text):
        """Return the set of lower-cased, non-ignored words in text."""
        tokens = re.sub(r'\W+', ' ', (text or '').lower()).split()
        return {tok for tok in tokens if tok not in ignored}

    best = None
    best_score = -1
    words = content_words(chunk_text)
    for img in images:
        score = len(words & content_words(img.get('caption')))
        if score > best_score:  # strict: keeps the first image on ties
            best_score = score
            best = img
    return best if best_score > 0 else None

# Re-read chunks.jsonl and write a copy in which every record gains a
# 'best_image' field (None when no caption overlaps with the chunk text).
IN_F = OUT_JSONL
OUT_F = ROOT / "data" / "processed" / "chunks_with_image.jsonl"
linked = 0
with open(IN_F, encoding="utf8") as src, open(OUT_F, "w", encoding="utf8") as dst:
    for raw_line in src:
        record = json.loads(raw_line)
        chosen = best_image_for_chunk(record['text'], record.get('images', []))
        record['best_image'] = chosen
        # Count only records that actually received an image.
        if chosen:
            linked += 1
        dst.write(json.dumps(record, ensure_ascii=False) + "\n")
print('Wrote', OUT_F, '; linked', linked, 'chunks to an image (simple overlap)')

Wrote /Users/jaydobariya/Desktop/RAG Project/data/processed/chunks_with_image.jsonl ; linked 69 chunks to an image (simple overlap)


In [22]:
# ----------------------------
# Cell 7 - Next steps note
# ----------------------------
# Outputs produced by this notebook:
# - data/processed/chunks.jsonl and data/processed/chunks_preview.csv
# - (optional) data/processed/chunks_with_image.jsonl, which ties each chunk
#   to a representative image
# Next: run the embeddings notebook to compute embeddings and build the FAISS index.

print("02-preprocess-chunk script complete.")


02-preprocess-chunk script complete.
