# 01 - Data ingest: Wikipedia + Wikimedia Commons
Goal: download raw page text and up to N images per monument, save metadata (license & captions).
Seed monuments: Taj Mahal, Qutub Minar, Red Fort, Delhi.

In [3]:
# Cell 2 — folders and monument list
from pathlib import Path
import os, json

ROOT = Path.cwd()
PARENT = ROOT.parent

RAW_TEXT = PARENT / "data" / "raw" / "text"
RAW_IMAGES = PARENT / "data" / "raw" / "images"
META = PARENT / "data" / "meta"
PROCESSED = PARENT / "data" / "processed"

for p in [RAW_TEXT, RAW_IMAGES, META, PROCESSED]:
    p.mkdir(parents=True, exist_ok=True)

# Seed monuments - edit later to add more
MONUMENTS = [
    "Taj Mahal",
    "Qutub Minar",
    "Red Fort, Delhi"
]

print("Created directories. Monuments:", MONUMENTS)
print("RAW_TEXT:", RAW_TEXT)
print("RAW_IMAGES:", RAW_IMAGES)
print("META:", META)
print("PROCESSED:", PROCESSED)


Created directories. Monuments: ['Taj Mahal', 'Qutub Minar', 'Red Fort, Delhi']
RAW_TEXT: /Users/jaydobariya/Desktop/RAG Project/data/raw/text
RAW_IMAGES: /Users/jaydobariya/Desktop/RAG Project/data/raw/images
META: /Users/jaydobariya/Desktop/RAG Project/data/meta
PROCESSED: /Users/jaydobariya/Desktop/RAG Project/data/processed


In [4]:
# Cell 3 — helpers
import requests, time
from urllib.parse import unquote
from tqdm import tqdm

USER_AGENT = "rag-monuments-bot/0.1 (your-email@example.com)"
session = requests.Session()
session.headers.update({"User-Agent": USER_AGENT})

WIKI_API = "https://en.wikipedia.org/w/api.php"
COMMONS_API = "https://commons.wikimedia.org/w/api.php"
SLEEP = 0.6  # polite sleep

def sanitize_filename(s):
    return "".join(c if (c.isalnum() or c in "._- ") else "_" for c in s).strip()

def safe_get(url, params=None, timeout=20):
    r = session.get(url, params=params, timeout=timeout)
    time.sleep(SLEEP)
    r.raise_for_status()
    return r

In [5]:
# Cell 4 — fetch page text
def fetch_wikipedia_text(title):
    params = {"action":"query","prop":"extracts","explaintext":1,"titles":title,"format":"json","redirects":1}
    r = safe_get(WIKI_API, params=params)
    j = r.json()
    pages = j.get("query", {}).get("pages", {})
    for pid, pg in pages.items():
        return pg.get("title"), pg.get("extract", "")
    return title, ""


In [6]:
# Cell 5 — fetch image file titles from a wiki page
def fetch_image_file_titles(title):
    params = {"action":"query","prop":"images","titles":title,"format":"json","imlimit":"max","redirects":1}
    r = safe_get(WIKI_API, params=params)
    j = r.json()
    files = []
    pages = j.get("query", {}).get("pages", {})
    for pid, pg in pages.items():
        for im in pg.get("images", []) or []:
            if im.get("title", "").lower().startswith("file:"):
                files.append(im["title"])
    return files

In [7]:
# Cell 6 — get commons image info (url, extmetadata)
def get_commons_image_info(file_title):
    params = {"action":"query","titles":file_title,"prop":"imageinfo","iiprop":"url|extmetadata|size|mime","format":"json"}
    r = safe_get(COMMONS_API, params=params)
    j = r.json()
    for pid, pg in j.get("query", {}).get("pages", {}).items():
        if "imageinfo" in pg:
            return pg["imageinfo"][0]
    return None

def extract_caption_from_extmetadata(extmetadata):
    if not extmetadata:
        return ""
    for key in ("ImageDescription", "ObjectName", "ImageDescriptionPlain"):
        if key in extmetadata:
            v = extmetadata[key]
            if isinstance(v, dict):
                return v.get("value","").strip()
            return str(v).strip()
    return ""


In [None]:
# Cell 7 — main ingest loop: save text, download up to max_images images per page, write metadata
from pathlib import Path
import json, os

MAX_IMAGES_PER_PAGE = 40

results = []

for title in tqdm(MONUMENTS, desc="Monuments"):
    try:
        canonical_title, text = fetch_wikipedia_text(title)
        safe_name = sanitize_filename(canonical_title)
        # Save raw text
        txt_path = RAW_TEXT / (safe_name + ".txt")
        txt_path.write_text(text or "", encoding="utf8")
        # Find image files referenced on page
        img_files = fetch_image_file_titles(canonical_title)
        # Prepare image folder
        img_folder = RAW_IMAGES / safe_name
        img_folder.mkdir(parents=True, exist_ok=True)
        meta_records = []
        # Page metadata record
        page_rec = {"type":"page", "title": canonical_title, "text_file": str(txt_path), "word_count": len((text or "").split())}
        meta_records.append(page_rec)
        downloaded = 0
        for ft in img_files:
            if downloaded >= MAX_IMAGES_PER_PAGE:
                break
            try:
                info = get_commons_image_info(ft)
                if not info or not info.get("url"): 
                    continue
                url = info.get("url")
                fname = ft.split(":",1)[-1]
                fname = sanitize_filename(unquote(fname))
                local_path = img_folder / fname
                # download if missing
                if not local_path.exists():
                    resp = session.get(url, stream=True, timeout=40)
                    time.sleep(SLEEP)
                    resp.raise_for_status()
                    with open(local_path, "wb") as f:
                        for chunk in resp.iter_content(1024*8):
                            if chunk:
                                f.write(chunk)
                caption = extract_caption_from_extmetadata(info.get("extmetadata", {}))
                rec = {
                    "type":"image",
                    "monument_title": canonical_title,
                    "file_title": ft,
                    "image_url": url,
                    "local_path": str(local_path),
                    "caption": caption,
                    "mime": info.get("mime"),
                    "width": info.get("width"),
                    "height": info.get("height"),
                    "extmetadata_keys": list(info.get("extmetadata", {}).keys())
                }
                meta_records.append(rec)
                downloaded += 1
            except Exception as e:
                print("  warn: skip image", ft, e)
                continue
        # write per-monument meta jsonl (page first then images)
        meta_file = META / (safe_name + ".jsonl")
        with open(meta_file, "w", encoding="utf8") as mf:
            for r in meta_records:
                mf.write(json.dumps(r, ensure_ascii=False) + "\n")
        results.append({"title": canonical_title, "text_file": str(txt_path), "meta_file": str(meta_file), "images_downloaded": downloaded})
        print(f"Saved {canonical_title}: text -> {txt_path}, images -> {downloaded}")
    except Exception as e:
        print("Error ingesting", title, e)


Monuments:   0%|          | 0/3 [00:00<?, ?it/s]

  warn: skip image File:Allah-green.svg ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))
  warn: skip image File:Commons-logo.svg HTTPSConnectionPool(host='commons.wikimedia.org', port=443): Read timed out. (read timeout=20)
  warn: skip image File:Detail of plant motifs on Taj Mahal wall.jpg HTTPSConnectionPool(host='upload.wikimedia.org', port=443): Read timed out. (read timeout=40)
  warn: skip image File:Taj Mahal - Mausoleum der Liebe (CC BY-SA 4.0).webm HTTPSConnectionPool(host='upload.wikimedia.org', port=443): Read timed out.
  warn: skip image File:Taj Mahal 9794.jpg ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))


Monuments:  33%|███▎      | 1/3 [05:28<10:56, 328.05s/it]

Saved Taj Mahal: text -> /Users/jaydobariya/Desktop/RAG Project/data/raw/text/Taj Mahal.txt, images -> 35


Monuments:  67%|██████▋   | 2/3 [06:36<02:55, 175.63s/it]

Saved Qutb Minar: text -> /Users/jaydobariya/Desktop/RAG Project/data/raw/text/Qutb Minar.txt, images -> 20


Monuments: 100%|██████████| 3/3 [09:08<00:00, 182.83s/it]

Saved Red Fort: text -> /Users/jaydobariya/Desktop/RAG Project/data/raw/text/Red Fort.txt, images -> 33





FileNotFoundError: [Errno 2] No such file or directory: 'data/ingest_summary.json'

In [16]:
# Cell 8 — summary / quick checks
from pathlib import Path
import glob
print("Text files:", len(list(RAW_TEXT.glob("*.txt"))))
print("Meta files:", len(list(META.glob("*.jsonl"))))
img_count = sum(1 for _ in RAW_IMAGES.rglob("*.*"))
print("Image files total:", img_count)

Text files: 3
Meta files: 3
Image files total: 89


In [20]:
# Cell 9 — light text cleaning and save processed copy
import re
from pathlib import Path

def clean_wikipedia_text(text):
    if not text:
        return ""
    text = re.sub(r'\[\s*[0-9a-zA-Z.,:;–—#\s-]{1,30}\s*\]', '', text)
    text = re.sub(r'\(citation needed\)', '', text, flags=re.IGNORECASE)
    # cut at common tail sections
    for marker in ["== References ==", "==References==", "== External links ==", "==External links==", "== See also ==", "==See also=="]:
        idx = text.find(marker)
        if idx != -1:
            text = text[:idx]
    text = re.sub(r'=+', ' ', text)
    text = text.replace("&nbsp;", " ").replace("&amp;", "&")
    text = re.sub(r'\n\s*\n+', '\n\n', text)
    text = re.sub(r'[ \t]+', ' ', text)
    return text.strip()

PROCESSED = Path("../data/processed")
PROCESSED.mkdir(parents=True, exist_ok=True)

for txt in RAW_TEXT.glob("*.txt"):
    raw = txt.read_text(encoding="utf8")
    cleaned = clean_wikipedia_text(raw)
    out = PROCESSED / txt.name  # same filename
    out.write_text(cleaned, encoding="utf8")
    print("Cleaned:", txt.name, "->", out)
print("Processed text saved to data/processed/")


Cleaned: Qutb Minar.txt -> ../data/processed/Qutb Minar.txt
Cleaned: Red Fort.txt -> ../data/processed/Red Fort.txt
Cleaned: Taj Mahal.txt -> ../data/processed/Taj Mahal.txt
Processed text saved to data/processed/
