# Jerusalem RAG Explorer - Data Fetching

**By Yotam Nachtomy-Katz** | ID: 211718366 | Haifa University

This notebook demonstrates how to fetch multilingual historical documents from Archive.org for the RAG system.

## 1. Setup and Imports

In [None]:
import time
from pathlib import Path
from typing import Optional
from urllib.parse import quote

import requests

# Archive.org API endpoints
# SEARCH is the advanced-search URL; it is not referenced by the cells visible in this notebook.
SEARCH = "https://archive.org/advancedsearch.php"
# META is a URL prefix: get_metadata() appends the item identifier directly to it.
META = "https://archive.org/metadata/"
# Sent with every HTTP request made below so the client identifies itself to the server.
HEADERS = {"User-Agent": "JerusalemRAG/2.0 (scholarly research)"}

## 2. Curated Manuscript List

We use a curated list of primary sources from the *Recueil des historiens des croisades* collection, which includes Latin, Arabic, Greek, Armenian, and French texts.

In [None]:
# Curated Archive.org items, one row per volume:
# (identifier, language code, short description)
_MANUSCRIPT_ROWS = (
    # Historiens occidentaux (Latin chronicles)
    (
        "RecueilDesHistoriensDesCroisadesOcc4",
        "la",
        "Latin chronicles including William of Tyre",
    ),
    (
        "RecueilDesHistoriensDesCroisadesOccidentaux12",
        "la",
        "Latin chronicles Vol 1-2",
    ),
    (
        "RecueilDesHistoriensDesCroisadesOccidentaux2",
        "la",
        "Latin chronicles Vol 2",
    ),
    # Historiens orientaux (Arabic sources)
    (
        "recueildeshistor01acad",
        "ar",
        "Arabic chronicles Vol 1 - Ibn al-Athir and others",
    ),
    (
        "recueildeshistor02acad",
        "ar",
        "Arabic chronicles Vol 2",
    ),
    # Historiens grecs (Byzantine Greek sources)
    (
        "RecueilDesHistoriensDesCroisadesGrecs1",
        "el",
        "Greek historians Vol 1 - Anna Comnena and others",
    ),
    # Documents arméniens (Armenian sources)
    (
        "RecueilDesHistoriensDesCroisadesDocumentsArmeniensTomePremier",
        "hy",
        "Armenian documents Vol 1",
    ),
    # Assises de Jérusalem (Old French legal texts)
    (
        "AssisesDeJerusalemBeugnotVol1",
        "fr",
        "Laws of the Kingdom of Jerusalem Vol 1 (Old French)",
    ),
)

# Expand each row into the dict record consumed by the rest of the notebook.
CRUSADE_MANUSCRIPTS = [
    {"identifier": identifier, "language": language, "description": description}
    for identifier, language, description in _MANUSCRIPT_ROWS
]

print(f"Total manuscripts to fetch: {len(CRUSADE_MANUSCRIPTS)}")

## 3. Helper Functions

In [None]:
def get_language_name(code: str) -> str:
    """Translate a language code into its English name.

    Codes that have no entry in the lookup table are returned unchanged,
    so callers always get a printable string back.
    """
    known = dict(
        la="Latin",
        ar="Arabic",
        el="Greek",
        fr="French",
        hy="Armenian",
        en="English",
    )
    if code in known:
        return known[code]
    return code


def get_metadata(identifier: str) -> dict:
    """Return the parsed JSON metadata record of an Archive.org item.

    The item identifier is appended to the ``META`` URL prefix and fetched
    with the shared ``HEADERS`` and a 30-second timeout. An HTTP error
    status is turned into an exception by ``raise_for_status``.
    """
    url = META + identifier
    response = requests.get(url, headers=HEADERS, timeout=30)
    response.raise_for_status()
    payload = response.json()
    return payload

## 4. Download Function

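This function downloads a manuscript's OCR text and prepends a metadata header (title, identifier, language, source URL, description) for later processing. It returns the path of the saved file, or `None` if the item could not be fetched or its text is too short to be useful.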

In [None]:
def download_manuscript(
    identifier: str,
    language: str,
    description: str,
    out_dir: Path,
) -> Optional[Path]:
    """Download one Archive.org item's plain-text file and save it with a header.

    The item's metadata is fetched first to discover its files. Among the
    ``.txt`` files, the ``*_djvu.txt`` OCR full text is preferred; if none is
    present the first ``.txt`` file listed is used. The text is written to
    ``out_dir / f"{identifier}.txt"`` preceded by a ``KEY: value`` header and
    a ``---`` separator.

    Args:
        identifier: Archive.org item identifier.
        language: Language code recorded in the header.
        description: Free-text description recorded in the header.
        out_dir: Directory to write into; created if it does not exist.

    Returns:
        Path of the saved file, or ``None`` when the metadata or file request
        fails, the item has no ``.txt`` file, or the text is shorter than
        1000 characters. Progress and failures are reported with ``print``.
    """
    print(f"Fetching: {identifier}...")

    try:
        meta = get_metadata(identifier)
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a non-JSON response body on requests versions
        # whose JSON decode error is not a RequestException subclass.
        print(f"  Failed to get metadata: {e}")
        return None

    # Find text files
    files = meta.get("files", [])
    txt_names = [
        str(f["name"])
        for f in files
        if str(f.get("name", "")).lower().endswith(".txt")
    ]

    if not txt_names:
        print(f"  No text file found for {identifier}")
        return None

    # Prefer the OCR full text (*_djvu.txt) over any other .txt file the
    # item happens to list first; fall back to the first one otherwise.
    fname = next(
        (name for name in txt_names if name.lower().endswith("_djvu.txt")),
        txt_names[0],
    )
    # Percent-encode the path parts so characters such as '#', '?' or '%' in
    # a file name are not misread as URL syntax ('/' is left intact).
    url = f"https://archive.org/download/{quote(identifier)}/{quote(fname)}"

    try:
        r = requests.get(url, headers=HEADERS, timeout=120)
        r.raise_for_status()
    except requests.RequestException as e:
        print(f"  Failed to download: {e}")
        return None

    # Decode as UTF-8 explicitly: when a text/* response declares no charset,
    # r.text falls back to ISO-8859-1, which would corrupt non-Latin scripts.
    # r.text is kept as a fallback for bodies that are not valid UTF-8.
    try:
        text = r.content.decode("utf-8")
    except UnicodeDecodeError:
        text = r.text

    # Skip if too short
    if len(text) < 1000:
        print(f"  Text too short ({len(text)} chars), skipping")
        return None

    # Get title from metadata; the field may be a list of titles, in which
    # case the first is used (or the identifier if the list is empty).
    title = meta.get("metadata", {}).get("title", identifier)
    if isinstance(title, list):
        title = title[0] if title else identifier

    # Build header with metadata
    header = f"""TITLE: {title}
IDENTIFIER: {identifier}
LANGUAGE: {language}
LANGUAGE_NAME: {get_language_name(language)}
SOURCE: Archive.org
URL: https://archive.org/details/{identifier}
DESCRIPTION: {description}

---

"""

    # Save file (make sure the destination exists so the write cannot fail
    # merely because the caller did not create it beforehand)
    out_dir.mkdir(parents=True, exist_ok=True)
    filepath = out_dir / f"{identifier}.txt"
    filepath.write_text(header + text, encoding="utf-8")
    print(f"  Saved: {filepath.name} ({len(text):,} chars)")
    return filepath

## 5. Fetch All Manuscripts

Now let's fetch all the curated manuscripts. This may take several minutes: the OCR text files are large, and the loop pauses for one second between requests to stay polite to Archive.org.

In [None]:
# Destination folder for the raw downloads; missing parent folders are
# created too, and an already-existing folder is left untouched.
out_dir = Path("..") / "data" / "raw" / "archive_v2"
out_dir.mkdir(exist_ok=True, parents=True)

print(f"Output directory: {out_dir.absolute()}")

In [None]:
# Download every curated manuscript in order, collecting the paths of the
# files that were actually written.
saved = []
delay = 1.0  # seconds between requests
total = len(CRUSADE_MANUSCRIPTS)

for position, manuscript in enumerate(CRUSADE_MANUSCRIPTS, start=1):
    # Progress prefix; download_manuscript() continues on the same line.
    print(f"\n[{position}/{total}] ", end="")

    path = download_manuscript(
        identifier=manuscript["identifier"],
        language=manuscript["language"],
        description=manuscript.get("description", ""),
        out_dir=out_dir,
    )
    if path is not None:
        saved.append(path)

    # Pause between items only; nothing follows the last one.
    if position < total:
        time.sleep(delay)

print(f"\n\nFetched {len(saved)}/{total} manuscripts")

## 6. Summary by Language

In [None]:
# Tally the curated list per language code. Note that this counts the
# entries in CRUSADE_MANUSCRIPTS, not the files that were downloaded.
lang_counts = {}
for entry in CRUSADE_MANUSCRIPTS:
    code = entry["language"]
    lang_counts.setdefault(code, 0)
    lang_counts[code] += 1

print("Manuscripts by language:")
for code in lang_counts:
    print(f"  {get_language_name(code)}: {lang_counts[code]}")

## 7. Preview a Downloaded File

In [None]:
# Show the header and opening text of the first file that was saved;
# nothing is printed when no download succeeded.
if len(saved) > 0:
    sample_file = saved[0]
    content = sample_file.read_text(encoding="utf-8")
    print(
        f"Preview of {sample_file.name}:",
        "=" * 60,
        content[:2000],  # first 2000 characters only
        "...",
        sep="\n",
    )

## Next Steps

After fetching documents, proceed to:
1. **02_ingestion.ipynb** - Chunk, translate, and embed the documents
2. **03_retrieval.ipynb** - Query the index and generate answers