In [4]:
!pip install PyMuPDF

Collecting PyMuPDF
  Downloading pymupdf-1.26.4-cp39-abi3-win_amd64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.4-cp39-abi3-win_amd64.whl (18.7 MB)
   ---------------------------------------- 0.0/18.7 MB ? eta -:--:--
    --------------------------------------- 0.3/18.7 MB ? eta -:--:--
   - -------------------------------------- 0.8/18.7 MB 2.1 MB/s eta 0:00:09
   -- ------------------------------------- 1.0/18.7 MB 1.9 MB/s eta 0:00:10
   -- ------------------------------------- 1.3/18.7 MB 2.1 MB/s eta 0:00:09
   --- ------------------------------------ 1.8/18.7 MB 1.9 MB/s eta 0:00:10
   ---- ----------------------------------- 2.1/18.7 MB 1.8 MB/s eta 0:00:10
   ------ --------------------------------- 2.9/18.7 MB 2.0 MB/s eta 0:00:08
   ------- -------------------------------- 3.7/18.7 MB 2.3 MB/s eta 0:00:07
   --------- ------------------------------ 4.5/18.7 MB 2.5 MB/s eta 0:00:06
   ---------- ----------------------------- 5.0/18.7 MB 2.5 MB/s eta 0:00:06
   ------------ 

In [5]:
import re
import json
import hashlib
import time
from pathlib import Path
import fitz  # PyMuPDF

In [6]:
def slugify(s):
    """Turn free text into an ID-safe slug.

    Surrounding whitespace is removed, every run of characters outside
    ``A-Z``, ``a-z`` and ``0-9`` collapses to a single hyphen, and hyphens
    left at either end are trimmed.
    """
    trimmed = s.strip()
    hyphenated = re.sub(r'[^A-Za-z0-9]+', '-', trimmed)
    return hyphenated.strip('-')

In [7]:
def sha256_file(path):
    """Return the hexadecimal SHA-256 digest of the file at ``path``.

    The file is opened in binary mode and consumed in 1 MiB blocks, so the
    whole file is never held in memory at once.
    """
    block_size = 1 << 20
    digest = hashlib.sha256()
    with open(path, 'rb') as stream:
        while True:
            block = stream.read(block_size)
            if not block:  # empty read means end of file
                break
            digest.update(block)
    return digest.hexdigest()

In [8]:
def chunk_text(text, max_chars=2200, overlap=0):
    """Split text into consecutive windows of at most ``max_chars`` characters.

    Args:
        text: The string to split.
        max_chars: Maximum window width in characters; must be positive.
        overlap: Number of characters each window shares with the previous
            one; must satisfy ``0 <= overlap < max_chars``.

    Yields:
        ``(start, end, chunk)`` tuples. ``start``/``end`` are offsets of the
        window in ``text``; ``chunk`` is ``text[start:end]`` with surrounding
        whitespace stripped, so it may be shorter than ``end - start`` (and
        is an empty string for an all-whitespace window).

    Raises:
        ValueError: If ``max_chars`` or ``overlap`` is out of range. Because
            this is a generator, the error surfaces on first iteration.
    """
    if max_chars <= 0:
        raise ValueError(f"max_chars must be positive, got {max_chars}")
    if not 0 <= overlap < max_chars:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < max_chars, got {overlap}"
        )

    total = len(text)
    start = 0
    while start < total:
        end = min(start + max_chars, total)
        yield (start, end, text[start:end].strip())
        if end == total:
            # Final window emitted; stepping back by `overlap` here would
            # only re-emit a suffix of text that is already covered.
            break
        # Step back so the next window shares `overlap` characters with this
        # one. overlap < max_chars guarantees start still moves forward.
        start = end - overlap

In [9]:
def make_ids(country, visa_type, year, doc_slug, page_num, seq):
    """Build the document ID and the per-chunk ID.

    Args:
        country: Country code/name, inserted as given.
        visa_type: Visa type label; passed through ``slugify``.
        year: Year component, inserted as given.
        doc_slug: Document label; also passed through ``slugify`` so that a
            human-readable title (with spaces or punctuation) cannot leak
            separators into the IDs.
        page_num: Page number embedded in the chunk ID after ``Pg``.
        seq: Integer sequence number, zero-padded to three digits.

    Returns:
        ``(doc_id, chunk_id)`` where ``chunk_id`` is ``doc_id`` followed by
        ``-Pg<page_num>-<seq>``.
    """
    doc_id = f"{country}-{slugify(visa_type)}-{year}-{slugify(str(doc_slug))}"
    chunk_id = f"{doc_id}-Pg{page_num}-{seq:03d}"
    return doc_id, chunk_id

In [10]:
def pdf_to_chunks(pdf_path, meta, out_path):
    """Extract text from a PDF, chunk it per page, and write the chunks to disk.

    Each page's plain text is split with ``chunk_text`` (default settings) and
    every chunk is written as one JSON object per line (JSON Lines) to
    ``out_path``, UTF-8 encoded. Parent directories of ``out_path`` are
    created if missing. The chunk sequence number runs across the whole
    document, not per page.

    Args:
        pdf_path: Path to the source PDF.
        meta: Mapping that must provide ``country``, ``visa_type``, ``year``,
            ``doc_slug``, ``source``, ``url``, ``effective_date`` and
            ``version``. It is read only; the file hash is stored in each
            chunk record rather than written back into ``meta``.
        out_path: Destination file for the chunk records.

    Raises:
        FileNotFoundError: If ``pdf_path`` does not exist.
        KeyError: If ``meta`` lacks one of the required keys (raised when the
            first chunk record is built).
    """
    pdf_path = Path(pdf_path)
    if not pdf_path.exists():
        raise FileNotFoundError(f"PDF file not found: {pdf_path}")

    doc_sha = sha256_file(pdf_path)
    # Stamp the run once so every chunk of this document carries the same
    # retrieval time instead of drifting while the loop runs.
    retrieved_at = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())

    all_chunks = []
    seq = 1

    # The context manager closes the document handle even if extraction fails.
    with fitz.open(str(pdf_path)) as doc:
        for page_index in range(len(doc)):
            page_num = page_index + 1  # stored page numbers are 1-based
            text = doc[page_index].get_text("text")
            for cs, ce, chunk in chunk_text(text):
                doc_id, chunk_id = make_ids(
                    meta["country"], meta["visa_type"], meta["year"],
                    meta["doc_slug"], page_num, seq,
                )
                chunk_meta = {
                    "chunk_id": chunk_id,
                    "doc_id": doc_id,
                    "source": meta["source"],
                    "url": meta["url"],
                    "country": meta["country"],
                    "visa_type": meta["visa_type"],
                    "effective_date": meta["effective_date"],
                    "version": meta["version"],
                    "doc_sha256": doc_sha,
                    "retrieved_at": retrieved_at,
                    "page": page_num,
                    "pages": [page_num],
                    "section_title": None,  # Optional: detect from headings
                    "language": "en",
                    # Offsets are relative to the page text, not the document.
                    "char_start": cs,
                    "char_end": ce,
                    "text": chunk
                }
                all_chunks.append(chunk_meta)
                seq += 1

    out_path = Path(out_path)
    out_path.parent.mkdir(parents=True, exist_ok=True)

    # Written only after extraction succeeds, so a failure above does not
    # leave a partial output file from this run.
    with open(out_path, "w", encoding="utf-8") as f:
        for ch in all_chunks:
            f.write(json.dumps(ch, ensure_ascii=False) + "\n")

    print(f"✅ Saved {len(all_chunks)} chunks to {out_path}")

In [15]:
if __name__ == "__main__":
    # NOTE(review): absolute, machine-specific paths; consider a configurable
    # base directory so the notebook runs on another machine.
    pdf_file = r"C:\Users\ASUS\OneDrive\Desktop\Student+and+Child+Student.pdf"
    # NOTE(review): pdf_to_chunks writes one JSON object per line, so this
    # file is JSON Lines despite the .json extension.
    output_file = r"C:\Users\ASUS\OneDrive\Desktop\processed\UK_StudentVisa_chunks.json"

    # Document-level metadata copied into every chunk record.
    # NOTE(review): year is "2025" while doc_slug mentions 2024; confirm
    # which one is intended.
    meta_info = dict(
        country="UK",
        visa_type="Student and Child Student",
        year="2025",
        doc_slug="UK Student Visa Guide 2024",
        source="Student and Child Student",
        url="https://gov.uk/student-visa",
        effective_date="2025-07-16",
        version="11.0",
    )

    pdf_to_chunks(pdf_file, meta_info, output_file)

✅ Saved 144 chunks to C:\Users\ASUS\OneDrive\Desktop\processed\UK_StudentVisa_chunks.json
