In a terminal:
conda init
conda activate china_dir

In [None]:
%pip install -U PyMuPDF
%pip install -U ollama

In [12]:
import fitz  # PyMuPDF
import json
import sys
import os
import re
import ollama

In [2]:
# Show the kernel's working directory; relative paths used below
# (e.g. "output.json" and "page_cache") are resolved against it.
!pwd

/Users/lukasfiller/dev/china_directory


In [None]:
# --- START OF CONFIGURATION ---
# Everything a run normally needs to tune lives in this cell.

# Model / Ollama server settings
MODEL_NAME = "llama3.1:8b-instruct-fp16"
OLLAMA_HOST = "http://127.0.0.1:11434"
OLLAMA_CLIENT_TIMEOUT = 600  # Seconds; used as the timeout of the Ollama client

# Input PDF and final output file
# NOTE(review): PDF_PATH is an absolute, machine-specific path - adjust it before running elsewhere.
PDF_PATH = "/Users/lukasfiller/dev/china_directory/China Directory 2024-2p5-50.pdf"
OUTPUT_PATH = "output.json"

# When True, the JSON extracted from each page is printed during the run
DEBUG_PRINT_JSON = True

# --- Page Range Control (for batch processing) ---
# Both values are 1-indexed and END_PAGE is inclusive.
#   START_PAGE = 1,    END_PAGE = 10    -> process pages 1-10
#   START_PAGE = 11,   END_PAGE = 20    -> process pages 11-20
#   START_PAGE = None, END_PAGE = None  -> process the entire document
START_PAGE = 1
END_PAGE = 5
# --- END OF CONFIGURATION ---

# System prompt sent with every page (used as the 'system' message in
# get_json_from_llm). It describes the directory's three-column layout, the
# JSON schema the model must return, and the parsing rules for parenthetical
# metadata, dates and cross-reference symbols.
# NOTE(review): the prompt asks for a top-level JSON array, while the request is
# made with format='json' and the caller also accepts a single object - confirm
# which shape the model actually returns.
SYSTEM_PROMPT = """
You are an expert AI data extraction agent. Your sole purpose is to analyze a single page from a specific multi-lingual PDF document about government personnel and convert its contents into a structured JSON array.

The source is a formal personnel directory with a three-column layout. The left column contains English titles and Pinyin names. The middle column contains Chinese/Japanese titles and names, often with parenthetical metadata. The right column contains dates.

OUTPUT: Single valid JSON array following this schema:
[
  {
    "source_pdf_page": integer,
    "organization_name_english": "string",
    "organization_name_chinese": "string", 
    "metadata": {
      "establishment_date": "string | null",
      "list_order_note": "string | null",
      "count": "integer | null"
    },
    "positions": [
      {
        "title_english": "string",
        "title_chinese": "string",
        "personnel": [
          {
            "name_pinyin": "string",
            "name_chinese": "string",
            "raw_cn_jp_entry": "string",
            "assumed_office_date": "string | null",
            "birth_year": "integer | null",
            "birth_month": "integer | null", 
            "birth_day": "integer | null",
            "cross_reference_symbol": "string | null",
            "gender": "string | null",
            "ethnicity": "string | null",
            "rank_english": "string | null",
            "rank_chinese": "string | null",
            "other_notes_en": [],
            "other_notes_cn_jp": []
          }
        ]
      }
    ],
    "sub_organizations": []
  }
]

CRITICAL RULES:
1. Data in parentheses () next to names is EXCLUSIVELY personal metadata (birth date, rank, gender, etc.)
2. Date in far right column is EXCLUSIVELY the assumed_office_date
3. Extract symbols ☆, ※, ◎, ○ from names to cross_reference_symbol field
4. Parse (f) or (女) -> gender: "female", default is "male"
5. Parse (Gen) or (上将) -> rank fields
6. Parse (Tujia) or (土家族) -> ethnicity, default is "Han"
7. Always include raw_cn_jp_entry with complete unmodified Chinese/Japanese string
8. Handle one-to-many mappings where one English name maps to multiple Chinese names

Return ONLY the JSON array, no explanations.
"""

In [None]:
# In Cell 3
def preprocess_pdf_page(page, col_boundaries=(0.33, 0.67), line_tolerance=5):
    """
    Extracts text from a PDF page and reconstructs the three-column layout
    (English/Pinyin, Chinese/Japanese, Dates) with proper column association.

    Args:
        page: A page object exposing ``get_text("words")`` (a sequence of
            tuples whose first five items are ``x0, y0, x1, y1, text``) and
            ``rect.width``.
        col_boundaries: Two fractions of the page width. A word whose left
            edge lies before the first fraction goes to the EN column, before
            the second to the CN column, otherwise to the DATE column.
        line_tolerance: Vertical bucket size, in page units, used to decide
            that words belong to the same visual line.

    Returns:
        One text line per visual line, formatted as
        ``"EN: ... | CN: ... | DATE: ..."`` with empty columns omitted, joined
        by newlines. Returns ``""`` when the page has no words.
    """
    words = page.get_text("words")
    if not words:
        return ""

    page_width = page.rect.width
    col1_boundary = page_width * col_boundaries[0]  # English/Pinyin ends here
    col2_boundary = page_width * col_boundaries[1]  # Chinese/Japanese ends here
    # Everything to the right of col2_boundary is treated as the date column.

    # Group words into visual lines (by rounded y) and columns (by x).
    # The (y, x) pre-sort only makes tie-breaking deterministic; it is done on
    # a copy so the list returned by the page object is not reordered.
    lines = {}
    for w in sorted(words, key=lambda w: (w[1], w[0])):
        x0, y0, x1, y1, text = w[:5]
        y_key = round(y0 / line_tolerance) * line_tolerance  # Group nearby lines
        columns = lines.setdefault(y_key, {'col1': [], 'col2': [], 'col3': []})

        if x0 < col1_boundary:
            column_name = 'col1'
        elif x0 < col2_boundary:
            column_name = 'col2'
        else:
            column_name = 'col3'
        # Keep x0 so the words can be put back in reading order below.
        columns[column_name].append((x0, text))

    # Reconstruct with proper column indicators
    processed_text = []
    for y_key in sorted(lines):
        line_parts = []
        for label, column_name in (("EN", 'col1'), ("CN", 'col2'), ("DATE", 'col3')):
            # Words sharing a bucket can have slightly different y0 values
            # (symbols, mixed fonts). Ordering by y first would scramble them,
            # so within one line and column the order is strictly left to right.
            ordered = sorted(lines[y_key][column_name], key=lambda item: item[0])
            column_text = " ".join(text for _, text in ordered).strip()
            if column_text:
                line_parts.append(f"{label}: {column_text}")

        # Only add non-empty lines
        if line_parts:
            processed_text.append(" | ".join(line_parts))

    return "\n".join(processed_text)

# In Cell 3
def get_json_from_llm(page_text, model_name, host, page_num):
    """
    Ask the Ollama server at `host` to convert one pre-processed page to JSON.

    The page text is sent as the user message (prefixed with its page number)
    together with SYSTEM_PROMPT as the system message. The reply is parsed as
    JSON and every organization dict is stamped with `page_num` under
    'source_pdf_page'.

    Returns:
        A list of parsed items (a single JSON object is wrapped in a
        one-element list), or None if the request, the parsing, or the type
        check fails. Failures are printed, not raised.
    """
    print(f"    -> Sending {len(page_text)} characters to model '{model_name}'...")

    # Prefix the page number so the model has it as context
    user_prompt = f"PAGE {page_num}:\n{page_text}"

    try:
        llm_client = ollama.Client(host=host, timeout=OLLAMA_CLIENT_TIMEOUT)
        reply = llm_client.chat(
            model=model_name,
            messages=[
                {'role': 'system', 'content': SYSTEM_PROMPT},
                {'role': 'user', 'content': user_prompt},
            ],
            options={'temperature': 0.0},
            format='json',
        )

        raw_content = reply['message']['content']
        print(f"    -> Raw response (first 200 chars): {raw_content[:200]}")

        decoded = json.loads(raw_content)

        # A lone object is treated as a single organization.
        if isinstance(decoded, dict):
            decoded['source_pdf_page'] = page_num
            return [decoded]

        # For an array, overwrite the page number on every dict entry.
        if isinstance(decoded, list):
            for entry in decoded:
                if isinstance(entry, dict):
                    entry['source_pdf_page'] = page_num
            return decoded

        print(f"   - ERROR: Unexpected JSON type: {type(decoded)}")
        return None

    except Exception as exc:
        print(f"   - ERROR: {exc}")
        return None




# In Cell 3
def stitch_json_results(all_pages_data):
    """
    Merges the per-page lists of organization dicts into a single list in
    which an organization that continues across a page break appears once.

    An organization is treated as a continuation when its
    'organization_name_english' equals that of the most recently kept
    organization. Its positions are appended to the earlier one; if the first
    new position has the same 'title_english' as the last existing position,
    their personnel lists are joined instead of creating a duplicate position.
    Sub-organizations are appended as well.

    Args:
        all_pages_data: List with one entry per page; each entry is a list of
            organization dicts, or a falsy value for pages without data.

    Returns:
        A new list of organization dicts. The input is not modified, so the
        function can be called repeatedly on the same data.
    """
    import copy  # function-scope import: deep copies keep the caller's data untouched

    def _ensure_list(container, key):
        """Return container[key] as a list, replacing a missing or non-list value by []."""
        if not isinstance(container.get(key), list):
            container[key] = []
        return container[key]

    if not all_pages_data:
        return []

    final_data = []
    for i, page_data in enumerate(all_pages_data):
        if not page_data:
            continue

        for org_data in page_data:
            # Skip anything that is not an organization dict with a name.
            if not isinstance(org_data, dict) or 'organization_name_english' not in org_data:
                # Prefer the page number recorded in the data; the list index is
                # only a fallback because failed or skipped pages shift it.
                page_label = i + 1
                if isinstance(org_data, dict) and org_data.get('source_pdf_page') is not None:
                    page_label = org_data['source_pdf_page']
                print(f"  - WARNING: Skipping malformed organization data on page {page_label}. Data was: {org_data}")
                continue

            org_copy = copy.deepcopy(org_data)
            last_org = final_data[-1] if final_data else None
            is_continuation = (
                last_org is not None
                and last_org.get('organization_name_english') == org_copy.get('organization_name_english')
            )
            if not is_continuation:
                final_data.append(org_copy)
                continue

            new_positions = org_copy.get('positions')
            if isinstance(new_positions, list) and new_positions:
                # The earlier organization may lack 'positions' (or hold null).
                merged_positions = _ensure_list(last_org, 'positions')
                same_title = (
                    bool(merged_positions)
                    and isinstance(merged_positions[-1], dict)
                    and isinstance(new_positions[0], dict)
                    and merged_positions[-1].get('title_english') == new_positions[0].get('title_english')
                )
                if same_title:
                    # The position itself continues across the page break.
                    _ensure_list(merged_positions[-1], 'personnel').extend(
                        new_positions[0].get('personnel') or []
                    )
                    merged_positions.extend(new_positions[1:])
                else:
                    merged_positions.extend(new_positions)

            new_subs = org_copy.get('sub_organizations')
            if isinstance(new_subs, list) and new_subs:
                _ensure_list(last_org, 'sub_organizations').extend(new_subs)

    return final_data

# In Cell 3, after the stitch_json_results function
def save_data_to_file(data, filepath):
    """
    Saves the provided data structure to a JSON file (UTF-8, non-ASCII kept
    as-is, indented by two spaces).

    The data is first written to a sibling "<filepath>.tmp" file and then moved
    over `filepath`, so a failure while serializing or writing leaves an
    existing file at `filepath` untouched instead of truncated.

    Errors are reported with a printed warning and are not raised.
    """
    import os  # function-scope import keeps the helper self-contained

    tmp_path = os.fspath(filepath) + ".tmp"
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        # Replace the target only after the full document was written.
        os.replace(tmp_path, filepath)
    except Exception as e:
        print(f"  - CRITICAL WARNING: Failed to save data to {filepath}. Error: {e}")
        # Best-effort cleanup of the partial temporary file.
        try:
            os.remove(tmp_path)
        except OSError:
            pass

In [None]:
# In Cell 4 (Execution)
# Run setup: announce the configuration, prepare the per-page cache directory
# and check that the Ollama server is reachable. Defines `all_pages_data`,
# `CACHE_DIR` and `ollama_available` for the processing code that follows.
import time

print(f"Starting processing for '{PDF_PATH}'...")
print(f"Using model: '{MODEL_NAME}' at host '{OLLAMA_HOST}'")
print("-" * 50)

# This will hold the JSON from each page, in page order.
all_pages_data = []

# Resume strategy: the result of each page is stored as its own file in a
# cache directory, so an interrupted run can be continued page by page.
CACHE_DIR = "page_cache"
os.makedirs(CACHE_DIR, exist_ok=True)

# An existing output file is not read back; say so instead of claiming to
# resume from it.
if os.path.exists(OUTPUT_PATH):
    print(f"Found existing output file at '{OUTPUT_PATH}'. It is not used for resuming; cached pages in '{CACHE_DIR}' are.")

# --- OLLAMA SERVER PRE-CHECK ---
print(f"Attempting to connect to Ollama server at {OLLAMA_HOST}...")
try:
    client = ollama.Client(host=OLLAMA_HOST, timeout=10) # Short timeout for pre-check
    client.list() # A lightweight command to check connectivity
    print("  -> Successfully connected to Ollama server.")
except Exception as e:
    print(f"  - FATAL ERROR: Could not connect to Ollama server at {OLLAMA_HOST}.")
    print(f"  - Error details: {e}")
    print("  - Please ensure Ollama is running and accessible.")
    ollama_available = False
else:
    ollama_available = True

# In Cell 4 (Execution) - page-range selection, per-page extraction, final stitching
#
# For every page in the configured range:
#   1. reuse the cached result in CACHE_DIR if there is one,
#   2. otherwise pre-process the page text and ask the LLM for JSON,
#   3. write the result to the cache (null marks a failed extraction that is
#      retried on the next run, an empty list marks a page with nothing to extract).
# Afterwards the per-page results are stitched together and written to OUTPUT_PATH.
if not ollama_available:
    print("Skipped PDF processing as Ollama server was not available.")
elif not os.path.exists(PDF_PATH):
    print(f"FATAL ERROR: PDF file not found at '{PDF_PATH}'")
else:
    doc = fitz.open(PDF_PATH)
    try:
        num_pages = len(doc)

        # --- Page Range Logic ---
        if START_PAGE is None and END_PAGE is None:
            # Process entire document
            start_page_idx = 0
            end_page_idx = num_pages
            print(f"Processing entire document: {num_pages} pages")
        else:
            # Process specific range (START_PAGE is 1-indexed, END_PAGE is inclusive)
            start_page_idx = (START_PAGE - 1) if START_PAGE is not None else 0
            end_page_idx = END_PAGE if END_PAGE is not None else num_pages

            # Clamp the range to the document
            start_page_idx = max(start_page_idx, 0)
            end_page_idx = min(end_page_idx, num_pages)

            if start_page_idx >= end_page_idx:
                print(f"ERROR: Invalid page range. START_PAGE ({START_PAGE}) must not be greater than END_PAGE ({END_PAGE}) and must lie within the document.")
                print(f"Document has {num_pages} pages.")
                start_page_idx = 0
                end_page_idx = 0
            else:
                pages_to_process = end_page_idx - start_page_idx
                print(f"Processing pages {start_page_idx + 1} to {end_page_idx} ({pages_to_process} pages) out of {num_pages} total pages")

        if start_page_idx >= end_page_idx:
            print("No pages to process. Exiting.")
        else:
            total_start_time = time.time()
            successful_pages_count = 0

            # --- MAIN LOOP WITH INCREMENTAL SAVING ---
            for i in range(start_page_idx, end_page_idx):
                page_num = i + 1
                page_cache_path = os.path.join(CACHE_DIR, f"page_{page_num}.json")

                print(f"\n--- Processing Page {page_num} of {num_pages} (batch: {i - start_page_idx + 1}/{end_page_idx - start_page_idx}) ---")

                # --- RESUME LOGIC: Check if page is already cached ---
                if os.path.exists(page_cache_path):
                    print(f"  - Result: Page {page_num} already processed. Loading from cache.")
                    try:
                        with open(page_cache_path, 'r', encoding='utf-8') as f:
                            cached_data = json.load(f)
                    except json.JSONDecodeError:
                        print("  - WARNING: Cache file is corrupted. Re-processing.")
                    except Exception as e:
                        print(f"  - WARNING: Could not load from cache ({e}). Re-processing.")
                    else:
                        if cached_data is None:
                            # null marks a previous failure: fall through and retry
                            print("  - Cached file indicates a previous failure. Re-processing.")
                        elif not cached_data:
                            # An empty result carries no data; do not count it as a success.
                            print("  - Cached result is empty (nothing was extracted from this page). Skipping.")
                            continue
                        else:
                            print("  - Successfully loaded valid data from cache.")
                            all_pages_data.append(cached_data)
                            successful_pages_count += 1
                            continue

                # --- Not cached, or the cache could not be used: process the page ---
                page_start_time = time.time()
                page = doc[i]

                print("  - Step 1: Pre-processing page text...")
                page_text = preprocess_pdf_page(page)

                if not page_text.strip():
                    print("  - Result: Page is empty, skipping.")
                    # Cache an empty list (not null) so the page is not re-processed:
                    # null is reserved for failures that should be retried.
                    with open(page_cache_path, 'w', encoding='utf-8') as f:
                        json.dump([], f)
                    continue

                print("--- DEBUG: Preprocessed Page Text START ---")
                print(page_text)
                print("--- DEBUG: Preprocessed Page Text END ---")
                print("  - Step 2: Calling LLM for JSON extraction...")
                page_json = get_json_from_llm(page_text, MODEL_NAME, OLLAMA_HOST, page_num)

                # Cache the result, whether it's successful (JSON) or a failure (None)
                with open(page_cache_path, 'w', encoding='utf-8') as f:
                    json.dump(page_json, f, ensure_ascii=False, indent=2)

                if page_json:
                    all_pages_data.append(page_json)
                    successful_pages_count += 1
                    num_orgs = len(page_json)
                    print(f"  - Step 3: Success! Extracted {num_orgs} top-level organization(s) from page.")
                    if DEBUG_PRINT_JSON:
                        print("    -- Debug: JSON output for this page --")
                        print(json.dumps(page_json, ensure_ascii=False, indent=2))
                        print("    -- End Debug --")
                else:
                    print("  - Step 3: Failed to extract valid JSON from this page. Skipping.")

                page_duration = time.time() - page_start_time
                print(f"--- Page {page_num} finished in {page_duration:.2f} seconds ---")

            # --- FINAL STITCHING AND SAVING ---
            print("\n" + "=" * 50)
            print("All pages processed or loaded from cache. Now stitching final results...")

            final_stitched_data = stitch_json_results(all_pages_data)
            print("Stitching complete.")

            print(f"Writing final structured data to '{OUTPUT_PATH}'...")
            save_data_to_file(final_stitched_data, OUTPUT_PATH)

            total_duration = time.time() - total_start_time
            print("\n" + "=" * 50)
            print("✅ PROCESSING COMPLETE ✅")
            print(f"Processed pages {start_page_idx + 1} to {end_page_idx} ({end_page_idx - start_page_idx} pages)")
            print(f"Successfully processed and extracted data from {successful_pages_count} page(s).")
            print(f"Total execution time: {total_duration:.2f} seconds.")
            print(f"Final output saved to: {OUTPUT_PATH}")
            if successful_pages_count == 0 and (end_page_idx - start_page_idx) > 0:
                print(f"WARNING: No data was successfully extracted. {OUTPUT_PATH} will contain an empty list.")
    finally:
        # Release the PDF even if a page fails with an unexpected exception.
        doc.close()

In [None]:
# Debug cell: print a one-line summary of every JSON file in the page cache
# (null, list with item count and the keys of its first item, or other type).
import json
import os

CACHE_DIR = "page_cache"
print("=== CACHE DIRECTORY CONTENTS ===")

if not os.path.exists(CACHE_DIR):
    print("Cache directory does not exist")
else:
    cache_files = [entry for entry in os.listdir(CACHE_DIR) if entry.endswith('.json')]
    print(f"Found {len(cache_files)} cache files: {cache_files}")

    for cache_name in sorted(cache_files):
        print(f"\n--- {cache_name} ---")
        try:
            with open(os.path.join(CACHE_DIR, cache_name), 'r', encoding='utf-8') as handle:
                payload = json.load(handle)

            if payload is None:
                print("  Content: None (indicates processing failure)")
            elif not isinstance(payload, list):
                print(f"  Content: {type(payload)} - {str(payload)[:100]}...")
            else:
                print(f"  Content: List with {len(payload)} items")
                if payload:
                    first_item = payload[0]
                    keys_summary = list(first_item.keys()) if isinstance(first_item, dict) else 'Not a dict'
                    print(f"  First item keys: {keys_summary}")
        except Exception as err:
            print(f"  Error reading file: {err}")

In [None]:
# Debug cell: dump the full content of every JSON file in the page cache,
# item by item, to inspect exactly what was stored for each page.
import json
import os

CACHE_DIR = "page_cache"
print("=== DETAILED CACHE ANALYSIS ===")

if not os.path.exists(CACHE_DIR):
    print("Cache directory does not exist")
else:
    cache_files = [entry for entry in os.listdir(CACHE_DIR) if entry.endswith('.json')]
    print(f"Found {len(cache_files)} cache files: {cache_files}")

    for cache_name in sorted(cache_files):
        print(f"\n--- {cache_name} ---")
        try:
            with open(os.path.join(CACHE_DIR, cache_name), 'r', encoding='utf-8') as handle:
                payload = json.load(handle)

            if payload is None:
                print("  Content: None (indicates processing failure)")
                continue
            if not isinstance(payload, list):
                print(f"  Content: {type(payload)} - {str(payload)[:200]}...")
                continue

            print(f"  Content: List with {len(payload)} items")
            for position, entry in enumerate(payload, start=1):
                print(f"  Item {position}: {type(entry)}")
                if isinstance(entry, dict):
                    print(f"    Keys: {list(entry.keys())}")
                    print(f"    Full content: {json.dumps(entry, ensure_ascii=False, indent=4)}")
                else:
                    print(f"    Content: {entry}")
        except Exception as err:
            print(f"  Error reading file: {err}")

In [None]:
# Smoke test: send one hand-written page excerpt, already in the
# "EN: ... | CN: ... | DATE: ..." line format that preprocess_pdf_page emits,
# through get_json_from_llm and pretty-print the result.
# Needs the configuration cell (MODEL_NAME, OLLAMA_HOST) and a reachable Ollama
# server. 32 is the page number passed along as context. The result is printed
# only when the call returns a non-empty value.
test_text = """
EN: Central Commission for Comprehensive Rule of Law | CN: 中央全面依法治国委員会(2018. 3) | DATE: 
EN: Chairperson | CN: 主任 | DATE: 
EN: Xi Jinping | CN: ☆習近平(53.6) | DATE: 18. 8
EN: Deputy Directors | CN: 副主任 | DATE: 
EN: He Rong (f) | CN: ◎賀 栄(女 62.10) | DATE: 23. 2
"""

print("=== TESTING NEW SYSTEM ===")
result = get_json_from_llm(test_text, MODEL_NAME, OLLAMA_HOST, 32)
if result:
    print(json.dumps(result, ensure_ascii=False, indent=2))

In [11]:
# Clear cache cell
# Deletes the per-page cache directory and the final output file so the next
# run starts from scratch. Safe to run on a fresh kernel.
import os
import shutil

CACHE_DIR = "page_cache"
if os.path.exists(CACHE_DIR):
    shutil.rmtree(CACHE_DIR)
    print("Cache cleared")

# Also clear output file. Follow the configured OUTPUT_PATH when the
# configuration cell has been run; otherwise fall back to its default name.
output_path = globals().get("OUTPUT_PATH", "output.json")
if os.path.exists(output_path):
    os.remove(output_path)
    print("Output file cleared")

Cache cleared
Output file cleared
