in terminal:
conda init
conda activate china_dir

In [None]:
%pip install -U PyMuPDF
%pip install -U ollama

In [1]:
import copy
import json
import os
import re
import sys
import time

import fitz  # PyMuPDF
import ollama

In [2]:
!pwd

/Users/lukasfiller/dev/china_directory


In [19]:
# --- START OF CONFIGURATION ---
# Change these values for your specific run.

# Source PDF to process.
# NOTE(review): absolute, machine-specific path; the notebook will not run
# elsewhere without editing it.
PDF_PATH = "/Users/lukasfiller/dev/china_directory/China Directory 2024-2p5-50.pdf"
# File that receives the final stitched JSON.
OUTPUT_PATH = "output.json"
# Model name passed to the Ollama chat call.
MODEL_NAME = "llama3.3:70b-instruct-q4_K_M"
# Base URL of the Ollama server.
OLLAMA_HOST = "http://127.0.0.1:11434"
# Passed as `timeout` to the Ollama client used for extraction requests.
OLLAMA_CLIENT_TIMEOUT = 300  # seconds
# When True, the JSON extracted from each page is printed after the page succeeds.
DEBUG_PRINT_JSON = True

# Page limit for test runs: set to a number (e.g., 5) for a sample run,
# or None to process the whole document.
MAX_PAGES_TO_PROCESS = 5 
# --- END OF CONFIGURATION ---

# System prompt for the extraction model: get_json_from_llm sends it as the
# 'system' message of every request. It fixes the model's role, the required
# output schema (a top-level JSON array of organizations) and the parsing rules.
# The "||" separator it describes is the one preprocess_pdf_page places between
# the left-column and right-column text of each data row.
SYSTEM_PROMPT = """
You are an expert data extraction system. Your sole function is to parse the provided text from a single page of a Chinese party-state leadership directory and convert its contents into a valid JSON array. Adhere strictly to the schema and parsing rules. Produce only the final JSON array as your output, without any commentary, apologies, or markdown code fences.

The input text is from a two-column document. I have pre-processed it by merging lines from the left and right columns. A "||" separator often indicates the split between the columns.

**Required Output JSON Schema:**
[
  {
    "organization_name_english": "string",
    "organization_name_chinese": "string",
    "document_section_title": "string | null",
    "metadata": {},
    "sub_organizations": [],
    "positions": [
      {
        "title_english": "string",
        "title_chinese": "string",
        "metadata": { "count": "integer | null", "list_order_note": "string | null" },
        "personnel": [
          {
            "name_pinyin": "string",
            "name_chinese": "string",
            "dob_year": "integer | null",
            "dob_month": "integer | null",
            "assumed_office_date": "YYYY-MM-DD" | "YYYY-MM" | "YYYY" | null,
            "cross_reference_symbols": ["string"],
            "gender": "male" | "female",
            "ethnicity": "string",
            "rank": "string | null",
            "rank_chinese": "string | null",
            "other_notes": ["string"]
          }
        ]
      }
    ]
  }
]

**Detailed Parsing Rules:**
- **Hierarchy:** Capture the nested structure of organizations, sub-organizations, and positions.
- **`dob_year` / `dob_month`**: Parse from `(YY.MM)`. A `YY` < 30 is 20xx; otherwise, it's 19xx.
- **`assumed_office_date`**: Parse from the `YY.MM.DD` format into ISO 8601 `YYYY-MM-DD`.
- **`cross_reference_symbols`**: Collect any leading `☆`, `※`, `◎`, `○` symbols.
- **`gender`**: If `(f)` or `(女)` is present, set to "female". **Default is "male".**
- **`ethnicity`**: If an ethnicity like `(Mongolian)` or `(蒙古族)` is present, record it. **Default is "Han".**
- **`rank` & `rank_chinese`**: Map abbreviations like `(Gen)` to `rank` and the Chinese `(上将)` to `rank_chinese`. **Default is `null` if no rank is specified.**
- **`other_notes`**: Place any other parenthetical notes like `(executive)` or `(SPC)` here.
- **Continuations:** If the page seems to continue a list from a previous page (e.g., starts with a list of names without a new header), structure the JSON as if it belongs to the last-mentioned organization/position. The top-level object in your response should reflect this context.
"""

In [20]:
# In Cell 3
def preprocess_pdf_page(page):
    """
    Extract the words of a PDF page and rebuild its two-column layout as text
    that is easier for the LLM to parse.

    Words are grouped into visual lines by the integer part of their top
    coordinate (y0), ordered left to right, and split into a left and a right
    column at the horizontal centre of the page. Every word is assigned to
    exactly one column, decided by where the word starts (x0), so a word whose
    box straddles the centre is no longer emitted in both columns.

    A line with fewer than five words that occupies only one column is treated
    as a header and emitted as a markdown-style "# ..." line; every other line
    is emitted as "<left> || <right>".

    Args:
        page: object exposing ``get_text("words")`` (a sequence of tuples that
            start with x0, y0, x1, y1, text) and ``rect.width``.

    Returns:
        str: the reconstructed page text, or "" when the page has no words.
    """
    words = page.get_text("words")
    if not words:
        return ""

    # A simple heuristic for the column split: the horizontal centre of the page.
    page_center = page.rect.width / 2

    # Group words by the integer part of y0. Words of one visual line whose y0
    # values fall on either side of an integer boundary end up on separate lines.
    lines = {}
    for word in words:
        lines.setdefault(int(word[1]), []).append(word)

    processed_text = []
    for y_key in sorted(lines):
        # Stable sort by x0 keeps the extraction order for words sharing an x0.
        line_words = sorted(lines[y_key], key=lambda w: w[0])

        # Partition by the word's left edge so each word lands in one column only.
        left_col_text = " ".join(w[4] for w in line_words if w[0] < page_center)
        right_col_text = " ".join(w[4] for w in line_words if w[0] >= page_center)

        # Heuristic for detecting headers (usually centered or only in one column).
        is_header = (not left_col_text or not right_col_text) and (len(line_words) < 5)

        if is_header:
            full_line_text = " ".join(w[4] for w in line_words)
            # Markdown header for emphasis.
            processed_text.append(f"\n# {full_line_text}\n")
        else:
            # For data rows, use a clear separator between the columns.
            processed_text.append(f"{left_col_text.strip()} || {right_col_text.strip()}")

    return "\n".join(processed_text)

# In Cell 3
def _coerce_to_org_list(parsed_json):
    """Return the organization list carried by a decoded LLM response, or None if there is none."""
    if isinstance(parsed_json, list):
        return parsed_json
    if isinstance(parsed_json, dict):
        # A single organization returned without the enclosing array.
        if 'organization_name_english' in parsed_json:
            return [parsed_json]
        # The array wrapped in a one-key object, e.g. {"organizations": [...]}.
        if len(parsed_json) == 1:
            (only_value,) = parsed_json.values()
            if isinstance(only_value, list):
                return only_value
    return None


def get_json_from_llm(page_text, model_name, host):
    """
    Send the pre-processed page text to an Ollama host and return the extracted
    organizations as a list.

    The request uses SYSTEM_PROMPT as the system message, temperature 0.0 and
    format='json'. The schema asks for a top-level JSON array, but a response
    that is a JSON object is also accepted when it is unambiguous: either a
    single organization object (wrapped into a one-element list) or an object
    with exactly one key whose value is a list (that list is used).
    NOTE(review): JSON mode appears to favour object-shaped output, which is why
    these two shapes are tolerated instead of being rejected.

    Args:
        page_text: text of one page, as produced by the pre-processing step.
        model_name: name of the Ollama model to query.
        host: base URL of the Ollama server.

    Returns:
        list | None: the list of organization entries, or None when the request
        fails, the response is not valid JSON, or the JSON has an unusable
        shape. All errors are reported with print() and never raised.
    """
    page_text_size = len(page_text)
    print(f"    -> Sending {page_text_size} characters to model '{model_name}'...")
    content = ""  # Initialised so the error handlers can always print it.

    try:
        client = ollama.Client(host=host, timeout=OLLAMA_CLIENT_TIMEOUT)
        response = client.chat(
            model=model_name,
            messages=[
                {'role': 'system', 'content': SYSTEM_PROMPT},
                {'role': 'user', 'content': page_text}
            ],
            options={'temperature': 0.0},
            format='json'
        )
        print("    -> Received response from model.")
        content = response['message']['content']

        parsed_json = json.loads(content)

        org_list = _coerce_to_org_list(parsed_json)
        if org_list is None:
            print(f"   - ERROR: LLM returned valid JSON, but it's not a LIST as required by the schema. Type was {type(parsed_json)}.")
            print(f"   - Raw response content was:\n---\n{content}\n---")
            return None  # Reject the malformed data
        if org_list is not parsed_json:
            print("    -> NOTE: LLM returned a JSON object instead of a list; recovered the organization list from it.")

        print("    -> Successfully parsed JSON list from response.")
        return org_list

    except json.JSONDecodeError as e:
        print(f"   - ERROR: Failed to parse JSON from LLM response. Error: {e}")
        print(f"   - Raw response content was:\n---\n{content}\n---")
        return None
    except ollama.ResponseError as e:
        print(f"   - ERROR: An error occurred with the Ollama API: {e.error}")
        print(f"   - Status code: {e.status_code}")
        return None
    except Exception as e:
        # Deliberate catch-all: one failing page must not abort the whole run.
        print(f"   - ERROR: An unexpected error occurred (likely a timeout or connection issue): {e}")
        return None
    
    
# In Cell 3
def _merge_continuation(last_org, org):
    """Fold the positions and sub-organizations of `org` into `last_org` (same organization, next page)."""
    new_positions = org.get('positions')
    if isinstance(new_positions, list) and new_positions:
        last_positions = last_org.get('positions')
        if not isinstance(last_positions, list):
            # Key missing or null on the earlier page: start a fresh list.
            last_positions = last_org['positions'] = []

        tail = last_positions[-1] if last_positions else None
        head = new_positions[0]
        if (isinstance(tail, dict) and isinstance(head, dict)
                and tail.get('title_english') == head.get('title_english')):
            # The page starts by continuing the position the previous page ended on.
            personnel = tail.get('personnel')
            if not isinstance(personnel, list):
                personnel = tail['personnel'] = []
            personnel.extend(head.get('personnel') or [])
            last_positions.extend(new_positions[1:])
        else:
            last_positions.extend(new_positions)

    new_subs = org.get('sub_organizations')
    if isinstance(new_subs, list) and new_subs:
        subs = last_org.get('sub_organizations')
        if not isinstance(subs, list):
            subs = last_org['sub_organizations'] = []
        subs.extend(new_subs)


def stitch_json_results(all_pages_data):
    """
    Merge the per-page organization lists into one list, joining an
    organization that continues across consecutive pages.

    An organization is a continuation when its 'organization_name_english'
    equals that of the most recently emitted organization. Its positions are
    appended to the earlier entry; if its first position has the same
    'title_english' as the earlier entry's last position, their personnel are
    concatenated instead. Sub-organizations are appended as well.

    The input is never modified: every organization is deep-copied before it is
    stored or merged, so calling this function repeatedly on the same data gives
    the same result. Missing or null 'positions', 'personnel' and
    'sub_organizations' are treated as empty lists.

    Args:
        all_pages_data: list with one entry per page, each a list of
            organization dicts; falsy entries are skipped.

    Returns:
        list: the merged organization dicts, in page order.
    """
    if not all_pages_data:
        return []
    final_data = []

    for i, page_data in enumerate(all_pages_data):
        if not page_data:
            continue

        for org_data in page_data:
            # Skip anything that is not an organization dict. `i + 1` is the
            # 1-based position of the entry in all_pages_data.
            if not isinstance(org_data, dict) or 'organization_name_english' not in org_data:
                print(f"  - WARNING: Skipping malformed organization data on page {i+1}. Data was: {org_data}")
                continue

            org_copy = copy.deepcopy(org_data)
            is_continuation = (
                bool(final_data)
                and final_data[-1].get('organization_name_english') == org_copy.get('organization_name_english')
            )
            if is_continuation:
                _merge_continuation(final_data[-1], org_copy)
            else:
                final_data.append(org_copy)

    return final_data

# In Cell 3, after the stitch_json_results function
def save_data_to_file(data, filepath):
    """
    Save the provided data structure to a JSON file (UTF-8, non-ASCII kept
    as-is, 2-space indent).

    The data is first written to "<filepath>.tmp" and then moved over the
    target with os.replace, so a failure while serializing or writing leaves an
    existing file at `filepath` intact instead of truncated.

    Failures are reported with print() and not raised, so a failed save never
    aborts the run.

    Args:
        data: JSON-serializable structure to write.
        filepath: destination path of the JSON file.
    """
    tmp_path = f"{filepath}.tmp"
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        os.replace(tmp_path, filepath)
    except Exception as e:
        print(f"  - CRITICAL WARNING: Failed to save data to {filepath}. Error: {e}")
        # Best-effort cleanup of the partial temp file; it may not exist at all.
        try:
            os.remove(tmp_path)
        except OSError:
            pass

In [21]:
# In Cell 4 (Execution)
# Run banner, per-run state and the page cache directory.
print(f"Starting processing for '{PDF_PATH}'...")
print(f"Using model: '{MODEL_NAME}' at host '{OLLAMA_HOST}'")
print("-" * 50)

# Per-page JSON results, filled by the main loop (from cache or from the LLM).
# Reset on every run of this cell so re-running never accumulates duplicates.
all_pages_data = []
start_page = 0      # Default to starting from the beginning

# Resume strategy: the result of each page is stored as its own file in this
# directory, so an interrupted run only has to redo the pages that are missing.
CACHE_DIR = "page_cache"
os.makedirs(CACHE_DIR, exist_ok=True)

# The stitched output file is never read back; it is rebuilt from the per-page
# results on every run.
if os.path.exists(OUTPUT_PATH):
    print(f"Found existing output file at '{OUTPUT_PATH}'. It will be rewritten; finished pages are resumed from '{CACHE_DIR}'.")

# --- OLLAMA SERVER PRE-CHECK ---
# Probe the server once before doing any PDF work. The outcome is recorded in
# `ollama_available`; nothing is raised, because in a notebook an exception
# here would not reliably stop the following code from being run.
print(f"Attempting to connect to Ollama server at {OLLAMA_HOST}...")
try:
    # Short timeout: this is only a reachability probe.
    client = ollama.Client(host=OLLAMA_HOST, timeout=10)
    # Listing models is a lightweight request that proves connectivity.
    client.list()
    print("  -> Successfully connected to Ollama server.")
    ollama_available = True
except Exception as connect_error:
    ollama_available = False
    print(f"  - FATAL ERROR: Could not connect to Ollama server at {OLLAMA_HOST}.")
    print(f"  - Error details: {connect_error}")
    print("  - Please ensure Ollama is running and accessible.")

def _load_cached_page(cache_path):
    """Return the decoded content of a page cache file, or None when the page must be re-processed."""
    try:
        with open(cache_path, 'r', encoding='utf-8') as f:
            cached_data = json.load(f)
    except json.JSONDecodeError:
        print("  - WARNING: Cache file is corrupted. Re-processing.")
        return None
    except Exception as e:
        print(f"  - WARNING: Could not load from cache ({e}). Re-processing.")
        return None

    if cached_data is None:  # A cached null marks a previous failure.
        print("  - Cached file indicates a previous failure. Re-processing.")
        return None

    print("  - Successfully loaded valid data from cache.")
    return cached_data


# Main run: extract every page (from cache or via the LLM), then stitch and save.
# The stitch/save/summary step only runs when the page loop actually ran, so a
# missing PDF or an unreachable server never overwrites an existing OUTPUT_PATH.
if not ollama_available:
    print("Skipped PDF processing as Ollama server was not available.")
elif not os.path.exists(PDF_PATH):
    print(f"FATAL ERROR: PDF file not found at '{PDF_PATH}'")
    print("Skipped PDF processing due to missing PDF file or other setup issue prior to page loop.")
else:
    total_start_time = time.time()
    successful_pages_count = 0
    attempted_pages_count = 0

    # The context manager closes the document once all pages have been read.
    with fitz.open(PDF_PATH) as doc:
        num_pages = len(doc)

        pages_to_run = num_pages
        if MAX_PAGES_TO_PROCESS is not None and MAX_PAGES_TO_PROCESS < num_pages:
            print("=" * 50, f"\n!!! SAMPLE MODE: Only processing up to page {MAX_PAGES_TO_PROCESS}. !!!\n", "=" * 50)
            pages_to_run = MAX_PAGES_TO_PROCESS

        # --- MAIN LOOP WITH INCREMENTAL SAVING ---
        for i in range(pages_to_run):
            page_num = i + 1
            attempted_pages_count += 1
            # NOTE(review): cache files are keyed by page number only, so results
            # cached for another PDF or model would be reused here.
            page_cache_path = os.path.join(CACHE_DIR, f"page_{page_num}.json")

            print(f"\n--- Processing Page {page_num} of {pages_to_run} ---")

            # --- RESUME LOGIC: reuse the cached result when there is a usable one ---
            if os.path.exists(page_cache_path):
                print(f"  - Result: Page {page_num} already processed. Loading from cache.")
                cached_data = _load_cached_page(page_cache_path)
                if cached_data is not None:
                    all_pages_data.append(cached_data)
                    # Count only pages that yielded data, as the fresh-run path does.
                    if cached_data:
                        successful_pages_count += 1
                    continue

            # --- Not cached, or the cache was unusable: process the page ---
            page_start_time = time.time()
            page = doc[i]

            print("  - Step 1: Pre-processing page text...")
            page_text = preprocess_pdf_page(page)

            if not page_text.strip():
                print("  - Result: Page is empty, skipping.")
                # Cache an empty list (not null, which means "failed, retry") so
                # the empty page really is skipped on the next run.
                with open(page_cache_path, 'w', encoding='utf-8') as f:
                    json.dump([], f)
                continue

            print("  - Step 2: Calling LLM for JSON extraction...")
            page_json = get_json_from_llm(page_text, MODEL_NAME, OLLAMA_HOST)

            # Cache the result, whether it's successful (JSON) or a failure (None).
            with open(page_cache_path, 'w', encoding='utf-8') as f:
                json.dump(page_json, f, ensure_ascii=False, indent=2)

            if page_json:
                all_pages_data.append(page_json)
                successful_pages_count += 1
                num_orgs = len(page_json)
                print(f"  - Step 3: Success! Extracted {num_orgs} top-level organization(s) from page.")
                if DEBUG_PRINT_JSON:
                    print("    -- Debug: JSON output for this page --")
                    print(json.dumps(page_json, ensure_ascii=False, indent=2))
                    print("    -- End Debug --")
            else:
                print("  - Step 3: Failed to extract valid JSON from this page. Skipping.")

            page_end_time = time.time()
            page_duration = page_end_time - page_start_time
            print(f"--- Page {page_num} finished in {page_duration:.2f} seconds ---")

    # --- FINAL STITCHING AND SAVING ---
    print("\n" + "=" * 50)
    print("All pages processed or loaded from cache. Now stitching final results...")

    final_stitched_data = stitch_json_results(all_pages_data)
    print("Stitching complete.")

    print(f"Writing final structured data to '{OUTPUT_PATH}'...")
    save_data_to_file(final_stitched_data, OUTPUT_PATH)

    total_end_time = time.time()
    total_duration = total_end_time - total_start_time
    print("\n" + "=" * 50)
    print("✅ PROCESSING COMPLETE ✅")
    print(f"Attempted to process {pages_to_run} page(s).")
    print(f"Successfully processed and extracted data from {successful_pages_count} page(s).")
    print(f"Total execution time: {total_duration:.2f} seconds.")
    print(f"Final output saved to: {OUTPUT_PATH}")
    if successful_pages_count == 0 and pages_to_run > 0:
        print(f"WARNING: No data was successfully extracted. {OUTPUT_PATH} will contain an empty list.")

Starting processing for '/Users/lukasfiller/dev/china_directory/China Directory 2024-2p5-50.pdf'...
Using model: 'llama3.3:70b-instruct-q4_K_M' at host 'http://127.0.0.1:11434'
--------------------------------------------------
Attempting to connect to Ollama server at http://127.0.0.1:11434...
  -> Successfully connected to Ollama server.
!!! SAMPLE MODE: Only processing up to page 5. !!!

--- Processing Page 1 of 5 ---
  - Result: Page 1 already processed. Loading from cache.
  - Cached file indicates a previous failure. Re-processing.
  - Step 1: Pre-processing page text...
  - Step 2: Calling LLM for JSON extraction...
    -> Sending 1831 characters to model 'llama3.3:70b-instruct-q4_K_M'...
   - ERROR: An unexpected error occurred (likely a timeout or connection issue): timed out
  - Step 3: Failed to extract valid JSON from this page. Skipping.
--- Page 1 finished in 300.01 seconds ---

--- Processing Page 2 of 5 ---
  - Result: Page 2 already processed. Loading from cache.
  - C