In a terminal:
conda init
conda activate china_dir

In [None]:
# Install/upgrade the third-party packages this notebook imports:
# PyMuPDF (imported as `fitz`) and the Ollama Python client.
%pip install -U PyMuPDF
%pip install -U ollama

In [18]:
import copy
import json
import os
import re
import sys

import fitz  # PyMuPDF
import ollama

In [2]:
# Show the kernel's working directory; relative paths used by this notebook
# (for example OUTPUT_PATH) resolve against it.
!pwd

/Users/lukasfiller/dev/china_directory


In [None]:
# --- START OF CONFIGURATION ---
# Change these values for your specific run.

# Source PDF. Resolved against the kernel's working directory so the notebook
# is not tied to one user's home directory; start the kernel in the project
# folder or replace this with the path to your copy of the document.
PDF_PATH = os.path.abspath("China Directory 2024-2p5-50.pdf")
OUTPUT_PATH = "output.json"  # Final stitched JSON, relative to the working directory
MODEL_NAME = "llama3.1:8b-instruct-fp16"  # Ollama model tag used for extraction
OLLAMA_HOST = "http://127.0.0.1:11434"
OLLAMA_CLIENT_TIMEOUT = 600  # Seconds
DEBUG_PRINT_JSON = True  # Echo each page's parsed JSON while processing

# Page limit for test runs: set to a number (e.g., 5) for a sample run,
# or None to process the whole document.
MAX_PAGES_TO_PROCESS = 2
# --- END OF CONFIGURATION ---

# System prompt sent as the 'system' message of the Ollama chat requests made
# in this notebook. It tells the model its task, describes the "||"-separated
# two-column text that preprocess_pdf_page produces, and shows the JSON schema
# expected back. stitch_json_results relies on the organization_name_english,
# positions, title_english, personnel and sub_organizations keys shown in the
# example below.
# NOTE(review): the literal is runtime behavior - any edit (even whitespace)
# changes what the model is told.
SYSTEM_PROMPT = """
Extract information from Chinese government directory text and return as JSON array.

INPUT FORMAT: Text with "||" separating left/right columns, names with birth dates like (YY.MM)

OUTPUT: JSON array with this structure:
[
  {
    "organization_name_english": "State Council",
    "organization_name_chinese": "国务院", 
    "document_section_title": null,
    "metadata": {},
    "sub_organizations": [],
    "positions": [
      {
        "title_english": "Premier",
        "title_chinese": "总理",
        "metadata": {"count": null, "list_order_note": null},
        "personnel": [
          {
            "name_pinyin": "Li Keqiang",
            "name_chinese": "李克强",
            "dob_year": 1955,
            "dob_month": 3,
            "assumed_office_date": null,
            "cross_reference_symbols": [],
            "gender": "male",
            "ethnicity": "Han",
            "rank": null,
            "rank_chinese": null,
            "other_notes": []
          }
        ]
      }
    ]
  }
]

RULES:
- Extract person names and birth dates from (YY.MM) format
- YY < 30 = 20XX, YY >= 30 = 19XX
- Default gender: "male", ethnicity: "Han"
- Return ONLY valid JSON array, no explanations
"""

In [12]:
# In Cell 3
def preprocess_pdf_page(page):
    """
    Rebuild a two-column PDF page as line-oriented text for the LLM.

    Words are grouped into lines by the integer part of their top y
    coordinate, ordered left to right, and split into a left and a right
    column at the horizontal middle of the page. Each word is assigned to
    exactly one column, chosen by where its horizontal midpoint falls, so a
    word that straddles the middle of the page is emitted once instead of
    appearing in both columns (which also kept centred titles from being
    recognised as headers).

    Args:
        page: A PyMuPDF page, or any object exposing ``get_text("words")``
            and ``rect.width``. Each word is a sequence whose first five
            items are ``x0, y0, x1, y1, text``.

    Returns:
        str: One entry per detected line, joined with newlines. Data rows
        are formatted as ``<left> || <right>``. A line with fewer than five
        words that occupies only one column is treated as a header and
        emitted as a markdown ``# <text>`` line surrounded by blank lines.
        Returns an empty string when the page has no words.
    """
    words = page.get_text("words")
    if not words:
        return ""

    # A simple heuristic for the column split: the middle of the page.
    page_center = page.rect.width / 2

    # Bucket words into lines keyed by the integer part of y0.
    lines = {}
    for word in words:
        lines.setdefault(int(word[1]), []).append(word)

    processed_text = []
    for y_key in sorted(lines):
        # Left-to-right reading order within the line.
        line_words = sorted(lines[y_key], key=lambda w: w[0])

        left_words, right_words = [], []
        for w in line_words:
            # Midpoint test puts every word in exactly one column.
            midpoint = (w[0] + w[2]) / 2
            target = left_words if midpoint < page_center else right_words
            target.append(w[4])
        left_col_text = " ".join(left_words)
        right_col_text = " ".join(right_words)

        # Heuristic for headers: short lines that live in a single column.
        is_header = (not left_col_text or not right_col_text) and len(line_words) < 5

        if is_header:
            full_line_text = " ".join(w[4] for w in line_words)
            processed_text.append(f"\n# {full_line_text}\n")  # markdown header for emphasis
        else:
            # Data rows use an explicit column separator.
            processed_text.append(f"{left_col_text.strip()} || {right_col_text.strip()}")

    return "\n".join(processed_text)

# In Cell 3
def _decode_first_json_value(text):
    """
    Decode the JSON payload embedded in an LLM reply.

    The whole (stripped) reply is tried first, so a reply that is already
    pure JSON - including a single object that contains arrays - is returned
    as-is. Otherwise the text is scanned for the first ``[`` or ``{`` from
    which a complete JSON value can be decoded; this skips markdown fences
    and leading prose and ignores anything after the value.

    Raises:
        json.JSONDecodeError: if no JSON value can be decoded from the text.
    """
    text = text.strip()
    try:
        return json.loads(text)
    except json.JSONDecodeError as whole_text_error:
        decoder = json.JSONDecoder()
        for start, char in enumerate(text):
            if char not in '[{':
                continue
            try:
                value, _end = decoder.raw_decode(text, start)
            except json.JSONDecodeError:
                continue  # not a decodable value here; try the next bracket
            return value
        raise whole_text_error


def get_json_from_llm(page_text, model_name, host):
    """
    Send pre-processed page text to an Ollama host and parse the JSON reply.

    The request uses SYSTEM_PROMPT as the system message, temperature 0.0 and
    the OLLAMA_CLIENT_TIMEOUT client timeout. No ``format='json'`` is sent;
    the JSON is recovered from the free-form reply instead.

    Args:
        page_text: Text of one page, as produced by preprocess_pdf_page.
        model_name: Ollama model tag to query.
        host: Base URL of the Ollama server.

    Returns:
        list | None: The parsed JSON list. A JSON object is wrapped in a
        one-element list. None is returned when the request fails, the reply
        contains no decodable JSON, or the decoded value is neither a list
        nor an object. Errors are printed, never raised.
    """
    page_text_size = len(page_text)
    print(f"    -> Sending {page_text_size} characters to model '{model_name}'...")
    content = ""  # kept outside the try so the error handler can show it

    try:
        client = ollama.Client(host=host, timeout=OLLAMA_CLIENT_TIMEOUT)
        response = client.chat(
            model=model_name,
            messages=[
                {'role': 'system', 'content': SYSTEM_PROMPT},
                {'role': 'user', 'content': page_text}
            ],
            options={'temperature': 0.0}
        )
        print("    -> Received response from model.")
        content = response['message']['content']

        print(f"    -> Raw response (first 200 chars): {content[:200]}")

        parsed_json = _decode_first_json_value(content)

        if isinstance(parsed_json, dict):
            print("    -> LLM returned a dictionary. Wrapping it in a list.")
            return [parsed_json]
        elif isinstance(parsed_json, list):
            print("    -> Successfully parsed JSON list from response.")
            return parsed_json
        else:
            print(f"   - ERROR: Unexpected JSON type: {type(parsed_json)}")
            return None

    except json.JSONDecodeError as e:
        print(f"   - ERROR: Failed to parse JSON. Error: {e}")
        print(f"   - Content: {content}")
        return None
    except Exception as e:
        # Best-effort by design: connection/timeouts/etc. mark the page as failed.
        print(f"   - ERROR: {e}")
        return None
        
# In Cell 3
def _ensure_list(mapping, key):
    """Return mapping[key] as a list, storing a new empty list when the value is missing or not a list."""
    value = mapping.get(key)
    if not isinstance(value, list):
        value = []
        mapping[key] = value
    return value


def stitch_json_results(all_pages_data):
    """
    Merge per-page organization lists into one continuous list.

    Organizations are appended in page order. When an organization has the
    same ``organization_name_english`` as the last one already collected it
    is treated as a continuation across a page break: its positions and
    sub-organizations are appended to that previous entry, and if its first
    position has the same ``title_english`` as the previous entry's last
    position, the personnel of those two positions are merged.

    The input is never modified (entries are deep-copied), so calling the
    function again on the same data gives the same result. Missing or null
    ``positions`` / ``personnel`` / ``sub_organizations`` values on either
    side are tolerated instead of raising.

    Args:
        all_pages_data: List with one item per page; each item is a list of
            organization dicts, or a falsy value for a page without data.

    Returns:
        list: The merged organization dicts. Entries that are not dicts or
        lack ``organization_name_english`` are skipped with a printed warning.
    """
    if not all_pages_data:
        return []
    final_data = []

    for i, page_data in enumerate(all_pages_data):
        if not page_data:
            continue

        for org_data in page_data:
            # The LLM output is untrusted: skip anything that is not an organization dict.
            if not isinstance(org_data, dict) or 'organization_name_english' not in org_data:
                print(f"  - WARNING: Skipping malformed organization data on page {i+1}. Data was: {org_data}")
                continue

            is_continuation = bool(final_data) and (
                final_data[-1].get('organization_name_english') == org_data.get('organization_name_english')
            )
            if not is_continuation:
                final_data.append(copy.deepcopy(org_data))
                continue

            last_org = final_data[-1]

            new_positions = org_data.get('positions')
            if isinstance(new_positions, list) and new_positions:
                new_positions = copy.deepcopy(new_positions)
                last_positions = _ensure_list(last_org, 'positions')
                same_title = (
                    bool(last_positions)
                    and isinstance(last_positions[-1], dict)
                    and isinstance(new_positions[0], dict)
                    and last_positions[-1].get('title_english') == new_positions[0].get('title_english')
                )
                if same_title:
                    # The position was split by the page break: merge its people.
                    carried_personnel = new_positions[0].get('personnel')
                    if isinstance(carried_personnel, list):
                        _ensure_list(last_positions[-1], 'personnel').extend(carried_personnel)
                    last_positions.extend(new_positions[1:])
                else:
                    last_positions.extend(new_positions)

            new_sub_orgs = org_data.get('sub_organizations')
            if isinstance(new_sub_orgs, list) and new_sub_orgs:
                _ensure_list(last_org, 'sub_organizations').extend(copy.deepcopy(new_sub_orgs))

    return final_data

# In Cell 3, after the stitch_json_results function
def save_data_to_file(data, filepath):
    """
    Save a data structure to a UTF-8 JSON file without risking the old file.

    The JSON is written to ``<filepath>.tmp`` first and moved over the target
    only after serialization succeeded, so a failure (for example a value
    that is not JSON-serializable) leaves an existing file at ``filepath``
    untouched instead of truncated.

    Args:
        data: JSON-serializable structure to write.
        filepath: Destination path of the JSON file.

    Returns:
        None. Failures are reported with a printed warning, never raised.
    """
    tmp_path = f"{filepath}.tmp"
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        os.replace(tmp_path, filepath)  # atomic swap on the same filesystem
    except Exception as e:
        print(f"  - CRITICAL WARNING: Failed to save data to {filepath}. Error: {e}")
        # Best-effort cleanup of the partial temp file; the warning above is the real signal.
        try:
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
        except OSError:
            pass

In [None]:
# In Cell 4 (Execution) - setup and connectivity pre-check
import time

print(f"Starting processing for '{PDF_PATH}'...")
print(f"Using model: '{MODEL_NAME}' at host '{OLLAMA_HOST}'")
print("-" * 50)

# Holds the parsed JSON of each page for this run (filled by the page loop).
all_pages_data = []

# Resume strategy: the result of every page is stored as its own JSON file in
# CACHE_DIR, so a re-run only calls the LLM for pages without a usable entry.
CACHE_DIR = "page_cache"
os.makedirs(CACHE_DIR, exist_ok=True)

# The output file itself is never read back; say so instead of claiming to
# resume from it.
if os.path.exists(OUTPUT_PATH):
    print(f"Found existing output file at '{OUTPUT_PATH}'. It will be overwritten; "
          f"progress is resumed from the per-page cache in '{CACHE_DIR}'.")

# --- OLLAMA SERVER PRE-CHECK ---
print(f"Attempting to connect to Ollama server at {OLLAMA_HOST}...")
try:
    client = ollama.Client(host=OLLAMA_HOST, timeout=10)  # Short timeout for pre-check
    client.list()  # A lightweight command to check connectivity
    print("  -> Successfully connected to Ollama server.")
except Exception as e:
    print(f"  - FATAL ERROR: Could not connect to Ollama server at {OLLAMA_HOST}.")
    print(f"  - Error details: {e}")
    print("  - Please ensure Ollama is running and accessible.")
    ollama_available = False
else:
    ollama_available = True

# Page loop: extract every page (or load it from the cache), then stitch and
# save the combined result. Each precondition failure gets its own branch so
# none of the skip messages is unreachable.
if not ollama_available:
    print("Skipped PDF processing as Ollama server was not available.")
elif not os.path.exists(PDF_PATH):
    print(f"FATAL ERROR: PDF file not found at '{PDF_PATH}'")
else:
    successful_pages_count = 0

    # The context manager closes the document even if a page raises.
    with fitz.open(PDF_PATH) as doc:
        num_pages = len(doc)

        pages_to_run = num_pages
        if MAX_PAGES_TO_PROCESS is not None and MAX_PAGES_TO_PROCESS < num_pages:
            print("=" * 50, f"\n!!! SAMPLE MODE: Only processing up to page {MAX_PAGES_TO_PROCESS}. !!!\n", "=" * 50)
            pages_to_run = MAX_PAGES_TO_PROCESS

        total_start_time = time.time()

        # --- MAIN LOOP WITH INCREMENTAL SAVING ---
        for i in range(pages_to_run):
            page_num = i + 1
            page_cache_path = os.path.join(CACHE_DIR, f"page_{page_num}.json")
            page_successfully_processed_or_cached = False  # Flag for this page

            print(f"\n--- Processing Page {page_num} of {pages_to_run} ---")

            # --- RESUME LOGIC: reuse the cached result when it is usable ---
            if os.path.exists(page_cache_path):
                print(f"  - Result: Page {page_num} already processed. Loading from cache.")
                try:
                    with open(page_cache_path, 'r', encoding='utf-8') as f:
                        cached_data = json.load(f)
                    if cached_data is None:  # null marks a page without a stored result
                        print("  - Cached file indicates a previous failure. Re-processing.")
                    else:
                        print("  - Successfully loaded valid data from cache.")
                        all_pages_data.append(cached_data)
                        successful_pages_count += 1
                        page_successfully_processed_or_cached = True
                except json.JSONDecodeError:
                    print("  - WARNING: Cache file is corrupted. Re-processing.")
                except Exception as e:
                    print(f"  - WARNING: Could not load from cache ({e}). Re-processing.")

            if page_successfully_processed_or_cached:
                continue  # Skip to the next page

            # --- Not cached, or the cache entry was unusable: process the page ---
            page_start_time = time.time()
            page = doc[i]

            print("  - Step 1: Pre-processing page text...")
            page_text = preprocess_pdf_page(page)

            if not page_text.strip():
                print("  - Result: Page is empty, skipping.")
                # A null marker is stored. The resume logic above treats null as
                # "re-process", so an empty page is re-read on the next run; that
                # is cheap because no LLM call is made for it.
                with open(page_cache_path, 'w', encoding='utf-8') as f:
                    json.dump(None, f)
                continue

            print("  - Step 2: Calling LLM for JSON extraction...")
            print("--- DEBUG: Preprocessed Page Text START ---")
            print(page_text)
            print("--- DEBUG: Preprocessed Page Text END ---")

            page_json = get_json_from_llm(page_text, MODEL_NAME, OLLAMA_HOST)

            # Cache the result, whether it is successful (JSON) or a failure (None)
            with open(page_cache_path, 'w', encoding='utf-8') as f:
                json.dump(page_json, f, ensure_ascii=False, indent=2)

            # NOTE(review): an empty list counts as a failure here, but once cached
            # it is loaded as valid data on the next run - confirm which is intended.
            if page_json:
                all_pages_data.append(page_json)
                successful_pages_count += 1
                num_orgs = len(page_json)
                print(f"  - Step 3: Success! Extracted {num_orgs} top-level organization(s) from page.")
                if DEBUG_PRINT_JSON:
                    print("    -- Debug: JSON output for this page --")
                    print(json.dumps(page_json, ensure_ascii=False, indent=2))
                    print("    -- End Debug --")
            else:
                print("  - Step 3: Failed to extract valid JSON from this page. Skipping.")

            page_end_time = time.time()
            page_duration = page_end_time - page_start_time
            print(f"--- Page {page_num} finished in {page_duration:.2f} seconds ---")

    # --- FINAL STITCHING AND SAVING ---
    print("\n" + "=" * 50)
    print("All pages processed or loaded from cache. Now stitching final results...")

    final_stitched_data = stitch_json_results(all_pages_data)
    print("Stitching complete.")

    print(f"Writing final structured data to '{OUTPUT_PATH}'...")
    save_data_to_file(final_stitched_data, OUTPUT_PATH)

    total_end_time = time.time()
    total_duration = total_end_time - total_start_time
    print("\n" + "=" * 50)
    print("✅ PROCESSING COMPLETE ✅")
    print(f"Attempted to process {pages_to_run} page(s).")
    print(f"Successfully processed and extracted data from {successful_pages_count} page(s).")
    print(f"Total execution time: {total_duration:.2f} seconds.")
    print(f"Final output saved to: {OUTPUT_PATH}")
    if successful_pages_count == 0 and pages_to_run > 0:
        print(f"WARNING: No data was successfully extracted. {OUTPUT_PATH} will contain an empty list.")

In [None]:
# Debugging cell: summarize what each per-page cache file contains.
import json
import os
import re

CACHE_DIR = "page_cache"


def summarize_cache(cache_dir=CACHE_DIR):
    """
    Print a one-line summary of every ``*.json`` file in the page cache.

    Files are listed in natural order (page_2 before page_10) rather than
    plain string order. For each file the summary states whether it holds
    null, a list (with its length and the keys of its first item), or
    something else.

    Args:
        cache_dir: Directory that holds the per-page cache files.
    """
    print("=== CACHE DIRECTORY CONTENTS ===")

    if not os.path.exists(cache_dir):
        print("Cache directory does not exist")
        return

    cache_files = [name for name in os.listdir(cache_dir) if name.endswith('.json')]
    print(f"Found {len(cache_files)} cache files: {cache_files}")

    def natural_key(name):
        # Split into text/number runs so embedded page numbers compare numerically.
        return [int(part) if part.isdigit() else part for part in re.split(r'(\d+)', name)]

    for cache_file in sorted(cache_files, key=natural_key):
        cache_path = os.path.join(cache_dir, cache_file)
        print(f"\n--- {cache_file} ---")
        try:
            with open(cache_path, 'r', encoding='utf-8') as handle:
                data = json.load(handle)
            if data is None:
                print("  Content: None (indicates processing failure)")
            elif isinstance(data, list):
                print(f"  Content: List with {len(data)} items")
                if data:
                    print(f"  First item keys: {list(data[0].keys()) if isinstance(data[0], dict) else 'Not a dict'}")
            else:
                print(f"  Content: {type(data)} - {str(data)[:100]}...")
        except Exception as e:
            print(f"  Error reading file: {e}")


summarize_cache()

In [None]:
# Debugging cell: print the full content of every per-page cache file.
import json
import os
import re

CACHE_DIR = "page_cache"


def dump_cache_details(cache_dir=CACHE_DIR):
    """
    Print the complete content of every ``*.json`` file in the page cache.

    Files are listed in natural order (page_2 before page_10) rather than
    plain string order. List entries are printed item by item; dict items
    are shown with their keys and pretty-printed JSON.

    Args:
        cache_dir: Directory that holds the per-page cache files.
    """
    print("=== DETAILED CACHE ANALYSIS ===")

    if not os.path.exists(cache_dir):
        print("Cache directory does not exist")
        return

    cache_files = [name for name in os.listdir(cache_dir) if name.endswith('.json')]
    print(f"Found {len(cache_files)} cache files: {cache_files}")

    def natural_key(name):
        # Split into text/number runs so embedded page numbers compare numerically.
        return [int(part) if part.isdigit() else part for part in re.split(r'(\d+)', name)]

    for cache_file in sorted(cache_files, key=natural_key):
        cache_path = os.path.join(cache_dir, cache_file)
        print(f"\n--- {cache_file} ---")
        try:
            with open(cache_path, 'r', encoding='utf-8') as handle:
                data = json.load(handle)
            if data is None:
                print("  Content: None (indicates processing failure)")
            elif isinstance(data, list):
                print(f"  Content: List with {len(data)} items")
                for position, item in enumerate(data, start=1):
                    print(f"  Item {position}: {type(item)}")
                    if isinstance(item, dict):
                        print(f"    Keys: {list(item.keys())}")
                        print(f"    Full content: {json.dumps(item, ensure_ascii=False, indent=4)}")
                    else:
                        print(f"    Content: {item}")
            else:
                print(f"  Content: {type(data)} - {str(data)[:200]}...")
        except Exception as e:
            print(f"  Error reading file: {e}")


dump_cache_details()

In [None]:
# Smoke test: send a tiny hand-written sample to the model and show the raw
# reply plus the result of parsing it as JSON.
import ollama

test_text = """
# State Council
Premier Li Keqiang (65.03) || Deputy Premier Han Zheng (68.04)
State Councilor Wei Fenghe (67.02) || State Councilor Wang Yi (69.10)
"""

print("=== TESTING LLM RESPONSE ===")
print(f"Input text:\n{test_text}")

try:
    client = ollama.Client(host=OLLAMA_HOST, timeout=60)
    # Same request shape as get_json_from_llm: system prompt + page text,
    # temperature 0.0 and no format='json' (that option was removed from the
    # pipeline request), so this cell shows the kind of reply the pipeline
    # actually has to parse.
    response = client.chat(
        model=MODEL_NAME,
        messages=[
            {'role': 'system', 'content': SYSTEM_PROMPT},
            {'role': 'user', 'content': test_text}
        ],
        options={'temperature': 0.0}
    )

    content = response['message']['content']
    print(f"\nRaw LLM response:\n{content}")

    # Strict parse of the untouched reply: a failure here means the reply
    # needs clean-up (fences, surrounding prose) before it is usable.
    try:
        parsed = json.loads(content)
        print(f"\nParsed JSON type: {type(parsed)}")
        print(f"Parsed JSON content:\n{json.dumps(parsed, ensure_ascii=False, indent=2)}")
    except json.JSONDecodeError as e:
        print(f"\nJSON parsing failed: {e}")

except Exception as e:
    print(f"Error: {e}")

In [17]:
# Reset cell: delete the per-page cache directory and the output file so the
# next run starts from scratch.
import os
import shutil

CACHE_DIR = "page_cache"

# (path, removal function, confirmation message) - handled in this order.
for target, remove, message in (
    (CACHE_DIR, shutil.rmtree, "Cache cleared"),
    ("output.json", os.remove, "Output file cleared"),
):
    if os.path.exists(target):
        remove(target)
        print(message)