Run in a terminal before starting the notebook:
conda init
conda activate china_dir

In [12]:
# Install (or upgrade, via -U) the two third-party packages this notebook imports:
# PyMuPDF (imported below as `fitz`) and the `ollama` client library.
# %pip targets the running kernel's environment; pip prints a reminder that the
# kernel may need a restart before upgraded packages are picked up.
%pip install -U PyMuPDF
%pip install -U ollama

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [13]:
import fitz  # PyMuPDF
import json
import sys
import os
import re
import ollama

In [3]:
# Show the current working directory; the relative OUTPUT_PATH ("output.json")
# used when writing the results is resolved against it.
!pwd

/Users/lukasfiller/dev/china_directory


In [17]:
# --- START OF CONFIGURATION ---
# Change these values for your specific run.
#
# PDF_PATH may also be supplied through the CHINA_DIR_PDF_PATH environment
# variable, so the notebook can run on another machine without editing this
# cell. The literal below is only the fallback used when the variable is unset
# or empty.
PDF_PATH = os.environ.get("CHINA_DIR_PDF_PATH") or (
    "/Users/lukasfiller/Library/CloudStorage/OneDrive-DKIAsia-PacificCenterforSecurityStudies/APCSS_Docs/topics/PLA Everything/China Directory 2024-2p5-50.pdf"
)
# File the stitched JSON is written to (relative paths resolve against the working directory).
OUTPUT_PATH = "output.json"
# Name of the Ollama model that receives each page.
MODEL_NAME = "phi4-reasoning:14b-plus-fp16"
# Address of the Ollama server the client connects to.
OLLAMA_HOST = "http://127.0.0.1:11434"
# --- END OF CONFIGURATION ---

# System prompt sent with every page. It tells the LLM its role, the JSON schema
# to emit, and the field-level parsing rules. get_json_from_llm() passes it as
# the 'system' message of each chat request.
# stitch_json_results() looks up keys named in this schema
# (organization_name_english, positions, title_english, personnel,
# sub_organizations), so keep the two in sync when editing the schema.
SYSTEM_PROMPT = """
You are an expert data extraction system. Your sole function is to parse the provided text from a single page of a Chinese party-state leadership directory and convert its contents into a valid JSON array. Adhere strictly to the schema and parsing rules. Produce only the final JSON array as your output, without any commentary, apologies, or markdown code fences.

The input text is from a two-column document. I have pre-processed it by merging lines from the left and right columns. A "||" separator often indicates the split between the columns.

**Required Output JSON Schema:**
[
  {
    "organization_name_english": "string",
    "organization_name_chinese": "string",
    "document_section_title": "string | null",
    "metadata": {},
    "sub_organizations": [],
    "positions": [
      {
        "title_english": "string",
        "title_chinese": "string",
        "metadata": { "count": "integer | null", "list_order_note": "string | null" },
        "personnel": [
          {
            "name_pinyin": "string",
            "name_chinese": "string",
            "dob_year": "integer | null",
            "dob_month": "integer | null",
            "assumed_office_date": "YYYY-MM-DD" | "YYYY-MM" | "YYYY" | null,
            "cross_reference_symbols": ["string"],
            "gender": "male" | "female",
            "ethnicity": "string",
            "rank": "string | null",
            "rank_chinese": "string | null",
            "other_notes": ["string"]
          }
        ]
      }
    ]
  }
]

**Detailed Parsing Rules:**
- **Hierarchy:** Capture the nested structure of organizations, sub-organizations, and positions.
- **`dob_year` / `dob_month`**: Parse from `(YY.MM)`. A `YY` < 30 is 20xx; otherwise, it's 19xx.
- **`assumed_office_date`**: Parse from the `YY.MM.DD` format into ISO 8601 `YYYY-MM-DD`.
- **`cross_reference_symbols`**: Collect any leading `☆`, `※`, `◎`, `○` symbols.
- **`gender`**: If `(f)` or `(女)` is present, set to "female". **Default is "male".**
- **`ethnicity`**: If an ethnicity like `(Mongolian)` or `(蒙古族)` is present, record it. **Default is "Han".**
- **`rank` & `rank_chinese`**: Map abbreviations like `(Gen)` to `rank` and the Chinese `(上将)` to `rank_chinese`. **Default is `null` if no rank is specified.**
- **`other_notes`**: Place any other parenthetical notes like `(executive)` or `(SPC)` here.
- **Continuations:** If the page seems to continue a list from a previous page (e.g., starts with a list of names without a new header), structure the JSON as if it belongs to the last-mentioned organization/position. The top-level object in your response should reflect this context.
"""

In [18]:
def preprocess_pdf_page(page, row_height=10):
    """
    Extracts text from a PDF page and attempts to reconstruct the two-column layout
    into a single, coherent text block for the LLM.

    Text blocks are grouped into horizontal "rows" by rounding the vertical
    centre of each block to the nearest multiple of ``row_height``. Within a
    row, blocks whose left edge lies left of the page centre go to the left
    column and all others to the right column. A row that has text in both
    columns is rendered as ``"<left> || <right>"``.

    Args:
        page: Object exposing ``get_text("blocks")`` and ``rect.width``
            (a PyMuPDF page in this notebook).
        row_height: Size of the vertical bucket, in the same units as the
            block coordinates, used to decide that two blocks share a row.
            Must be positive. Defaults to 10, the value previously hard-coded.

    Returns:
        str: One line per row, top to bottom, joined with newlines. Empty
        string when the page has no text blocks.
    """
    page_center = page.rect.width / 2

    # sorted() orders blocks top-to-bottom, then left-to-right, without
    # mutating the list returned by get_text().
    ordered_blocks = sorted(page.get_text("blocks"), key=lambda b: (b[1], b[0]))

    rows = {}
    for block in ordered_blocks:
        # PyMuPDF block tuples carry a block type at index 6 (0 = text,
        # 1 = image). Skip non-text blocks defensively so image placeholders
        # never end up in the prompt.
        if len(block) > 6 and block[6] != 0:
            continue

        x0, y0, _x1, y1, raw_text = block[:5]
        text = raw_text.strip().replace('\n', ' ')
        if not text:
            continue

        y_center = (y0 + y1) / 2
        y_key = round(y_center / row_height) * row_height
        row = rows.setdefault(y_key, {'left': [], 'right': []})
        side = 'left' if x0 < page_center else 'right'
        row[side].append(text)

    processed_text = []
    for y_key in sorted(rows):
        left_text = " ".join(rows[y_key]['left'])
        right_text = " ".join(rows[y_key]['right'])
        if left_text and right_text:
            processed_text.append(f"{left_text} || {right_text}")
        elif left_text:
            processed_text.append(left_text)
        elif right_text:
            processed_text.append(right_text)
    return "\n".join(processed_text)

def get_json_from_llm(page_text, model_name, host):
    """
    Sends the pre-processed page text to a specific Ollama host and gets a JSON response.

    The request uses SYSTEM_PROMPT as the system message, the page text as the
    user message, temperature 0.0 and Ollama's JSON output format.

    Args:
        page_text: Text of a single page (as produced by preprocess_pdf_page).
        model_name: Name of the Ollama model to query.
        host: URL of the Ollama server.

    Returns:
        The value decoded from the model's reply with json.loads, or None when
        the request failed or the reply was not valid JSON. Errors are printed
        rather than raised so that one bad page does not abort a long run.
    """
    # Raw reply text, kept outside the try block so the error handlers can
    # show it without probing locals().
    content = None
    try:
        # Create a client that points to your specific Ollama server
        client = ollama.Client(host=host)

        # Use the client to make the chat request
        response = client.chat(
            model=model_name,
            messages=[
                {'role': 'system', 'content': SYSTEM_PROMPT},
                {'role': 'user', 'content': page_text}
            ],
            options={'temperature': 0.0},
            format='json'
        )
        content = response['message']['content']
        return json.loads(content)

    except ollama.ResponseError as e:
        print(f"   - An error occurred with the Ollama API: {e.error}")
        print(f"   - Status code: {e.status_code}")
        return None
    except json.JSONDecodeError as e:
        # The server answered, but the text could not be parsed as JSON.
        print(f"   - The model's reply was not valid JSON: {e}")
        print(f"   - Raw response content: {content}")
        return None
    except Exception as e:
        # Deliberate best-effort: report and move on to the next page.
        print(f"   - An unexpected error occurred: {e}")
        print(f"   - Raw response content: {content if content is not None else 'N/A'}")
        return None
    
def stitch_json_results(all_pages_data):
    """
    Merges the list of JSON objects from each page into a single,
    hierarchically correct JSON structure.

    Consecutive organization records with the same 'organization_name_english'
    are merged into one. When the first position of the incoming record has the
    same 'title_english' as the last position already collected, its personnel
    are appended to that position (a list continued across a page break) and
    the remaining positions follow; otherwise all incoming positions are
    appended. Incoming 'sub_organizations' are appended as well.

    The input is never modified: records are deep-copied before being merged,
    so calling this function again on the same data gives the same result.

    Args:
        all_pages_data: List with one entry per page. Each entry is normally a
            list of organization dicts. Falsy entries are skipped, a single
            dict is treated as one organization record, and entries or items of
            any other type are ignored.

    Returns:
        list: Merged organization dicts in document order. Empty list when
        there is nothing to merge.
    """
    import copy  # local import so this cell does not depend on the notebook's import cell

    if not all_pages_data:
        return []

    final_data = []
    for page_data in all_pages_data:
        if not page_data:
            continue
        # The model may answer with a single JSON object instead of an array.
        if isinstance(page_data, dict):
            page_data = [page_data]
        if not isinstance(page_data, list):
            continue

        for org_data in page_data:
            if not isinstance(org_data, dict):
                continue
            # Work on a copy so that extending lists below never touches the caller's data.
            incoming = copy.deepcopy(org_data)

            last_org = final_data[-1] if final_data else None
            continues_last_org = (
                last_org is not None
                and last_org.get('organization_name_english') == incoming.get('organization_name_english')
            )
            if not continues_last_org:
                final_data.append(incoming)
                continue

            new_positions = incoming.get('positions')
            if new_positions:
                known_positions = _list_field(last_org, 'positions')
                tail = known_positions[-1] if known_positions else None
                head = new_positions[0]
                same_title = (
                    isinstance(tail, dict)
                    and isinstance(head, dict)
                    and tail.get('title_english') == head.get('title_english')
                )
                if same_title:
                    # The position list was split by the page break: join the personnel.
                    _list_field(tail, 'personnel').extend(head.get('personnel') or [])
                    known_positions.extend(new_positions[1:])
                else:
                    known_positions.extend(new_positions)

            new_sub_orgs = incoming.get('sub_organizations')
            if new_sub_orgs:
                _list_field(last_org, 'sub_organizations').extend(new_sub_orgs)
    return final_data


def _list_field(record, key):
    """Return record[key], first replacing a missing or None value with a new empty list."""
    value = record.get(key)
    if value is None:
        value = []
        record[key] = value
    return value

In [None]:
# Run the whole pipeline: extract text page by page, ask the LLM for JSON,
# stitch the per-page results together and write them to OUTPUT_PATH.
print(f"Processing '{PDF_PATH}' with model '{MODEL_NAME}'...")

if not os.path.exists(PDF_PATH):
    print(f"Error: PDF file not found at '{PDF_PATH}'")
else:
    all_pages_data = []
    failed_pages = []  # 1-based numbers of pages for which no JSON could be obtained

    # The context manager closes the document even if a page raises midway.
    with fitz.open(PDF_PATH) as doc:
        num_pages = len(doc)

        for i, page in enumerate(doc):
            print(f"- Processing Page {i + 1} of {num_pages}...")
            page_text = preprocess_pdf_page(page)

            if not page_text.strip():
                print("  - Page is empty, skipping.")
                continue

            page_json = get_json_from_llm(page_text, MODEL_NAME, OLLAMA_HOST)

            if page_json is None:
                # get_json_from_llm has already printed the reason.
                print(f"  - Failed to extract JSON from page {i+1}.")
                failed_pages.append(i + 1)
            elif not page_json:
                # A valid but empty reply is not an error; there is just nothing to stitch.
                print(f"  - Model returned no entries for page {i+1}.")
            else:
                print(f"  - Successfully extracted JSON from page {i+1}.")
                all_pages_data.append(page_json)

    print("\nStitching JSON data from all pages...")
    final_stitched_data = stitch_json_results(all_pages_data)

    print(f"Writing final structured data to '{OUTPUT_PATH}'...")
    with open(OUTPUT_PATH, 'w', encoding='utf-8') as f:
        json.dump(final_stitched_data, f, ensure_ascii=False, indent=2)

    if failed_pages:
        # Make an incomplete result visible instead of leaving it buried in the log.
        print(f"\nWarning: no JSON was obtained for page(s): {failed_pages}")

    print("\nProcessing complete!")

Processing '/Users/lukasfiller/Library/CloudStorage/OneDrive-DKIAsia-PacificCenterforSecurityStudies/APCSS_Docs/topics/PLA Everything/China Directory 2024-2p5-50.pdf' with model 'phi4-reasoning:14b-plus-fp16'...
- Processing Page 1 of 46...
  - Successfully extracted JSON from page 1.
- Processing Page 2 of 46...
