In [1]:
%load_ext autoreload
%autoreload 2
from engine.parse_wiki_markup import wikimarkup_to_html

SyntaxError: unterminated f-string literal (detected at line 53) (parse_wiki_markup.py, line 53)

In [None]:
from engine.llm import clean_json_response, call_ollama

In [96]:
import time
import pandas as pd
# line_profiler injects '@profile' into the built-ins when run via kernprof
# If running normally, we need a dummy decorator to avoid NameError
try:
    @profile
    def func(): pass
except NameError:
    def profile(func): return func

In [None]:
import pandas as pd
import json
from io import StringIO
from typing import List, Dict, Optional, Any

# --- PROFILER SAFETY SHIM ---
# This allows the code to run even if you aren't using kernprof/line_profiler
try:
    @profile
    def _dummy(): pass
except NameError:
    def profile(func): return func
# ----------------------------

import requests
import json
import re

def clean_llm_json(text: str) -> str:
    """
    Strips markdown code blocks (```json ... ```) often added by LLMs.
    Returns the raw JSON string.
    """
    # specific regex to capture content inside ```json ... ``` or just ``` ... ```
    match = re.search(r"```(?:json)?\s*(.*)\s*```", text, re.DOTALL)
    if match:
        return match.group(1).strip()
    return text.strip()

def call_ollama(system_prompt: str, user_content: str, model: str = "llama3") -> str:
    """
    A real, fully fleshed-out call to a local Ollama instance.
    """
    url = "http://localhost:11434/api/chat"
    
    payload = {
        "model": model,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_content}
        ],
        "stream": False,
        "format": "json",  # CRITICAL: Forces the model to output valid JSON structure
        "options": {
            "temperature": 0.1,  # Low temp = more deterministic/factual
            "num_ctx": 4096      # Context window size
        }
    }

    try:
        response = requests.post(url, json=payload, timeout=30)
        response.raise_for_status()
        
        result = response.json()
        raw_content = result['message']['content']
        
        # Even with "format": "json", models sometimes wrap it in markdown
        return clean_llm_json(raw_content)

    except requests.exceptions.ConnectionError:
        print("❌ Error: Could not connect to Ollama. Is it running? (http://localhost:11434)")
        return "{}"
    except Exception as e:
        print(f"❌ LLM Error: {e}")
        return "{}"

# --- HOW TO USE ---
# extractor = WikiTableExtractor(call_ollama)



In [98]:
from engine.articles import yield_wiki_articles
from engine.parse_wiki_markup import parse_wiki_markup


def all_articles():
    BIG_FILE = r"C:\\Users\\Lenovo\\Downloads\\wikidump\\enwiki-20250501-pages-articles.xml.bz2"
    return yield_wiki_articles(BIG_FILE)

In [None]:
from typing import Iterator
from pandas import DataFrame
from engine.extractor import WikiTableExtractor
# Assuming these exist in your scope from previous steps:
extractor = WikiTableExtractor(call_ollama)


def has_rows(df: DataFrame) -> bool:
    return not df.empty

@profile
def person_location_year_df() -> Iterator[DataFrame]:
    tables_found = 0
    tables_attempted = 0

    # 1. Loop through the generator, don't just call next() once
    for article in all_articles():
        
        # Guard clause in case article format varies
        if 'text' not in article:
            continue

        parts = parse_wiki_markup(article['text'])
        
        # Safely get tables, defaulting to empty list if key missing
        tables = parts.get('tables', []) 

        for table in tables:
            tables_attempted += 1
            
            # Optional: Print status every 10 tables to reduce console noise
            if tables_attempted % 10 == 0:
                print(f"Attempts: {tables_attempted} | Found: {tables_found}")

            try:
                # 2. Conversion Pipeline
                html_table = wikimarkup_to_html(table)
                
                # 3. LLM + Pandas Extraction
                # Note: process_page_html returns an empty DF if no people found
                df = extractor.process_page_html(html_table)

                if has_rows(df):
                    tables_found += 1
                    yield df

            except Exception as e:
                # 4. Error Handling
                # If a specific table causes a crash (e.g., malformed HTML), 
                # log it and continue to the next table/article.
                print(f"Error processing table index {tables_attempted}: {e}")
                continue

In [100]:
import wikitextparser as wtp

# We redefine this function explicitly in the notebook to override the external file
def parse_wiki_markup(wiki_text):
    parsed = wtp.parse(wiki_text)
    
    # Extract tables
    tables = [t.string for t in parsed.tables]
    
    # Extract bullet lists
    bullet_groups = []
    for l in parsed.get_lists():
        bullet_groups.append(l.items)
        
    # Extract infoboxes
    infoboxes = []
    for t in parsed.templates:
        if "infobox" in t.name.lower():
            infoboxes.append(t.string)

    # Clean lists from text to avoid duplication
    for l in parsed.get_lists():
        l.string = ""

    # --- THE FIX IS HERE ---
    try:
        raw_text = parsed.plain_text().strip()
    except (AttributeError, Exception):
        # If wikitextparser crashes, return empty string so pipeline survives
        raw_text = ""
    # -----------------------

    return {
        "tables": tables,
        "bullets": bullet_groups,
        "infoboxes": infoboxes,
        "raw_text": raw_text
    }

print("Function patched successfully!")

Function patched successfully!


In [101]:
# 1. Initialize the generator
dataframes = person_location_year_df()

# 2. Define a helper to profile a single step
def step():
    try:
        return next(dataframes)
    except StopIteration:
        return None

# 3. Run the profiler on the helper
%lprun -f extractor.process_page_html -f step step()

Attempts: 10 | Found: 0
Attempts: 20 | Found: 0


Timer unit: 1e-07 s

Total time: 601.919 s
File: C:\Users\Lenovo\AppData\Local\Temp\ipykernel_7588\2204833369.py
Function: step at line 5

Line #      Hits         Time  Per Hit   % Time  Line Contents
     5                                           def step():
     6         1         47.0     47.0      0.0      try:
     7         1 6019190539.0 6.02e+09    100.0          return next(dataframes)
     8                                               except StopIteration:
     9                                                   return None

Total time: 597.316 s
File: C:\Users\Lenovo\AppData\Local\Temp\ipykernel_7588\4033124426.py
Function: WikiTableExtractor.process_page_html at line 127

Line #      Hits         Time  Per Hit   % Time  Line Contents
   127                                               @profile  # <--- PROFILING POINT 3: Measures Pandas operations vs LLM wait time
   128                                               def process_page_html(self, html_content: str) -> pd