In [1]:
import pickle
import re
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List, Tuple, Dict

import mwparserfromhell
import pandas as pd
import wptools
from langchain.docstore.document import Document
from langchain.vectorstores.utils import filter_complex_metadata
from tqdm import tqdm

In [2]:
def remove_references_and_comments(text: str) -> str:
    """
    Strip citation markup and comments that carry no value for an LLM.

    Removes, in order:
      1. self-closing references such as ``<ref name="x" />``
      2. paired ``<ref ...>...</ref>`` references (may span several lines)
      3. HTML comments ``<!-- ... -->``
      4. simple ``{{refn|...}}`` / ``{{Refn|...}}`` templates (no nested braces)

    Args:
        text: Raw or partially cleaned wikitext.

    Returns:
        The text with the markup above removed and surrounding whitespace
        stripped.
    """
    # Self-closing refs must go first: otherwise the paired pattern below treats
    # `<ref name="x" />` as an opening tag and swallows all the real text up to
    # the next `</ref>`.
    text = re.sub(r"<ref\b[^>]*/\s*>", "", text, flags=re.IGNORECASE)
    # Paired refs, including multiline ones. `\b` keeps `<references />` from
    # being mistaken for a `<ref>` opening tag.
    text = re.sub(
        r"<ref\b[^>]*>.*?</ref\s*>", "", text, flags=re.DOTALL | re.IGNORECASE
    )
    # Remove HTML comments <!-- ... -->
    text = re.sub(r"<!--.*?-->", "", text, flags=re.DOTALL)
    # Remove leftover {{refn|...}} templates. `[^}]*` stops at the first `}`,
    # so templates nested inside a refn are not fully handled.
    text = re.sub(r"\{\{[Rr]efn[^}]*\}\}", "", text, flags=re.DOTALL)
    return text.strip()

def parse_infobox(template) -> dict:
    """
    Flatten one Infobox template into a ``{parameter name: plain-text value}`` dict.

    Each parameter name is converted to ``str`` and whitespace-trimmed. Each
    value has its wiki markup stripped via ``strip_code`` and is then passed
    through ``remove_references_and_comments``. If a parameter name occurs more
    than once, the last occurrence wins.
    """
    return {
        str(field.name).strip(): remove_references_and_comments(
            field.value.strip_code(normalize=True, collapse=True)
        )
        for field in template.params
    }

def parse_table(table_node) -> List[List[str]]:
    """
    Turn one table tag into rows of plain-text cells (a list of lists, CSV-like).

    The table's *contents* are parsed (not ``str(table_node)``) so the outer
    table wrapper is not re-parsed. Every ``tr`` tag found becomes a row, and
    every ``td``/``th`` tag found inside it becomes a cell whose markup,
    references and comments are stripped. Rows with no cells are dropped.
    """

    def _is_row(node):
        return node.tag == 'tr'

    def _is_cell(node):
        return node.tag in ['td', 'th']

    def _cell_text(cell_node):
        # Wiki markup -> plain text, then drop leftover references/comments.
        plain = cell_node.contents.strip_code(normalize=True, collapse=True)
        return remove_references_and_comments(plain)

    inner = mwparserfromhell.parse(table_node.contents)

    extracted = []
    for row_node in inner.filter_tags(matches=_is_row):
        # Parse only what is inside this row before looking for its cells.
        row_code = mwparserfromhell.parse(row_node.contents)
        texts = [_cell_text(c) for c in row_code.filter_tags(matches=_is_cell)]
        if texts:
            extracted.append(texts)

    return extracted

def parse_wikitext_for_content_infobox_tables(wikitext: str) -> Tuple[str, List[Dict], List[List[List[str]]]]:
    """
    Split raw wikitext into three views using mwparserfromhell.

    Returns a tuple ``(textual_content, infobox_dicts, tables_list_of_lists)``:
      * textual_content: the page with wiki markup stripped and references /
        comments removed
      * infobox_dicts: one dict per template whose lowercased name contains
        "infobox", as produced by ``parse_infobox``
      * tables_list_of_lists: one list-of-rows per ``table`` tag, as produced
        by ``parse_table``; tables that yield no rows are omitted
    """
    wikicode = mwparserfromhell.parse(wikitext)

    # Plain article text.
    plain_text = remove_references_and_comments(
        wikicode.strip_code(normalize=True, collapse=True)
    )

    # Infobox templates -> key/value dicts.
    infoboxes = [
        parse_infobox(tpl)
        for tpl in wikicode.filter_templates()
        if "infobox" in tpl.name.lower()
    ]

    # Table tags -> rows of cells, keeping only tables that produced rows.
    candidate_tables = (
        parse_table(tag)
        for tag in wikicode.filter_tags(matches=lambda node: node.tag == 'table')
    )
    tables = [rows for rows in candidate_tables if rows]

    return plain_text, infoboxes, tables

def fetch_wikipedia_content(title: str) -> List[Document]:
    """
    Download one English Wikipedia page via wptools and turn it into Documents.

    The page's wikitext is split by ``parse_wikitext_for_content_infobox_tables``
    and up to three Documents are produced, each tagged with the page title and
    a ``type`` of ``"text"``, ``"infoboxes"`` or ``"tables"``. A part that is
    empty produces no Document.

    Returns:
        The Documents after ``filter_complex_metadata``. An empty list is
        returned when the page has no wikitext (a warning is printed) or when
        any exception is raised along the way (the error is printed).
    """
    try:
        wiki_page = wptools.page(title, lang="en", silent=True)
        wiki_page.get_parse(show=False)
        raw_wikitext = wiki_page.data.get("wikitext")

        # Nothing to parse: report it and hand back an empty result.
        if not raw_wikitext:
            print(f"Warning: No text content found for '{title}'.")
            return []

        body, boxes, grids = parse_wikitext_for_content_infobox_tables(raw_wikitext)
        collected = []

        # (1) Main article text
        body = body.strip()
        if body:
            collected.append(
                Document(page_content=body, metadata={"title": title, "type": "text"})
            )

        # (2) Infoboxes: one "Infobox #n:" header followed by indented key/value lines
        if boxes:
            rendered_boxes = [
                f"Infobox #{n}:\n"
                + "".join(f"  {key}: {val}\n" for key, val in box.items())
                for n, box in enumerate(boxes, 1)
            ]
            collected.append(
                Document(
                    page_content="\n\n".join(rendered_boxes),
                    metadata={"title": title, "type": "infoboxes"},
                )
            )

        # (3) Tables: one "Table #n" header followed by ' | '-separated rows
        if grids:
            rendered_tables = [
                f"Table #{n}\n"
                + "".join(" | ".join(cells) + "\n" for cells in grid)
                for n, grid in enumerate(grids, 1)
            ]
            collected.append(
                Document(
                    page_content="\n\n".join(rendered_tables),
                    metadata={"title": title, "type": "tables"},
                )
            )

        # Filter out any complex metadata before returning.
        return filter_complex_metadata(collected)

    except Exception as e:
        print(f"Error fetching '{title}': {e}")
        return []

In [3]:
# Input mapping file; it has the columns QID and Wikipedia_Title.
csv_file = "../data/qid_to_titles.csv"
df = pd.read_csv(csv_file)

# Accumulators for the fetch step: every Document collected so far, and the
# number of titles that yielded at least one Document.
all_docs, successful_fetches = [], 0

In [4]:
# Define the number of worker threads
MAX_WORKERS = 20  # Adjust based on your requirements and Colab's capabilities

# Reset the accumulators here so that re-running this cell starts from a clean
# slate instead of appending to (and double-counting) a previous run's results.
all_docs = []
successful_fetches = 0

# Only the title column is needed, so read it directly instead of building a
# full row object per record with iterrows().
titles = df["Wikipedia_Title"].tolist()

with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    # Submit one fetch per title; the dict maps each future back to its title.
    future_to_title = {
        executor.submit(fetch_wikipedia_content, title): title
        for title in titles
    }

    # Consume results as they finish (completion order, not CSV order), with a
    # progress bar.
    for future in tqdm(as_completed(future_to_title), total=len(future_to_title), desc="Fetching Wikipedia content"):
        docs = future.result()
        # An empty list means the page had no content or the fetch failed.
        if docs:
            all_docs.extend(docs)
            successful_fetches += 1

# The count is per title that produced at least one Document, out of all CSV rows.
print(f"Successfully fetched {successful_fetches} documents out of {len(df)}.")

Fetching Wikipedia content: 100%|██████████| 13891/13891 [13:39<00:00, 16.96it/s]

Successfully fetched 13891 documents out of 13891.





In [5]:
pickle_file_path = '../data/all_docs_final.pkl'

# Persist the fetched documents so the long fetch step need not be repeated.
# (`pickle` is imported in the notebook's top import cell.)
with open(pickle_file_path, "wb") as f:
    pickle.dump(all_docs, f)

# Report success only after the `with` block has closed the file.
print(f"Successfully saved all_docs to {pickle_file_path}")

Successfully saved all_docs to ../data/all_docs_final.pkl
