In [1]:
import pandas as pd
from tqdm import tqdm
from langchain.vectorstores.utils import filter_complex_metadata
from concurrent.futures import ThreadPoolExecutor, as_completed
import wptools
from langchain.docstore.document import Document
from typing import List
import re

In [2]:
def _infobox_to_text(template) -> str:
    """Render one parsed infobox template as a plain-text "key: value" block."""
    # strip_code() renders the value without templates and other wiki markup,
    # so only the human-readable text is kept for each parameter.
    pairs = [
        f"{param.name.strip()}: {param.value.strip_code().strip()}"
        for param in template.params
    ]
    return "Infobox Content\n" + "\n".join(pairs)


def fetch_wikipedia_content(title: str) -> "List[Document]":
    """
    Fetch the infoboxes of a Wikipedia page as LangChain Documents.

    The page's wikitext is retrieved with wptools and parsed with
    mwparserfromhell. Every template whose name starts with "infobox"
    (case-insensitive) becomes one Document whose text is a header line
    followed by one "key: value" line per template parameter.

    Args:
        title: Title of the Wikipedia page to fetch.

    Returns:
        A list with one Document per infobox, each carrying
        ``{"title": title}`` as metadata. The list is empty when the page has
        no wikitext, contains no infobox, or when fetching/parsing fails.

    Notes:
        Failures are deliberately best-effort: any exception raised while
        fetching or parsing is printed and turned into an empty list so that
        a single bad title does not abort a bulk run.
    """
    # wptools and Document come from the notebook's import cell; only
    # mwparserfromhell is not imported there, so it is imported locally.
    import mwparserfromhell

    try:
        # Fetch the page content
        page = wptools.page(title, silent=True).get_parse()
        wikitext = page.data.get('wikitext', '')

        if not wikitext:
            return []

        # Parse the wikitext and keep only templates named "Infobox ..."
        templates = mwparserfromhell.parse(wikitext).filter_templates()
        return [
            Document(
                page_content=_infobox_to_text(template),
                metadata={"title": title}
            )
            for template in templates
            if template.name.lower().strip().startswith('infobox')
        ]

    except Exception as e:
        print(f"Error fetching or parsing {title}: {e}")
        return []

In [3]:
# Input table of pages to fetch; the file has columns: QID, Wikipedia_Title
csv_file = "../data/qid_to_titles.csv"
df = pd.read_csv(filepath_or_buffer=csv_file)

# Accumulators filled by the fetch loop:
#   all_docs           - every Document collected so far
#   successful_fetches - counter for successful fetches
all_docs, successful_fetches = [], 0

In [4]:
# Define the number of worker threads
MAX_WORKERS = 40  # Adjust based on your requirements and Colab's capabilities

# Start from empty accumulators so that re-running this cell does not append a
# second copy of every document or double the success counter.
all_docs = []
successful_fetches = 0

with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    # Submit one fetch task per title; only the title column is needed,
    # so iterate it directly instead of materialising every row.
    future_to_title = {
        executor.submit(fetch_wikipedia_content, title): title
        for title in df["Wikipedia_Title"]
    }

    # Use tqdm to display the progress bar; futures are consumed in
    # completion order, so the order of all_docs is not the CSV order.
    for future in tqdm(as_completed(future_to_title), total=len(future_to_title), desc="Fetching Wikipedia content"):
        docs = future.result()
        if docs:
            all_docs.extend(docs)
            # Counts titles that produced at least one infobox Document.
            successful_fetches += 1

print(f"Successfully fetched {successful_fetches} documents out of {len(df)}.")

Fetching Wikipedia content: 100%|██████████| 13891/13891 [08:30<00:00, 27.21it/s]

Successfully fetched 13260 documents out of 13891.





In [5]:
import pickle

# Destination for the serialized list of fetched Documents.
pickle_file_path = '../data/all_docs_infoboxes_final.pkl'

# After fetching all_docs, persist them so later steps can reload the list
# without repeating the fetch.
with open(pickle_file_path, "wb") as f:
    pickle.dump(all_docs, f)

# Report success only once the with-block has flushed and closed the file.
print(f"Successfully saved all_docs to {pickle_file_path}")

Successfully saved all_docs to ../data/all_docs_infoboxes_final.pkl
