In [8]:
import os
import gzip
import shutil
import requests
import xml.etree.ElementTree as ET
import json

# Wikipedia dump URL
WIKI_URL = "https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-abstract.xml.gz"
WIKI_FILE_GZ = "enwiki-latest-abstract.xml.gz"
WIKI_FILE = "enwiki-latest-abstract.xml"
OUTPUT_FILE = "wiki_data.json"

# Step 1: Download Wikipedia dump
def download_wikipedia_dump():
    """Download the gzipped Wikipedia abstract dump unless it already exists.

    The archive is streamed from WIKI_URL into a temporary ``.part`` file and
    only moved to WIKI_FILE_GZ once the transfer has finished, so an
    interrupted or failed download is never mistaken for a complete one on
    the next run.

    Raises:
        requests.HTTPError: if the server responds with an error status.
        requests.RequestException: on connection problems or timeouts.
    """
    if os.path.exists(WIKI_FILE_GZ):
        print("Wikipedia dump already downloaded.")
        return

    print("Downloading Wikipedia abstract dump...")
    partial_path = WIKI_FILE_GZ + ".part"
    try:
        # (connect timeout, read timeout): the read timeout bounds the wait
        # between chunks, not the total duration of the download.
        with requests.get(WIKI_URL, stream=True, timeout=(10, 60)) as response:
            # Without this check an HTML error page would be saved as the dump.
            response.raise_for_status()
            with open(partial_path, "wb") as file:
                shutil.copyfileobj(response.raw, file)
        os.replace(partial_path, WIKI_FILE_GZ)
    finally:
        # After a successful os.replace the partial file no longer exists;
        # anything still present here is the residue of a failed download.
        if os.path.exists(partial_path):
            os.remove(partial_path)
    print("Download completed.")

# Step 2: Extract the Wikipedia dump
def extract_wikipedia_dump():
    """Decompress WIKI_FILE_GZ into WIKI_FILE unless it already exists.

    The data is written to a temporary ``.part`` file and moved into place
    only after decompression succeeds, so a failed or interrupted extraction
    does not leave a truncated XML file that later runs would treat as
    complete.

    Raises:
        OSError: if the archive cannot be read (including a corrupt gzip
            stream) or the output cannot be written.
    """
    if os.path.exists(WIKI_FILE):
        print("Wikipedia dump already extracted.")
        return

    print("Extracting Wikipedia dump...")
    partial_path = WIKI_FILE + ".part"
    try:
        with gzip.open(WIKI_FILE_GZ, "rb") as f_in, open(partial_path, "wb") as f_out:
            shutil.copyfileobj(f_in, f_out)
        os.replace(partial_path, WIKI_FILE)
    finally:
        # After a successful os.replace the partial file no longer exists;
        # anything still present here is the residue of a failed extraction.
        if os.path.exists(partial_path):
            os.remove(partial_path)
    print("Extraction completed.")

# Step 3: Parse Wikipedia XML and save as JSON
def parse_wiki_xml(file_path, max_entries=1000000):
    """Stream-parse an abstract dump and collect title/abstract pairs.

    Args:
        file_path: Path (or binary file object) of the XML dump. Every
            ``<doc>`` element is expected to carry ``<title>`` and
            ``<abstract>`` children.
        max_entries: Maximum number of ``<doc>`` elements to collect. Values
            of zero or less yield an empty list.

    Returns:
        A list of ``{"Title": str, "Abstract": str}`` dicts in document
        order. A missing or empty child element is reported as ``""``.
    """
    entries = []
    if max_entries <= 0:
        return entries

    root = None
    for event, elem in ET.iterparse(file_path, events=("start", "end")):
        if event == "start":
            # The first "start" event belongs to the document root; keep it
            # so processed children can be detached from it later.
            if root is None:
                root = elem
            continue

        if elem.tag != "doc":
            continue

        # findtext returns the default for a missing child and "" for a
        # child without text, so both fields are always strings.
        entries.append({
            "Title": elem.findtext("title", default=""),
            "Abstract": elem.findtext("abstract", default=""),
        })

        # Clearing only `elem` would leave one empty <doc> per entry attached
        # to the root; clearing the root drops them and keeps memory flat.
        root.clear()

        count = len(entries)
        if count % 100000 == 0:
            print(f"Processed {count} entries...")

        if count >= max_entries:
            break

    return entries

if __name__ == "__main__":
    # Make sure the raw dump is available locally, first as the .gz archive
    # and then as plain XML.
    download_wikipedia_dump()
    extract_wikipedia_dump()

    # Read up to one million <doc> records into memory.
    print("Parsing Wikipedia XML file...")
    wiki_data = parse_wiki_xml(file_path=WIKI_FILE, max_entries=1_000_000)

    # Persist the records as pretty-printed UTF-8 JSON; ensure_ascii=False
    # keeps non-ASCII characters readable instead of \u-escaping them.
    print(f"Saving {len(wiki_data)} entries to {OUTPUT_FILE}...")
    with open(OUTPUT_FILE, mode="w", encoding="utf-8") as out_file:
        json.dump(wiki_data, out_file, indent=2, ensure_ascii=False)

    print("Processing complete!")

Wikipedia dump already downloaded.
Wikipedia dump already extracted.
Parsing Wikipedia XML file...
Processed 100000 entries...
Processed 200000 entries...
Processed 300000 entries...
Processed 400000 entries...
Processed 500000 entries...
Processed 600000 entries...
Processed 700000 entries...
Processed 800000 entries...
Processed 900000 entries...
Processed 1000000 entries...
Saving 1000000 entries to wiki_data.json...
Processing complete!
