# Wikipedia Word List Collector

This code tries to extract lemmas (word tokens) from a Wikipedia dump file and writes 26 text files, each containing the set of words that begin with a particular letter.

In [None]:

# Mount Google Drive at /content/drive so the paths used below
# (under /content/drive/MyDrive) are readable and writable from this notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import xml.etree.ElementTree as ET
import re
from collections import defaultdict
import os
import time
import html

# Paths
# Input: the idwiki "pages-articles" XML dump stored on the mounted Drive.
xml_file_path = "/content/drive/MyDrive/tugas-akhir/idwiki-latest-pages-articles.xml"
# Output: directory that receives one word-list file per letter (a.txt ... z.txt).
output_dir = "/content/drive/MyDrive/tugas-akhir/dicts/tokenized-idwiki-dict"
# Create the output directory up front; exist_ok=True makes re-runs harmless.
os.makedirs(output_dir, exist_ok=True)

# Maps a lowercase first letter to the set of distinct tokens that start with it.
tokens_by_letter = defaultdict(set)

# Regular expressions.
# All patterns are compiled once at import time because clean_and_tokenize()
# is called for every page of the dump.
url_pattern = re.compile(r'\b(?:https?://|www\.|[a-zA-Z0-9.-]+\.[a-z]{2,})(/[^\s]*)?\b')
html_attr_pattern = re.compile(r'\b(?:style|class|id|color|width|height|bgcolor|align)="[^"]*"')
_html_tag_pattern = re.compile(r'<[^>]+>')
_wiki_link_pattern = re.compile(r'\[\[.*?\]\]')
# A template with no braces inside it; [^{}] also matches newlines, so this
# covers multi-line templates such as infoboxes.
_innermost_template_pattern = re.compile(r'\{\{[^{}]*\}\}')
# Single-line fallback for templates whose body contains stray single braces.
_inline_template_pattern = re.compile(r'\{\{.*?\}\}')
_table_cell_pattern = re.compile(r'\|.*?\|')
# Token separators: anything that is not a (Unicode) letter or digit.
_separator_pattern = re.compile(r'[\W_]+')
_ascii_word_pattern = re.compile(r'[a-zA-Z]+')


# Clean and tokenize
def clean_and_tokenize(text):
    """Strip markup from a chunk of wikitext and return its plain ASCII words.

    Processing order: HTML entities are unescaped, then HTML attributes, URLs,
    HTML tags, wiki links, wiki templates and table cells are removed, and the
    remainder is split into tokens.

    Args:
        text: Raw wikitext of one page, or None / "" for an empty page.

    Returns:
        A list of tokens in document order (case preserved, duplicates kept).
        Every token consists solely of ASCII letters. Tokens that contain a
        digit or a non-ASCII letter (e.g. "H2O", "café") are dropped as a
        whole rather than being split into letter fragments.
    """
    if not text:
        return []

    text = html.unescape(text)

    # Remove HTML attributes like color="blue", style="...", etc.
    text = html_attr_pattern.sub('', text)

    # Remove all kinds of links (http, www, google.com, etc.)
    text = url_pattern.sub('', text)

    # Remove wikitext and HTML tags
    text = _html_tag_pattern.sub(' ', text)       # HTML
    text = _wiki_link_pattern.sub(' ', text)      # Wiki links

    # Wiki templates: peel them from the inside out so that nested and
    # multi-line templates disappear completely. Each pass removes at least
    # one "{{" and the replacement contains no braces, so the loop terminates.
    while True:
        text, removed = _innermost_template_pattern.subn(' ', text)
        if not removed:
            break
    text = _inline_template_pattern.sub(' ', text)

    text = _table_cell_pattern.sub(' ', text)     # Table cells

    # Split on punctuation/whitespace only, keeping digits and non-ASCII
    # letters inside their tokens so that the filter below can reject the
    # whole token instead of leaving fragments such as "caf" from "café".
    tokens = _separator_pattern.split(text)

    # Keep only tokens made purely of ASCII letters (this also discards the
    # empty strings that split() yields at the edges).
    return [t for t in tokens if _ascii_word_pattern.fullmatch(t)]

# Count total pages.
# Progress is reported per page, so the denominator must be a page count, not
# the number of lines in the file. Any "<" inside article text is escaped as
# "&lt;" in the XML, so a raw "<page>" can only be a real opening tag; reading
# in binary mode avoids decoding the whole dump just to count.
print("Counting total pages...")
with open(xml_file_path, 'rb') as f:
    total_pages = sum(line.count(b'<page>') for line in f)
print(f"Total pages: {total_pages:,}\n")

# Parse XML
print("Starting XML parsing and tokenization...")
start_time = time.time()
page_count = 0
report_every = 50000

# Pages to leave out of the word list: one specific title, and every page
# whose revision was contributed by this user.
skip_title = "Berkas:Brigjen Patar Sahat Panggabean.jpeg"
skip_user = "Flow talk page manager"

skip_this_page = False

# Stream the document. The first event is the start of the root element; keep
# a handle on it so finished pages can be detached. Clearing only the page
# element would leave an empty element attached to the root for every page,
# which grows without bound over a multi-million-page dump.
context = iter(ET.iterparse(xml_file_path, events=('start', 'end')))
_, root = next(context)

for event, elem in context:
    # Element text is only guaranteed to be complete on the 'end' event.
    if event != 'end':
        continue

    # Drop the "{namespace}" prefix ElementTree puts in front of tag names.
    tag = elem.tag.split('}')[-1]

    if tag == 'title':
        if elem.text == skip_title:
            skip_this_page = True

    elif tag == 'username':
        if elem.text == skip_user:
            skip_this_page = True

    elif tag == 'text':
        if not skip_this_page:
            for token in clean_and_tokenize(elem.text or ""):
                tokens_by_letter[token[0].lower()].add(token)

    elif tag == 'page':
        page_count += 1
        skip_this_page = False  # reset for the next page
        elem.clear()
        root.clear()            # detach the finished page from the tree

        if page_count % report_every == 0:
            pct = (page_count / total_pages) * 100
            print(f"[Tokenizing] Page {page_count:,} / {total_pages:,} ({pct:.2f}%)")

print(f"\nTokenizing complete in {time.time() - start_time:.2f} seconds.\n")

# Emit one alphabetically sorted word list per letter (a.txt ... z.txt)
print("Writing token files...")
alphabet = "abcdefghijklmnopqrstuvwxyz"
for position, letter in enumerate(alphabet, start=1):
    target = os.path.join(output_dir, letter + ".txt")
    with open(target, "w", encoding="utf-8") as out:
        # One token per line, each terminated by a newline.
        out.writelines(word + "\n" for word in sorted(tokens_by_letter[letter]))
    print(f"[Filing] {letter}.txt ({position}/26)")

print("\n✅ Done! All token files saved to:", output_dir)


Counting total lines...
Estimated total lines: 95,438,835

Starting XML parsing and tokenization...
[Tokenizing] Line 50,000 / 95,438,835 (0.05%)
[Tokenizing] Line 100,000 / 95,438,835 (0.10%)
[Tokenizing] Line 150,000 / 95,438,835 (0.16%)
[Tokenizing] Line 200,000 / 95,438,835 (0.21%)
[Tokenizing] Line 250,000 / 95,438,835 (0.26%)
[Tokenizing] Line 300,000 / 95,438,835 (0.31%)
[Tokenizing] Line 350,000 / 95,438,835 (0.37%)
[Tokenizing] Line 400,000 / 95,438,835 (0.42%)
[Tokenizing] Line 450,000 / 95,438,835 (0.47%)
[Tokenizing] Line 500,000 / 95,438,835 (0.52%)
[Tokenizing] Line 550,000 / 95,438,835 (0.58%)
[Tokenizing] Line 600,000 / 95,438,835 (0.63%)
[Tokenizing] Line 650,000 / 95,438,835 (0.68%)
[Tokenizing] Line 700,000 / 95,438,835 (0.73%)
[Tokenizing] Line 750,000 / 95,438,835 (0.79%)
[Tokenizing] Line 800,000 / 95,438,835 (0.84%)
[Tokenizing] Line 850,000 / 95,438,835 (0.89%)
[Tokenizing] Line 900,000 / 95,438,835 (0.94%)
[Tokenizing] Line 950,000 / 95,438,835 (1.00%)
[Tokeniz