In [None]:
import sys
sys.path.append('../..')
import json
import os
import re
from collections import Counter
from datetime import datetime
from xml.etree.ElementTree import iterparse

from tqdm import tqdm

from utils.word_freq import *

def timestamp_log(msg):
    """Print ``msg`` prefixed with the current local wall-clock time.

    The emitted line has the form ``[HH:MM:SS] | <msg>``.

    Args:
        msg: The message to report. It is interpolated into an f-string, so
            any object with a string representation is accepted.

    Returns:
        None.
    """
    # A named function (rather than a lambda bound to a name) shows up with a
    # meaningful name in tracebacks and can carry this docstring.
    print(f"[{datetime.now().strftime('%H:%M:%S')}] | {msg}")

# XML dump that is stream-parsed with iterparse in the processing cell.
# The path is relative, so it resolves against the notebook's working directory.
INPUT_PATH = "../../raw_data/thwiki/thwiki-latest-pages-articles.xml"
# Destination JSON file that receives the computed word frequencies.
OUTPUT_PATH = "../../artifacts/word_freq/thwiki.json"

In [None]:
# Thai Wikipedia
timestamp_log("Processing Thai Wikipedia Corpus...")


def _iter_closed_elements(xml_path):
    """Stream an XML file, yielding each element once its closing tag is parsed.

    After the consumer has finished with a yielded element (i.e. when the next
    item is requested), the element is cleared, and whenever a direct child of
    the document root has been closed the root itself is cleared as well.
    Without that second step the emptied children stay attached to the root
    for the whole run and memory grows with the size of the file.

    Args:
        xml_path: Path of the XML file to parse.

    Yields:
        One element per closing tag, in document order. The element is only
        valid until the next item is requested.
    """
    root = None
    depth = 0
    # 'start' events are needed only to capture the root and to track nesting
    # depth; they are never yielded, so the number of yielded items equals the
    # number of 'end' events.
    for event, elem in iterparse(xml_path, events=('start', 'end')):
        if event == 'start':
            if root is None:
                root = elem
            depth += 1
            continue

        depth -= 1
        yield elem

        # The consumer is done with this element: release its contents.
        elem.clear()
        if depth == 1:
            # A top-level child of the root just closed; detach it from the
            # root so finished subtrees do not pile up.
            root.clear()


# Get the total number of elements for the tqdm progress bar.
# This is a full extra pass over the file; it only counts and frees elements.
total_elements = sum(1 for _ in _iter_closed_elements(INPUT_PATH))

timestamp_log(f"Total number of items to process: {total_elements}")

word_counter = Counter()
# Stream-process the large XML file instead of loading it into memory at once.
for elem in tqdm(_iter_closed_elements(INPUT_PATH), total=total_elements):
    # Article contents are enclosed in a <text> tag (the tag may carry an XML
    # namespace prefix, hence the suffix match). Empty <text> tags are skipped.
    if elem.tag.endswith('text') and elem.text is not None:
        # 1. Text Cleaning
        cleaned_text = text_cleaning(elem.text)
        # 2. Text Tokenization
        tokens = text_tokenize(cleaned_text)
        # 3. Word Counting
        word_counter.update(tokens)

timestamp_log("Finished processing the corpus.")

In [None]:
# Post-processing stage. Neither helper below is defined in this notebook
# (presumably both come from the `utils.word_freq` star import — confirm).
timestamp_log("Filtering Counter...")
# Derives a filtered result from the raw `word_counter`; the filtering criteria
# live inside `filter_token_counter` and are not visible here.
filtered_counter = filter_token_counter(word_counter)

timestamp_log("Calculating Frequency...")
# NOTE(review): presumably converts counts into relative frequencies — verify
# against the helper. The result is what the final cell serializes to JSON.
word_frequency = normalize_frequency(filtered_counter)

timestamp_log("Done!")

In [None]:
# Make sure the destination directory exists before writing, so a missing
# folder does not make the save fail after the long corpus pass.
output_dir = os.path.dirname(OUTPUT_PATH)
if output_dir:  # dirname is '' for a bare file name; makedirs('') would raise
    os.makedirs(output_dir, exist_ok=True)

# ensure_ascii=False writes non-ASCII characters as-is instead of \uXXXX
# escapes, which keeps the file human-readable; the file is encoded as UTF-8.
with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
    json.dump(word_frequency, f, ensure_ascii=False, indent=2)

timestamp_log(f"Saved to: {OUTPUT_PATH}")