In [1]:
import json
import os
import re
import sys
from collections import Counter
from datetime import datetime
from xml.etree.ElementTree import iterparse

from tqdm import tqdm

sys.path.append('../..')
from utils.word_freq import *

def timestamp_log(msg):
    """Print ``msg`` prefixed with the current local time as ``[HH:MM:SS] | ``.

    Args:
        msg: The message to print after the timestamp prefix.
    """
    print(f"[{datetime.now().strftime('%H:%M:%S')}] | {msg}")


# Location of the Thai Wikipedia XML dump that is read, and of the JSON
# word-frequency table that is written (both relative to this notebook).
INPUT_PATH = "../../raw_data/thwiki/thwiki-latest-pages-articles.xml"
OUTPUT_PATH = "../../artifacts/word_freq/thwiki.json"

In [2]:
# Thai Wikipedia
timestamp_log("Processing Thai Wikipedia Corpus...")

# First pass: count the 'end' events so the tqdm progress bar of the second
# pass can be given a total.
total_elements = 0
for _event, elem in iterparse(INPUT_PATH, events=('end',)):
    total_elements += 1
    # Clear the element to free up memory
    elem.clear()

timestamp_log(f"Total number of items to process: {total_elements}")

word_counter = Counter()
# Second pass: use iterparse to stream-process the large XML file.
# This is to avoid loading the entire 3GB file into memory.
for _event, elem in tqdm(iterparse(INPUT_PATH, events=('end',)), total=total_elements):
    # Article contents are enclosed in a <text> tag. Compare the local name
    # (whatever follows an optional '{namespace}' prefix) exactly, so that a
    # tag whose name merely ends in "text" is not mistaken for article text.
    if elem.tag.rpartition('}')[2] == 'text' and elem.text is not None:
        # 1. Text Cleaning
        cleaned_text = text_cleaning(elem.text)
        # 2. Text Tokenization
        tokens = text_tokenize(cleaned_text)
        # 3. Word Counting
        word_counter.update(tokens)

    # Clear every element once it has been handled, not only <text>: any
    # element left uncleared keeps its text and children alive for the rest
    # of the parse, which defeats the purpose of streaming the file.
    elem.clear()

timestamp_log("Finished processing the corpus.")

[18:59:53] | Processing Thai Wikipedia Corpus...
[19:00:24] | Total number of items to process: 9663547


100%|██████████| 9663547/9663547 [23:49<00:00, 6761.21it/s]  

[19:24:13] | Finished processing the corpus.





In [3]:
timestamp_log("Filtering Counter...")
# A token survives only if it contains at least one Thai/Latin character;
# this drops tokens such as '...' and '-'.
has_thai_or_latin = re.compile(r'[\u0E01-\u0E3A\u0E40-\u0E4E\u0E50-\u0E59a-zA-Z]').search
filtered_counter = Counter()
for token, occurrences in tqdm(word_counter.items()):
    if not has_thai_or_latin(token):
        continue
    filtered_counter[token] = occurrences

timestamp_log("Calculating Frequency...")
# Relative frequency = occurrences of the token / occurrences of all kept tokens.
total_counts = filtered_counter.total()
word_frequency = {
    token: occurrences / total_counts
    for token, occurrences in tqdm(filtered_counter.items())
}

timestamp_log("Done!")

[19:24:13] | Filtering Counter...


100%|██████████| 205484/205484 [00:00<00:00, 635280.16it/s]


[19:24:13] | Calculating Frequency...


100%|██████████| 205442/205442 [00:00<00:00, 878518.62it/s]

[19:24:14] | Done!





In [4]:
# Create the output directory first: open(..., "w") does not create missing
# parent directories, and failing here would come only after the long
# corpus pass has already finished.
output_dir = os.path.dirname(OUTPUT_PATH)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# ensure_ascii=False keeps non-ASCII tokens as readable UTF-8 text in the file
# instead of \uXXXX escapes.
with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
    json.dump(word_frequency, f, ensure_ascii=False, indent=2)

timestamp_log(f"Saved to: {OUTPUT_PATH}")

[19:24:14] | Saved to: ../../artifacts/word_freq/thwiki.json
