In [None]:
import re
from collections import Counter
from functools import reduce
from datetime import datetime

import pandas as pd
import numpy as np
from pythainlp.tokenize import word_tokenize
import matplotlib.pyplot as plt

def timestamp_log(msg):
    """Print ``msg`` prefixed with the current local time as ``[HH:MM:SS] | msg``."""
    clock = datetime.now().strftime('%H:%M:%S')
    print(f"[{clock}] | {msg}")

In [None]:
def text_cleaning(input_text: str) -> str:
    """Reduce raw text to allowed characters separated by single spaces.

    Allowed characters are the Thai Unicode block (U+0E00-U+0E7F, which
    contains the consonants, vowels, tone marks, Thai digits and the symbols
    ๆ and ฯ), ASCII letters, ASCII digits, '.' and ','. Every other character
    is replaced by a space, then runs of whitespace are collapsed and the
    result is trimmed.

    Args:
        input_text: Raw text. ``None`` and float NaN (how pandas represents an
            empty CSV field) are treated as empty text.

    Returns:
        The cleaned text; ``''`` when the input is missing or contains no
        allowed characters.

    Raises:
        TypeError: If ``input_text`` is any other non-string value.
    """
    # Missing values would otherwise make re.sub() raise TypeError.
    # NaN is the only float that is not equal to itself.
    if input_text is None or (isinstance(input_text, float) and input_text != input_text):
        return ''

    # Whitespace is not in the allowed set either, so tabs and newlines are
    # turned into plain spaces here. Replacing (rather than deleting) keeps
    # words on either side of a removed symbol from merging.
    text = re.sub(r'[^\u0E00-\u0E7Fa-zA-Z0-9.,]', ' ', input_text)

    # Collapse multiple whitespace characters into a single space.
    return re.sub(r'\s+', ' ', text).strip()

def text_tokenize(input_text: str) -> list[str]:
    """Split ``input_text`` into word tokens with PyThaiNLP's ``newmm`` engine.

    Whitespace tokens are not kept in the result (``keep_whitespace=False``).
    """
    tokens = word_tokenize(
        input_text,
        engine='newmm',
        keep_whitespace=False,
    )
    return tokens

def text_count(input_list: list[str]) -> Counter:
    """Tally how many times each token occurs in ``input_list``."""
    tally = Counter()
    tally.update(input_list)
    return tally

In [None]:
# Prachathai Corpus: build a relative word-frequency table from the article
# titles and bodies.
CORPUS_PATH = "../data/prachathai-67k/prachathai-67k.csv"
MAX_ROWS = 1000  # number of rows read from the top of the file; None reads all rows

timestamp_log("Reading Prachathai Corpus...")
df = pd.read_csv(
    CORPUS_PATH,
    sep=",",
    usecols=["title", "body_text"],
    nrows=MAX_ROWS,
)
# pandas loads empty CSV fields as NaN (a float), which re.sub() inside
# text_cleaning rejects with TypeError. Treat missing text as empty text.
df = df.fillna("")

timestamp_log("Text Cleaning...")
df = df.map(text_cleaning)
timestamp_log("Text Tokenization...")
df = df.map(text_tokenize)
timestamp_log("Text Counting...")
df = df.map(text_count)

timestamp_log("Formatting Output...")
# Flatten the frame of per-cell Counters into a 1-D list in row-major order.
# ravel() always yields a 1-D array, whereas reshape(...).squeeze() collapses
# a single-cell frame to a scalar and tolist() then returns a bare Counter.
output = df.to_numpy().ravel().tolist()

# Merge in place: repeated `a + b` copies the growing Counter on every step,
# and reduce() without an initial value raises TypeError on an empty list.
combined_counter = Counter()
for cell_counter in output:
    combined_counter.update(cell_counter)

timestamp_log("Calculating Frequency...")
total_counts = combined_counter.total()
# An empty counter gives an empty dict, so no division by zero can occur.
word_freq = {
    word: counts / total_counts
    for word, counts in combined_counter.items()
}

timestamp_log("Done!")

In [None]:
# Display the word -> relative frequency mapping computed in the previous cell
# (each value is that word's count divided by the total token count).
word_freq