In [None]:
import re
from collections import Counter
from datetime import datetime

import pandas as pd
import numpy as np
from pythainlp.tokenize import word_tokenize
import matplotlib.pyplot as plt
from tqdm import tqdm

def timestamp_log(msg):
    """Print ``msg`` prefixed with the current local time as ``[HH:MM:SS] | ``."""
    print(f"[{datetime.now().strftime('%H:%M:%S')}] | {msg}")


# Corpus CSV that the notebook reads, and the JSON file the word-frequency
# table is written to. Both are relative paths.
INPUT_PATH = "../../raw_data/prachathai/prachathai-67k.csv"
OUTPUT_PATH = "../../artifacts/word_freq/prachathai.json"

In [2]:
def text_cleaning(input_text: str) -> str:
    """Normalise raw text before tokenization.

    Three regex substitutions are applied in order:

    1. Every character outside U+0E01-U+0E3A, U+0E40-U+0E4E (Thai block, see
       https://th.wikipedia.org/wiki/Thai_(บล็อกยูนิโคด)), the Thai digits
       U+0E50-U+0E59, ASCII ``0-9``, ``.`` and ``,`` is replaced by a space.
       Latin letters are therefore removed as well.
    2. Each run that starts with a digit and continues with digits, periods
       or commas is replaced by the special token ``<NUM>``.
    3. Consecutive whitespace is collapsed into a single space.

    Args:
        input_text: The raw string to clean.

    Returns:
        The cleaned string. Leading/trailing spaces are not stripped.
    """
    substitutions = (
        # Anything that is not Thai script, a digit, '.' or ',' -> space
        (r'([^\u0E01-\u0E3A\u0E40-\u0E4E\u0E50-\u0E590-9.,])', r' '),
        # Arabic & Thai numbers (with embedded '.' / ',') -> special token
        (r'[\d\u0E50-\u0E59][\d\u0E50-\u0E59.,]*', '<NUM>'),
        # Runs of whitespace -> one space
        (r'\s+', ' '),
    )

    cleaned = input_text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

def text_tokenize(input_text: str) -> list[str]:
    """Split ``input_text`` into tokens with pythainlp's ``word_tokenize``.

    The ``newmm`` engine is used and ``keep_whitespace=False`` is passed, so
    whitespace tokens are not included in the result.
    """
    tokenizer_options = {"engine": 'newmm', "keep_whitespace": False}
    return word_tokenize(input_text, **tokenizer_options)

def text_count(input_list: list[str]) -> Counter:
    """Return a ``Counter`` mapping each token in ``input_list`` to its number of occurrences."""
    tally = Counter()
    tally.update(input_list)
    return tally

In [None]:
# Prachathai Corpus
# Pipeline: read CSV -> clean text -> flatten to a list of texts -> tokenize and
# count -> drop tokens without Thai/Latin characters -> relative frequencies.
timestamp_log("Reading Prachathai Corpus...")
df = pd.read_csv(
    INPUT_PATH,
    sep=",",
    usecols=["title", "body_text"],
    # nrows=1000
)
timestamp_log(f"Dataframe shape: {df.shape}")

timestamp_log("Text Cleaning...")
# read_csv represents empty cells as NaN (a float); text_cleaning passes its
# argument straight to re.sub, which raises TypeError on non-strings.
# Treat missing cells as empty text so one blank field cannot abort the run.
df = df.fillna("").map(text_cleaning)

timestamp_log("Reshaping...")
# Row-major flatten: [title_0, body_0, title_1, body_1, ...]
list_of_sentence = df.values.reshape(-1).tolist()
timestamp_log(f"Sentence List size: {len(list_of_sentence)}")

timestamp_log("Text Tokenization & Word Counting...")
# NOTE: list_of_words keeps one token list per text (title or body) in memory.
# It is only used for the size log in this cell; it is retained in case other
# cells rely on it.
list_of_words = list()
word_counter = Counter()
for sentence in tqdm(list_of_sentence):
    tokens = text_tokenize(sentence)
    list_of_words.append(tokens)
    word_counter.update(tokens)
# One entry per tokenized text, so this equals the sentence list size.
timestamp_log(f"Word List size: {len(list_of_words)}")

timestamp_log("Filtering Counter...")
# Keep only tokens with any Thai/Latin characters -- removes tokens like '...' and '-'
keep_pattern = re.compile(r'[\u0E01-\u0E3A\u0E40-\u0E4E\u0E50-\u0E59a-zA-Z]')
filtered_counter = Counter({
    word: counts
    for word, counts in tqdm(word_counter.items())
    if keep_pattern.search(word)
})

timestamp_log("Calculating Frequency...")
# Relative frequency = token count / total count of all kept tokens.
total_counts = filtered_counter.total()
word_frequency = {
    word: counts / total_counts
    for word, counts in tqdm(filtered_counter.items())
}

timestamp_log("Done!")

[10:51:30] | Reading Prachathai Corpus...
[10:51:38] | Dataframe shape: (67889, 2)
[10:51:38] | Text Cleaning...
[10:51:55] | Reshaping...
[10:51:55] | Sentence List size: 135778
[10:51:55] | Text Tokenization & Word Counting...


100%|██████████| 135778/135778 [12:36<00:00, 179.51it/s]


[11:04:31] | Word List size: 135778
[11:04:31] | Filtering Counter...


100%|██████████| 76852/76852 [00:00<00:00, 876552.13it/s]


[11:04:31] | Calculating Frequency...


100%|██████████| 76681/76681 [00:00<00:00, 1518151.47it/s]

[11:04:31] | Done!





In [None]:
import json
import os

# Make sure the destination directory exists; otherwise open() raises
# FileNotFoundError only after the long tokenization run has finished.
output_dir = os.path.dirname(OUTPUT_PATH)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# ensure_ascii=False writes Thai tokens as-is instead of \uXXXX escapes.
with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
    json.dump(word_frequency, f, ensure_ascii=False, indent=2)

timestamp_log(f"Saved to: {OUTPUT_PATH}")

[11:04:31] | Saved to: ../output/prachathai_word_frequency.json
