In [None]:
import json
import os
import re
import sys
from collections import Counter
from datetime import datetime

import pandas as pd
from tqdm import tqdm

# Extend the module search path two directories up so the project-local
# `utils` package imported below can be found (presumably the repository root).
sys.path.append('../..')
from utils.word_freq import *

def timestamp_log(msg):
    """Print ``msg`` prefixed with the current local time as ``[HH:MM:SS] | ``.

    Args:
        msg: Any object; it is interpolated into the output line via an f-string.

    Returns:
        None. The formatted line is written to standard output by ``print``.
    """
    print(f"[{datetime.now().strftime('%H:%M:%S')}] | {msg}")

# Location of the Prachathai corpus CSV that this notebook loads with pandas.
# The path is relative, so it is resolved against the kernel's working directory.
INPUT_PATH = "../../raw_data/prachathai/prachathai-67k.csv"
# Destination JSON file to which the computed word-frequency mapping is written.
OUTPUT_PATH = "../../artifacts/word_freq/prachathai.json"

In [None]:
# Prachathai Corpus: load only the two text columns needed for word counting.
timestamp_log("Reading Prachathai Corpus...")
df = pd.read_csv(INPUT_PATH, sep=",", usecols=["title", "body_text"])
timestamp_log(f"Dataframe shape: {df.shape}")

# Run the project's `text_cleaning` helper on every individual cell of the frame.
# NOTE(review): read_csv yields NaN for empty fields, so `text_cleaning` is assumed
# to tolerate non-string input -- confirm against utils.word_freq.
timestamp_log("Text Cleaning...")
df = df.map(text_cleaning)

# Flatten the 2-D cell grid row by row into one flat Python list of texts
# (two entries per CSV row, one for each selected column).
timestamp_log("Reshaping...")
list_of_sentence = df.values.reshape(-1).tolist()
timestamp_log(f"Sentence List size: {len(list_of_sentence)}")

# Tokenise every text exactly once, keeping both the per-text token sequence
# and a running corpus-wide tally of token occurrences.
timestamp_log("Text Tokenization & Word Counting...")
word_counter = Counter()
list_of_words = []
for sentence in tqdm(list_of_sentence):  # tqdm reads the total from len() of the list
    tokens = text_tokenize(sentence)
    list_of_words.append(tokens)
    word_counter.update(tokens)
# `list_of_words` holds one entry per input text, so the size logged here is
# the number of tokenised texts, not the number of individual words.
timestamp_log(f"Word List size: {len(list_of_words)}")

timestamp_log("Filtering Counter...")
# Compiled once, outside the loop. A token is kept when it contains at least one
# Thai letter/vowel/tone mark (U+0E01-U+0E3A, U+0E40-U+0E4E), Thai digit
# (U+0E50-U+0E59) or ASCII letter. Tokens made up solely of other characters --
# punctuation such as '...' and '-', whitespace, or ASCII digits -- are dropped.
word_pattern = re.compile(r'[\u0E01-\u0E3A\u0E40-\u0E4E\u0E50-\u0E59a-zA-Z]')
filtered_counter = Counter()
for word, counts in tqdm(word_counter.items()):
    if word_pattern.search(word):
        filtered_counter[word] = counts

timestamp_log("Calculating Frequency...")
# Relative frequency = a token's count divided by the sum of all kept counts.
# Counter.total() needs Python 3.10+. With an empty counter the comprehension
# body never runs, so no division by zero occurs.
total_counts = filtered_counter.total()
word_frequency = {
    word: counts / total_counts
    for word, counts in tqdm(filtered_counter.items())
}

timestamp_log("Done!")

In [None]:
# Make sure the destination directory exists; open(..., "w") does not create
# missing parent directories and would otherwise raise FileNotFoundError.
output_dir = os.path.dirname(OUTPUT_PATH)
if output_dir:  # empty when OUTPUT_PATH is a bare file name in the working directory
    os.makedirs(output_dir, exist_ok=True)

# ensure_ascii=False writes non-ASCII tokens (e.g. Thai) as real characters
# instead of \uXXXX escapes; the file itself is encoded as UTF-8.
with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
    json.dump(word_frequency, f, ensure_ascii=False, indent=2)

timestamp_log(f"Saved to: {OUTPUT_PATH}")