In [None]:
import json
import re
from collections import Counter
from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pythainlp.tokenize import word_tokenize
from tqdm import tqdm

def timestamp_log(msg):
    """Print ``msg`` prefixed with the current local time as ``[HH:MM:SS] | ``."""
    now = datetime.now().strftime('%H:%M:%S')
    print(f"[{now}] | {msg}")


# JSON Lines files that are read and concatenated into one corpus.
INPUT_PATHS = [
    "../../raw_data/wisesight/train.jsonl",
    "../../raw_data/wisesight/test.jsonl",
    "../../raw_data/wisesight/valid.jsonl",
]
# Destination of the word -> relative frequency mapping (JSON).
OUTPUT_PATH = "../../artifacts/word_freq/wisesight.json"

In [None]:
def text_cleaning(input_text: str) -> str:
    """Normalise raw text before word tokenization.

    Three regex substitutions are applied in order:

    1. Every character that is not in the kept Thai ranges
       (U+0E01-U+0E3A, U+0E40-U+0E4E, U+0E50-U+0E59), an ASCII digit,
       a period or a comma is replaced with a space. Latin letters are
       therefore removed here.
    2. Each run that starts with a digit (ASCII or Thai) and continues
       through digits, periods and commas is replaced with the literal
       token ``<NUM>``.
    3. Consecutive whitespace is collapsed into a single space.

    Leading and trailing whitespace is not stripped.

    Args:
        input_text: Raw text to clean.

    Returns:
        The cleaned text.
    """
    # (pattern, replacement) pairs; the order matters because step 2 relies
    # on step 1 having removed everything except Thai text, digits, '.' and ','.
    substitutions = (
        (r'([^\u0E01-\u0E3A\u0E40-\u0E4E\u0E50-\u0E590-9.,])', r' '),
        (r'[\d\u0E50-\u0E59][\d\u0E50-\u0E59.,]*', '<NUM>'),
        (r'\s+', ' '),
    )

    cleaned = input_text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

def text_tokenize(input_text: str) -> list[str]:
    """Tokenize text into words with PyThaiNLP's ``newmm`` engine.

    ``keep_whitespace=False`` is passed to the tokenizer.

    Args:
        input_text: Text to split into tokens.

    Returns:
        The tokens returned by ``word_tokenize``.
    """
    tokens = word_tokenize(
        input_text,
        engine='newmm',
        keep_whitespace=False,
    )
    return tokens

def text_count(input_list: list[str]) -> Counter:
    """Count how many times each token occurs in ``input_list``.

    Args:
        input_list: Tokens to tally.

    Returns:
        A ``Counter`` mapping each distinct token to its occurrence count.
    """
    tally = Counter()
    tally.update(input_list)
    return tally

In [None]:
# Wisesight Corpus
# Pipeline: read every split -> clean text -> tokenize and count -> drop
# punctuation-only tokens -> convert counts to relative frequencies.
timestamp_log("Reading Wisesight Corpus...")
# Wisesight raw data contains columns: 'review', 'category'
# We only want the review text for word frequency calculation
# drop() without inplace returns a new frame instead of mutating the parsed one.
temp_dfs = [
    pd.read_json(input_file, lines=True).drop(columns=["category"])
    for input_file in INPUT_PATHS
]

# ignore_index=True renumbers rows 0..n-1 across the splits instead of
# repeating each file's own index labels.
df = pd.concat(temp_dfs, ignore_index=True)
del temp_dfs
timestamp_log(f"Dataframe shape: {df.shape}")

timestamp_log("Text Cleaning...")
# Element-wise: every remaining cell is passed through text_cleaning.
df = df.map(text_cleaning)

timestamp_log("Reshaping...")
# ravel() always returns a 1-D array, so tolist() always yields a list.
# squeeze() would turn a single-row (1, 1) frame into a 0-d array whose
# tolist() is a bare string, which would then be iterated character by
# character in the tokenization loop below.
list_of_sentence = df.to_numpy().ravel().tolist()
timestamp_log(f"Sentence List size: {len(list_of_sentence)}")

timestamp_log("Text Tokenization & Word Counting...")
list_of_words = list()
word_counter = Counter()
for sentence in tqdm(list_of_sentence):
    tokens = text_tokenize(sentence)
    list_of_words.append(tokens)
    word_counter.update(tokens)
# NOTE: list_of_words holds one token list per sentence, so this length is
# the number of tokenized sentences, not the number of words.
timestamp_log(f"Word List size: {len(list_of_words)}")

timestamp_log("Filtering Counter...")
# Keep only tokens with any Thai/Latin characters -- removes tokens like '...' and '-'
# Compiled once up front rather than re-resolved for every token.
thai_or_latin = re.compile(r'[\u0E01-\u0E3A\u0E40-\u0E4E\u0E50-\u0E59a-zA-Z]')
filtered_counter = Counter()
for word, counts in tqdm(word_counter.items()):
    if thai_or_latin.search(word):
        filtered_counter[word] = counts

timestamp_log("Calculating Frequency...")
# Relative frequency = token count / total count of all kept tokens.
# With an empty counter the comprehension body never runs, so there is no
# division by zero.
total_counts = filtered_counter.total()
word_frequency = {
    word: counts / total_counts
    for word, counts in tqdm(filtered_counter.items())
}

timestamp_log("Done!")

In [None]:
# Persist the word -> relative frequency mapping as UTF-8 JSON.
output_file = Path(OUTPUT_PATH)
# Create the destination directory if it is missing; opening a file for
# writing inside a non-existent directory raises FileNotFoundError.
output_file.parent.mkdir(parents=True, exist_ok=True)

with output_file.open("w", encoding="utf-8") as f:
    # ensure_ascii=False writes non-ASCII tokens as-is instead of \uXXXX escapes.
    json.dump(word_frequency, f, ensure_ascii=False, indent=2)

timestamp_log(f"Saved to: {OUTPUT_PATH}")