In [None]:
import sys
sys.path.append('../..')
from collections import Counter
from datetime import datetime

import pandas as pd
from tqdm import tqdm

from utils.word_freq import *

def timestamp_log(msg):
    """Print ``msg`` prefixed with the current wall-clock time.

    The emitted line has the form ``[HH:MM:SS] | <msg>``.

    Args:
        msg: Object to log; it is interpolated into an f-string, so any
            value with a string representation works.

    Returns:
        None.
    """
    current_time = datetime.now().strftime("%H:%M:%S")
    print(f"[{current_time}] | {msg}")


# CSV files that together make up the corpus; every entry is loaded and concatenated.
INPUT_PATHS = [
    "../../raw_data/wongnai/train.csv",
    "../../raw_data/wongnai/test.csv"
    ]
# Destination of the JSON word-frequency table written by the final cell.
OUTPUT_PATH = "../../artifacts/word_freq/wongnai.json"

In [None]:
# Wongnai Corpus
# Pipeline: read the "review" column of every file in INPUT_PATHS, clean each
# review, tokenize it, count tokens, then filter and normalize the counts.
# NOTE(review): text_cleaning, text_tokenize, filter_token_counter and
# normalize_frequency are presumably supplied by the wildcard import from
# utils.word_freq — their exact semantics are not visible here.
timestamp_log("Reading Wongnai Corpus...")

# Every input file shares the same layout (';'-separated, '"'-quoted), so one
# read_csv call is applied to each path instead of repeating it per file.
df = pd.concat(
    [
        pd.read_csv(
            path,
            sep=";",
            usecols=["review"],
            quotechar='"',
            )
        for path in INPUT_PATHS
    ]
)
timestamp_log(f"Dataframe shape: {df.shape}")

timestamp_log("Text Cleaning...")
df = df.map(text_cleaning)

timestamp_log("Reshaping...")
# Select the column explicitly rather than `df.values.squeeze()`: squeezing a
# one-row frame yields a 0-d array whose tolist() is a bare scalar, which would
# make the loop below iterate over characters instead of sentences.
list_of_sentence = df["review"].tolist()
timestamp_log(f"Sentence List size: {len(list_of_sentence)}")

timestamp_log("Text Tokenization & Word Counting...")
# list_of_words holds one token collection per sentence (so its length equals
# the number of sentences); word_counter accumulates counts over all tokens.
list_of_words = list()
word_counter = Counter()
for sentence in tqdm(list_of_sentence, total=len(list_of_sentence)):
    tokens = text_tokenize(sentence)
    list_of_words.append(tokens)
    word_counter.update(tokens)
timestamp_log(f"Word List size: {len(list_of_words)}")

timestamp_log("Filtering Counter...")
filtered_counter = filter_token_counter(word_counter)

timestamp_log("Calculating Frequency...")
word_frequency = normalize_frequency(filtered_counter)

timestamp_log("Done!")

In [None]:
import json
from pathlib import Path

# Persist the word-frequency table as JSON.
# Create the destination directory first: open(..., "w") does not create
# missing parent folders, and failing here would discard the computed result.
Path(OUTPUT_PATH).parent.mkdir(parents=True, exist_ok=True)

with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
    # ensure_ascii=False writes non-ASCII characters verbatim instead of as
    # \uXXXX escapes, keeping the file human-readable.
    json.dump(word_frequency, f, ensure_ascii=False, indent=2)

timestamp_log(f"Saved to: {OUTPUT_PATH}")