In [7]:
import json
import pandas as pd
import jieba
import pickle
import os
from collections import Counter

In [8]:
# 1. Load Data
# Read the raw transcript JSON and flatten it into one space-separated string
# for the tokenizer.
# NOTE(review): assumes the file holds an iterable of records that each carry
# a 'text' key -- confirm against the raw data schema.
with open('../data/raw/z_channel_genting_trip.json', 'r', encoding='utf-8') as raw_file:
    data = json.load(raw_file)

full_text = " ".join(
    record['text']
    for record in data
)

In [9]:
# 2. Load Custom Malaysian Dictionary
# 'assets/msia_dict.txt' is expected to list domain terms such as RTS and CIQ
# so that jieba keeps them as single tokens.
msia_dict_path = '../assets/msia_dict.txt'

# isfile (rather than exists) so a directory with the same name is not handed
# to jieba as if it were a dictionary file.
if os.path.isfile(msia_dict_path):
    jieba.load_userdict(msia_dict_path)
else:
    # Stay best-effort (jieba's built-in dictionary still works), but make the
    # fallback visible: without the custom terms the tokens saved for later
    # notebooks will differ, and that should not happen silently.
    print(
        f"WARNING: custom dictionary not found at {msia_dict_path}; "
        "falling back to jieba's default dictionary."
    )

In [10]:
# 3. Tokenize & Filter
# Keep a token only when it
#   * is longer than one character (single characters are mostly grammar
#     particles),
#   * is not one of the stop words below, and
#   * contains at least one letter, digit or CJK character.
# The last rule is needed because the length check alone lets through
# multi-character tokens with no word content -- e.g. jieba returns a Windows
# line break '\r\n' as one two-character token, and punctuation runs may
# survive as well. Such tokens would only add noise to the phrase counts and
# to the token list saved for the PCA notebook.
stop_words = {'這個', '這邊', '就是', '一個', '可以', '他們', '因為', '現在', '如果', '一些', '什麼', '所以'}
tokens = [
    t
    for t in jieba.lcut(full_text)
    if len(t) > 1
    and t not in stop_words
    and any(ch.isalnum() for ch in t)  # str.isalnum() is True for CJK characters too
]

In [11]:
# 4. Save for PCA (Notebook 03)
# Persist the filtered token list so the next notebook can load it without
# re-running the tokenizer.
# NOTE: pickle files should only ever be loaded from trusted locations.
processed_dir = '../data/processed'
tokens_path = '../data/processed/clean_tokens.pkl'

os.makedirs(processed_dir, exist_ok=True)  # no error if the folder already exists
with open(tokens_path, 'wb') as token_file:
    pickle.dump(tokens, token_file)

In [12]:
# 5. EDA: Top 10 Phrases
# Count every remaining token and show the ten most frequent ones as a table.
print("Top 10 Phrases:")
word_counts = Counter(tokens)
top_phrases = pd.DataFrame(
    word_counts.most_common(10),
    columns=['Word', 'Count'],
)
print(top_phrases)

Top 10 Phrases:
  Word  Count
0   我們     20
1   雲頂     11
2   快點     10
3   今天     10
4   OK     10
5   番薯      7
6   公公      6
7  Let      6
8   準備      5
9   進去      5
