In [1]:
import json
import pandas as pd
import jieba
import pickle
import os
from collections import Counter

In [2]:
# 1. Load Data
# Parse the raw JSON export, then merge the 'text' field of every record
# into one space-separated string for tokenization.
with open('../data/raw/z_channel_apartment.json', 'r', encoding='utf-8') as raw_file:
    data = json.load(raw_file)
full_text = " ".join(record['text'] for record in data)

In [3]:
# 2. Load Custom Malaysian Dictionary
# The user dictionary is expected at '../assets/msia_dict.txt' and should
# contain domain terms such as RTS and CIQ.
msia_dict_path = '../assets/msia_dict.txt'
if os.path.isfile(msia_dict_path):
    jieba.load_userdict(msia_dict_path)
else:
    # Do not skip silently: the tokenization below still runs without the
    # custom dictionary, but its results may differ, so make that visible
    # in the notebook output.
    print(f"WARNING: custom dictionary not found at {msia_dict_path}; "
          "continuing with jieba's default dictionary only.")

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\MUZAFA~1.AZM\AppData\Local\Temp\jieba.cache
Loading model cost 0.509 seconds.
Prefix dict has been built successfully.


In [4]:
# 3. Tokenize & Filter
# Keep only tokens that are not stop words and that have more than one
# character once surrounding whitespace is ignored (single characters are
# mostly grammar particles).
stop_words = {'這個', '這邊', '就是', '一個', '可以', '他們', '因為', '現在', '如果', '一些', '什麼', '所以'}
tokens = [
    t for t in jieba.lcut(full_text)
    # Measure the stripped length: a whitespace-only token such as "\r\n"
    # is two characters long and would otherwise slip past a plain
    # len(t) > 1 check.
    if len(t.strip()) > 1 and t not in stop_words
]

In [5]:
# 4. Save for PCA (Notebook 03)
# Make sure the output folder exists, then pickle the filtered token list
# so the next notebook can load it.
os.makedirs('../data/processed', exist_ok=True)
with open('../data/processed/clean_tokens.pkl', 'wb') as token_file:
    pickle.dump(tokens, token_file)

In [6]:
# 5. EDA: Top 10 Phrases
# Count how often each token occurs and tabulate the ten most frequent.
top_counts = Counter(tokens).most_common(10)
top_table = pd.DataFrame(top_counts, columns=['Word', 'Count'])
print("Top 10 Phrases:")
print(top_table)

Top 10 Phrases:
      Word  Count
0      RTS     16
1      CIQ     16
2  project     13
3       連橋      9
4       展商      9
5    Block      9
6       地段      8
7       靠近      8
8       真的      7
9       一直      7
