In [26]:
import json
import pandas as pd
import jieba
import pickle
import os
from collections import Counter
from pypinyin import pinyin, Style
from deep_translator import GoogleTranslator

In [14]:
# 1. Load Data
# Parse the raw transcript JSON, then concatenate the 'text' field of every
# record into one space-separated string for tokenization.
with open('../data/raw/z_channel_genting_trip.json', mode='r', encoding='utf-8') as transcript_file:
    data = json.load(transcript_file)
full_text = " ".join(record['text'] for record in data)

In [15]:
# 2. Load Custom Malaysian Dictionary
# The dictionary file ('assets/msia_dict.txt') should hold local terms such as
# RTS and CIQ. If the file is missing, this step is skipped silently and
# tokenization proceeds without the custom terms.
user_dict_path = '../assets/msia_dict.txt'
if os.path.exists(user_dict_path):
    jieba.load_userdict(user_dict_path)

In [16]:
# 3. Tokenize & Filter
# Segment the full transcript with jieba, then drop:
#   - tokens of length 1 or less (grammar particles, single spaces/punctuation)
#   - tokens listed in the hand-picked stop-word set below
stop_words = {
    '這個', '這邊', '就是', '一個', '可以', '他們',
    '因為', '現在', '如果', '一些', '什麼', '所以',
}
tokens = []
for token in jieba.lcut(full_text):
    if len(token) <= 1 or token in stop_words:
        continue
    tokens.append(token)

In [17]:
# 4. Save for PCA (Notebook 03)
# Persist the filtered token list as a pickle so the next notebook can load it.
# The output directory is created on first run.
tokens_path = '../data/processed/clean_tokens.pkl'
os.makedirs(os.path.dirname(tokens_path), exist_ok=True)
with open(tokens_path, 'wb') as tokens_file:
    pickle.dump(tokens, tokens_file)

In [18]:
# 5. EDA: Top 10 Phrases
# Count token frequencies and show the ten most common as a small table.
top_counts = Counter(tokens).most_common(10)
top_phrases = pd.DataFrame(top_counts, columns=['Word', 'Count'])
print("Top 10 Phrases:")
print(top_phrases)

Top 10 Phrases:
  Word  Count
0   我們     20
1   雲頂     11
2   快點     10
3   今天     10
4   OK     10
5   番薯      7
6   公公      6
7  Let      6
8   準備      5
9   進去      5


## Study guide: longest sentences with pinyin and English translation

In [19]:
# Put the loaded transcript records (`data`) into a DataFrame so sentences can
# be measured, sorted and annotated column-wise.
df = pd.DataFrame(data)

In [22]:
# Add each sentence's character count as 'length', then order the frame from
# longest to shortest sentence.
df = (
    df
    .assign(length=df['text'].str.len())
    .sort_values(by='length', ascending=False)
)

In [27]:
def get_pinyin(text):
    """Return the pinyin of ``text`` as a single space-separated string.

    Uses ``Style.TONE`` so syllables carry tone marks (ā, á, ǎ, à). For each
    item returned by ``pinyin`` only the first candidate reading is kept.
    """
    syllables = []
    for readings in pinyin(text, style=Style.TONE):
        syllables.append(readings[0])
    return " ".join(syllables)

In [24]:
# Select the longest sentences to study.
# These are likely to be full explanations or descriptions.
# Relies on `df` already being sorted by 'length' in descending order, so the
# first rows are the longest sentences.
N_STUDY_SENTENCES = 30  # how many sentences go into the study guide

# .copy() so the columns added later do not write into a slice of `df`.
study_df = df.head(N_STUDY_SENTENCES).copy()

# The count is interpolated so the message always matches the selection size.
print(f"⏳ Processing the {N_STUDY_SENTENCES} longest sentences for your study guide...")

⏳ Processing the 30 longest sentences for your study guide...


In [28]:
# Annotate every study sentence with its pinyin and an English translation.
# NOTE(review): the transcript text appears to be Traditional Chinese; 'zh-TW'
# may be a more accurate source code than 'zh-CN' — confirm before changing.
translator = GoogleTranslator(source='zh-CN', target='en')
sentences = study_df['text']

# Pinyin comes from the local helper.
study_df['pinyin'] = sentences.apply(get_pinyin)

# English translation: one translator call per sentence.
study_df['english'] = sentences.apply(translator.translate)

In [29]:
# Display the annotated study table (text, pinyin, english) for a visual check
# before it is exported.
study_df

Unnamed: 0,text,start_raw,start_formatted,duration_raw,duration_formatted,length,pinyin,english
194,嗎？這個是叫family suites嗎？,367.5,00:06:07:500,1.6,00:00:01:600,21,ma ？ zhè gè shì jiào family suites ma ？,? Is this called family suites?
120,那其實我們這次呢是family trip,239.933,00:03:59:932,2.867,00:00:02:867,20,nà qí shí wǒ men zhè cì ne shì family trip,So actually our trip this time is a family trip
292,不用online reservation,594.933,00:09:54:932,1.833,00:00:01:833,20,bù yòng online reservation,No online reservation required
193,這個是family suites好像,365.7,00:06:05:699,1.8,00:00:01:800,18,zhè gè shì family suites hǎo xiàng,This seems to be family suites
111,然後現在我妹妹還在做checking,221.166,00:03:41:165,2.467,00:00:02:467,18,rán hòu xiàn zài wǒ mèi mèi hái zài zuò checking,And now my sister is still doing checking
227,這個Skyworld Hotel的,419.266,00:06:59:266,2.267,00:00:02:266,17,zhè gè Skyworld Hotel de,This Skyworld Hotel
98,誒你不要看浪費我的Memory咧,204.133,00:03:24:133,2.533,00:00:02:532,16,éi nǐ bú yào kàn làng fèi wǒ de Memory liě,"Hey, don’t waste my memory by looking at it."
209,誒，那個，他沒有放垃圾袋的做麼？,389.4,00:06:29:399,1.7,00:00:01:700,16,éi ， nà gè ， tā méi yǒu fàng lā jī dài de zuò ...,"Hey, um, didn't he put a garbage bag in it?"
338,我們今天就以這樣的方式來結尾啦,937.966,00:15:37:966,2.334,00:00:02:334,15,wǒ men jīn tiān jiù yǐ zhè yàng de fāng shì lá...,This is how we end today
317,噢它是類似3D的那種電影的感覺,704.066,00:11:44:066,3.634,00:00:03:633,15,ō tā shì lèi shì 3D de nà zhǒng diàn yǐng de g...,"Oh, it feels like a 3D movie"


In [30]:
# Export the study guide as CSV.
# 'utf-8-sig' writes a BOM at the start of the file.
OUTPUT_FILE = '../data/processed/genting_long_sentences_study.csv'

# 'length' is included so you can see how "big" each sentence is.
final_columns = ['start_formatted', 'length', 'text', 'pinyin', 'english']

study_df.to_csv(
    OUTPUT_FILE,
    columns=final_columns,
    index=False,
    encoding='utf-8-sig',
)