In [31]:
import json
import pandas as pd
import jieba
import pickle
import os
from collections import Counter
from pypinyin import pinyin, Style
from deep_translator import GoogleTranslator
import re


In [32]:
# 1. Load Data
with open('../data/raw/z_channel_genting_trip.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
full_text = " ".join([d['text'] for d in data])

In [33]:
# 2. Load Custom Malaysian Dictionary
# Ensure you have 'assets/msia_dict.txt' with terms like RTS and CIQ
if os.path.exists('../assets/msia_dict.txt'):
    jieba.load_userdict('../assets/msia_dict.txt')

In [34]:
# 3. Tokenize & Filter
# Remove stop words and single-character words (grammar particles)
stop_words = {'這個', '這邊', '就是', '一個', '可以', '他們', '因為', '現在', '如果', '一些', '什麼', '所以'}
tokens = [t for t in jieba.lcut(full_text) if len(t) > 1 and t not in stop_words]

In [35]:
# 4. Save for PCA (Notebook 03)
os.makedirs('../data/processed', exist_ok=True)
with open('../data/processed/clean_tokens.pkl', 'wb') as f:
    pickle.dump(tokens, f)

In [36]:
# 5. EDA: Top 10 Phrases
print("Top 10 Phrases:")
print(pd.DataFrame(Counter(tokens).most_common(10), columns=['Word', 'Count']))

Top 10 Phrases:
  Word  Count
0   我們     20
1   雲頂     11
2   快點     10
3   今天     10
4   OK     10
5   番薯      7
6   公公      6
7  Let      6
8   準備      5
9   進去      5


## For studying

In [38]:
def get_pinyin(text):
    # Style.TONE provides marks like ā, á, ǎ, à
    pinyin_list = pinyin(text, style=Style.TONE)
    return " ".join([item[0] for item in pinyin_list])

def count_mandarin(text):
    # This regex looks for the Unicode range of all Chinese characters
    mandarin_only = re.findall(r'[\u4e00-\u9fff]', text)
    return len(mandarin_only)

In [41]:
df = pd.DataFrame(data)

In [42]:
df['mandarin_length'] = df['text'].apply(count_mandarin)
df = df.sort_values(by='mandarin_length', ascending=False)
df = df[df['mandarin_length'] > 0]

In [43]:
# 4. Select the top 30 longest sentences to study
# These are likely to be full explanations or descriptions
study_df = df.head(30).copy()

print(f"⏳ Processing the 30 longest sentences for your study guide...")

⏳ Processing the 30 longest sentences for your study guide...


In [44]:
# Generate Pinyin
study_df['pinyin'] = study_df['text'].apply(get_pinyin)

# Generate English Translation
translator = GoogleTranslator(source='zh-CN', target='en')
study_df['english'] = study_df['text'].apply(lambda x: translator.translate(x))

In [45]:
study_df

Unnamed: 0,text,start_raw,start_formatted,duration_raw,duration_formatted,mandarin_length,pinyin,english
338,我們今天就以這樣的方式來結尾啦,937.966,00:15:37:966,2.334,00:00:02:334,15,wǒ men jīn tiān jiù yǐ zhè yàng de fāng shì lá...,This is how we end today
296,等一下我們玩的就是這個設施啦,606.166,00:10:06:166,2.0,00:00:02:000,14,děng yí xià wǒ men wán de jiù shì zhè gè shè s...,"Wait a minute, this is the facility we are goi..."
315,還有她的兩個兒子跟我的小番薯,682.166,00:11:22:166,2.867,00:00:02:867,14,hái yǒu tā de liǎng gè ér zi gēn wǒ de xiǎo fā...,And her two sons and my little sweet potato
45,歡迎來到Z頻道的大馬番薯日記,81.366,00:01:21:365,1.234,00:00:01:234,13,huān yíng lái dào Z pín dào de dà mǎ fān shǔ r...,Welcome to Channel Z’s Malaysian Sweet Potato ...
209,誒，那個，他沒有放垃圾袋的做麼？,389.4,00:06:29:399,1.7,00:00:01:700,13,éi ， nà gè ， tā méi yǒu fàng lā jī dài de zuò ...,"Hey, um, didn't he put a garbage bag in it?"
300,等下這個表哥表妹就要上去了,632.033,00:10:32:033,2.067,00:00:02:067,13,děng xià zhè gè biǎo gē biǎo mèi jiù yào shàng...,"Wait a minute, this cousin is going to go up."
317,噢它是類似3D的那種電影的感覺,704.066,00:11:44:066,3.634,00:00:03:633,13,ō tā shì lèi shì 3D de nà zhǒng diàn yǐng de g...,"Oh, it feels like a 3D movie"
347,小番薯，你講我們下一部電影見,962.5,00:16:02:500,2.233,00:00:02:233,13,xiǎo fān shǔ ， nǐ jiǎng wǒ men xià yī bù diàn ...,"Xiaofanshu, tell me we’ll see you in the next ..."
343,所以你看我們的頭有點濕濕的,948.466,00:15:48:466,1.9,00:00:01:899,13,suǒ yǐ nǐ kàn wǒ men de tóu yǒu diǎn shī shī de,So you see our heads are a little wet
255,想到要跟這些小瓜一整天哦,476.933,00:07:56:932,2.533,00:00:02:532,12,xiǎng dào yào gēn zhè xiē xiǎo guā yī zhěng ti...,I thought about spending the whole day with th...


In [48]:
OUTPUT_FILE = '../data/processed/genting_long_sentences_study.csv'
# Adding 'length' to the output so you can see how "big" the sentence is
final_columns = ['start_formatted', 'mandarin_length', 'text', 'pinyin', 'english']
study_df[final_columns].to_csv(OUTPUT_FILE, index=False, encoding='utf-8-sig')