In [1]:
import os
import json
from youtube_transcript_api import YouTubeTranscriptApi

In [2]:
# --- Notebook settings -------------------------------------------------
# YouTube video identifier handed to the transcript API further down.
VIDEO_ID = "SWetuT171NQ"
# Destination of the JSON dump, relative to the notebook's working directory.
OUTPUT_PATH = "../data/raw/z_channel_sg_vs_msia.json"

def format_time(seconds):
    """Convert a non-negative duration in seconds to an ``hh:mm:ss:ms`` string.

    The value is rounded once to the nearest whole millisecond and then split
    with integer arithmetic. Truncating the fractional part separately (as in
    ``int((seconds % 1) * 1000)``) loses a millisecond whenever the float
    representation falls just below the intended value, e.g. ``1.001`` would
    render as ``00:00:01:000`` and ``0.57`` as ``00:00:00:569``.

    Args:
        seconds: Duration in seconds (int or float), expected to be >= 0.

    Returns:
        A string with zero-padded hours (at least 2 digits), minutes, seconds
        (2 digits each) and milliseconds (3 digits), separated by colons.
    """
    # Work in integer milliseconds so rounding carries propagate correctly
    # (e.g. 59.9996 s becomes 00:01:00:000 rather than 00:00:59:999).
    total_millis = int(round(seconds * 1000))
    total_secs, millis = divmod(total_millis, 1000)
    total_minutes, secs = divmod(total_secs, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{secs:02d}:{millis:03d}"

try:
    # 1. Fetch transcript. The language codes are passed as a preference
    #    list (Chinese variants first, English last).
    api = YouTubeTranscriptApi()
    fetched_objects = api.fetch(VIDEO_ID, languages=['zh-HK', 'zh-TW', 'zh', 'en'])

    # 2. Convert each snippet into a plain dict so it can be JSON-serialised.
    #    Raw float values are kept next to the formatted strings so later
    #    steps can still do arithmetic on them.
    serializable_data = [
        {
            'text': entry.text,
            'start_raw': entry.start,             # Keep raw for calculations
            'start_formatted': format_time(entry.start),
            'duration_raw': entry.duration,       # Keep raw for calculations
            'duration_formatted': format_time(entry.duration)
        }
        for entry in fetched_objects
    ]

    # 3. Save. os.path.dirname() returns '' for a bare filename and
    #    os.makedirs('') raises FileNotFoundError, so only create the
    #    directory when the path actually has one.
    output_dir = os.path.dirname(OUTPUT_PATH)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(OUTPUT_PATH, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps non-ASCII transcript text readable in the file.
        json.dump(serializable_data, f, ensure_ascii=False, indent=4)

    print(f"✅ Successfully saved {len(serializable_data)} lines with formatted timestamps!")

except Exception as e:
    # Deliberate best-effort: report the failure without aborting the notebook
    # run. Note that this also hides the traceback; re-run the body outside
    # the try block when debugging.
    print(f"❌ Extraction failed: {e}")

✅ Successfully saved 378 lines with formatted timestamps!
