In [1]:
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.formatters import TextFormatter, JSONFormatter
import json

def get_youtube_transcript(video_id, languages=['en'], fallback=True):
    """
    Retrieve YouTube transcript (auto-generated or manual)
    
    Args:
        video_id: YouTube video ID (11-character string)
        languages: Preferred language codes (default: ['en'])
        fallback: If True, will return auto-generated if manual not available
    
    Returns:
        dict: {'success': bool, 'transcript': str/list, 'type': str, 'language': str}
    """
    try:
        # Try to get manual transcript first
        transcript = YouTubeTranscriptApi.get_transcript(
            video_id, 
            languages=languages
        )
        transcript_type = "manual"
        
    except Exception as manual_error:
        if not fallback:
            return {
                'success': False,
                'error': str(manual_error),
                'type': 'none'
            }
            
        try:
            # Get list of available transcripts
            transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
            
            # Find auto-generated transcript in preferred language
            transcript = None
            for lang in languages:
                try:
                    transcript = transcript_list.find_generated_transcript([lang])
                    break
                except:
                    continue
            
            # If not found, get first available auto-generated transcript
            if not transcript:
                for t in transcript_list:
                    if t.is_generated:
                        transcript = t
                        break
                
            if not transcript:
                raise Exception("No auto-generated transcripts available")
            
            transcript = transcript.fetch()
            transcript_type = "auto-generated"
            
        except Exception as auto_error:
            return {
                'success': False,
                'error': str(auto_error),
                'type': 'none'
            }

    return {
        'success': True,
        'transcript': transcript,
        'type': transcript_type,
        'language': transcript[0].get('language', languages[0]) if transcript else ''
    }

def save_transcript(transcript_data, filename, format='text'):
    """Save transcript to file"""
    if not transcript_data['success']:
        raise ValueError("No transcript available to save")
    
    if format == 'text':
        formatter = TextFormatter()
        text = formatter.format_transcript(transcript_data['transcript'])
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(text)
            
    elif format == 'json':
        formatter = JSONFormatter()
        json_data = formatter.format_transcript(transcript_data['transcript'])
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(json_data)
            
    elif format == 'srt':
        from youtube_transcript_api.formatters import SRTFormatter
        formatter = SRTFormatter()
        srt_data = formatter.format_transcript(transcript_data['transcript'])
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(srt_data)
            
    return f"Transcript saved as {filename} ({format.upper()})"

# Example usage
if __name__ == "__main__":
    VIDEO_ID = "As5c9cI0t6Q&t"  # Replace with your video ID
    
    # Get transcript (prefer English, fallback to auto-generated)
    result = get_youtube_transcript(VIDEO_ID, languages=['en', 'vi'])
    
    if result['success']:
        print(f"Found {result['type']} transcript in {result['language']}")
        
        # Save in multiple formats
        print(save_transcript(result, f"{VIDEO_ID}_transcript.txt", format='text'))
        print(save_transcript(result, f"{VIDEO_ID}_transcript.json", format='json'))
        print(save_transcript(result, f"{VIDEO_ID}_transcript.srt", format='srt'))
    else:
        print(f"Error: {result['error']}")

Error: 
Could not retrieve a transcript for the video https://www.youtube.com/watch?v=As5c9cI0t6Q&t! This is most likely caused by:

The video is no longer available

If you are sure that the described cause is not responsible for this error and that a transcript should be retrievable, please create an issue at https://github.com/jdepoix/youtube-transcript-api/issues. Please add which version of youtube_transcript_api you are using and provide the information needed to replicate the error. Also make sure that there are no open issues which already describe your problem!


In [2]:
import os
import json
from datetime import datetime
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError

def get_youtube_service(api_key):
    """Create YouTube API service instance"""
    return build('youtube', 'v3', developerKey=api_key)

def search_videos(youtube, keyword, max_results=5):
    """Search YouTube videos and return top results"""
    request = youtube.search().list(
        part="id,snippet",
        q=keyword,
        type="video",
        maxResults=max_results,
        order="relevance"
    )
    response = request.execute()
    
    videos = []
    for item in response['items']:
        videos.append({
            'video_id': item['id']['videoId'],
            'title': item['snippet']['title'],
        })
    return videos

def get_video_comments(youtube, video_id, video_title, max_comments=40):
    """Fetch comments for a single video"""
    comments = []
    next_page_token = None
    comment_count = 0
    
    try:
        while comment_count < max_comments:
            request = youtube.commentThreads().list(
                part="snippet",
                videoId=video_id,
                maxResults=min(100, max_comments - comment_count),
                pageToken=next_page_token,
                textFormat="plainText"
            )
            response = request.execute()
            
            for item in response['items']:
                comment = item['snippet']['topLevelComment']['snippet']
                content = comment['textDisplay']
                if content.strip():
                    comments.append({
                        'video_title': video_title,
                        'content': content
                    })
                    comment_count += 1
                    if comment_count >= max_comments:
                        break
            
            next_page_token = response.get('nextPageToken')
            if not next_page_token:
                break
                
    except HttpError as e:
        print(f"Error getting comments for {video_id}: {e}")
    
    return comments

def crawl_youtube_comments(api_key, keyword, max_videos=5, max_comments=40):
    """Main function to crawl comments for search results"""
    youtube = get_youtube_service(api_key)
    
    print(f"Searching for videos with keyword: '{keyword}'")
    videos = search_videos(youtube, keyword, max_results=max_videos)
    print(f"Found {len(videos)} videos")
    
    all_comments = []
    for i, video in enumerate(videos, 1):
        print(f"\nProcessing video {i}/{len(videos)}: {video['title']}")
        comments = get_video_comments(youtube, video['video_id'], video['title'], max_comments=max_comments)
        print(f"Retrieved {len(comments)} comments")
        all_comments.extend(comments)
        
    return all_comments

def save_results(comments, keyword):
    """Save results to JSON file"""
    if not comments:
        print("No comments to save!")
        return None
        
    # Create filename-safe keyword
    safe_keyword = ''.join(c if c.isalnum() else '_' for c in keyword)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"youtube_{safe_keyword}_{timestamp}.json"
    
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(comments, f, ensure_ascii=False, indent=2)
    
    print(f"\nSaved {len(comments)} comments to {filename}")
    return filename

if __name__ == "__main__":
    # Configuration
    API_KEY = "AIzaSyAk9Ijx1Cg-eNsi3KPJHHjjT_H5zrbdEPo"  # Replace with your YouTube API key
    
    # Get user input
    keyword = input("Enter search keyword: ").strip()
    max_videos = int(input("How many top videos to fetch? (e.g. 5): ").strip() or "5")
    max_comments = int(input("How many comments per video? (e.g. 40): ").strip() or "40")
    
    # Execute crawl
    comments = crawl_youtube_comments(API_KEY, keyword, max_videos=max_videos, max_comments=max_comments)
    
    # Save results
    if comments:
        save_results(comments, keyword)
    else:
        print("No comments were retrieved.")

Searching for videos with keyword: 'iphone'
Found 3 videos

Processing video 1/3: iPhone 16 vs. iPhone 11
Retrieved 20 comments

Processing video 2/3: iPhone 17 Ultra - First Look!
Retrieved 20 comments

Processing video 3/3: iPhone 17 - 10 MAJOR Updates!
Retrieved 20 comments

Saved 60 comments to youtube_iphone_20250630_201517.json
