In [2]:
from youtube_transcript_api import YouTubeTranscriptApi
import re
from pathlib import Path

def extract_video_id(url):
    """Extract the 11-character video ID from a YouTube URL or a bare ID.

    Parameters:
    -----------
    url : str
        A YouTube URL (watch, youtu.be, embed and other path-based forms)
        or a bare 11-character video ID. Surrounding whitespace is ignored.

    Returns:
    --------
    str or None
        The video ID, or None when no ID can be found (including when
        `url` is not a string).
    """
    if not isinstance(url, str):
        return None

    id_char = r'[0-9A-Za-z_-]'
    candidate = url.strip()

    # A bare video ID needs no further parsing
    if re.fullmatch(id_char + r'{11}', candidate):
        return candidate

    # Drop "scheme://host" so a long host label (e.g. the "youtube-noc" in
    # youtube-nocookie.com) can never be mistaken for a video ID
    remainder = re.sub(r'^(?:[A-Za-z][A-Za-z0-9+.-]*:)?//[^/?#]*', '', candidate)

    # The ID follows a "v=" query parameter or a path separator and must be
    # exactly 11 characters long: the negative lookahead rejects longer tokens
    # (channel IDs, user names, ...) instead of truncating them to 11 chars
    match = re.search(
        r'(?:\bv=|/)(' + id_char + r'{11})(?!' + id_char + r')',
        remainder,
    )
    return match.group(1) if match else None

def get_youtube_transcript(url, output_file=None, languages=('en',)):
    """
    Extract transcript from YouTube video and optionally save to file

    Parameters:
    -----------
    url : str
        YouTube video URL or video ID
    output_file : str, optional
        Path to save transcript. If None, transcript is only returned.
        Missing parent directories are created.
    languages : str or iterable of str, optional
        Language code(s) handed to ``YouTubeTranscriptApi.fetch``
        (per the library docs, in descending priority). Defaults to
        English only.

    Returns:
    --------
    str
        The full transcript text (all segments joined by single spaces)

    Raises:
    -------
    ValueError
        If no video ID can be extracted from `url`.
    Exception
        Any error raised while fetching the transcript is printed and
        then re-raised unchanged.
    """
    # Extract video ID from URL
    video_id = extract_video_id(url)
    if not video_id:
        raise ValueError(f"Could not extract video ID from URL: {url}")

    # A lone string would otherwise be iterated character by character
    if isinstance(languages, str):
        languages = [languages]

    print(f"Extracting transcript for video ID: {video_id}")

    # Get transcript using the new API
    try:
        api = YouTubeTranscriptApi()
        transcript = api.fetch(video_id, languages=list(languages))
    except Exception as e:
        print(f"Error getting transcript: {e}")
        raise

    # Combine all text segments (using .text attribute)
    full_transcript = " ".join(entry.text for entry in transcript)

    # Save to file if output_file is specified
    if output_file:
        output_path = Path(output_file)
        output_path.parent.mkdir(parents=True, exist_ok=True)

        # Short header identifying the video, a separator rule, then the text
        with output_path.open('w', encoding='utf-8') as f:
            f.write(f"YouTube Video ID: {video_id}\n")
            f.write(f"URL: https://www.youtube.com/watch?v={video_id}\n")
            f.write("="*80 + "\n\n")
            f.write(full_transcript)

        print(f"Transcript saved to: {output_path}")

    return full_transcript

In [3]:
# Smoke test: fetch one known video, save its transcript, and preview it
video_url = "https://www.youtube.com/watch?v=MSqk6mbkQDU"
output_file = "../data/youtube_transcript_MSqk6mbkQDU.txt"

transcript = get_youtube_transcript(video_url, output_file=output_file)

# Report the overall size, then show the opening of the text as a preview
print(
    f"\nTranscript length: {len(transcript)} characters",
    f"\nFirst 500 characters:\n{transcript[:500]}...",
    sep="\n",
)

Extracting transcript for video ID: MSqk6mbkQDU
Transcript saved to: ../data/youtube_transcript_MSqk6mbkQDU.txt

Transcript length: 19559 characters

First 500 characters:
Everyone is hating on the Schwab US Dividend Equity ETF right now because if you look at its performance over the past year, over the past three years, it has really lagged behind not only the broader S&P 500 and NASDAQ, but actually the broader dividend growth investment universe. There are quite a few funds like VIG, DGRO, DGRW, even VIM that have outperformed SED in recent years. And of course also the covered call ETFs, JAPQ, JAPI, SPYI, QQQI, GPIQ, GPIX and so on and so forth. Those funds a...


In [None]:
# Sanity check: confirm the transcript library can be imported in this kernel
try:
    from youtube_transcript_api import YouTubeTranscriptApi
except Exception as import_failure:
    # Report the problem instead of raising so the cell always completes
    print(f"Import error: {import_failure}")
else:
    print("Import successful!")