# YouTube Subtitle Generator - Google Colab

This notebook generates plain-text transcripts (subtitles) for YouTube videos using OpenAI's Whisper model in a Google Colab environment.

## Features
- Single video processing
- Batch processing for multiple videos/podcasts
- Export results in multiple formats (TXT, CSV, JSON)
- Download links for the generated files

## Setup and Installation

Run the following cells to install required dependencies:

In [None]:
# Install required packages (both are pinned to exact versions)
!pip install yt-dlp==2025.7.21
!pip install openai-whisper==20250625

# Install FFmpeg (required for audio processing)
!apt update && apt install -y ffmpeg

# NOTE: this message is printed unconditionally; it does not check whether the
# commands above succeeded.
print("Installation completed!")

In [None]:
# Import required libraries
import os
import logging
import json
import csv
import pandas as pd
from dataclasses import dataclass
from enum import Enum
from pathlib import Path
from typing import Optional, Dict, Any, List
from datetime import datetime
import ipywidgets as widgets
from IPython.display import display, HTML, FileLink
import zipfile

import yt_dlp
try:
    import whisper
except AttributeError:
    print("Error: Wrong whisper library installed.")
    print("Please run: pip uninstall whisper && pip install openai-whisper")
    raise

print("All imports successful!")

## Configuration

Set up the configuration for subtitle generation:

In [None]:
# Configuration
class WhisperModel(Enum):
    """Whisper model names that can be selected in the configuration.

    Each member's value is the model-name string that is later passed to
    ``whisper.load_model``.
    """
    TINY = "tiny"
    BASE = "base"
    SMALL = "small"
    MEDIUM = "medium"
    LARGE = "large"

@dataclass
class AppConfig:
    """Application configuration settings.

    Groups the output location, the Whisper model to load, the audio
    conversion settings handed to yt-dlp, and the filename length limit.
    """
    # Directory that receives the transcripts and exported result files.
    output_directory: Path = Path("/content/youtube_subtitles")
    # Whisper model loaded when a subtitle generator is created.
    default_model: WhisperModel = WhisperModel.BASE
    # Passed to yt-dlp's FFmpegExtractAudio post-processor as 'preferredcodec';
    # also used as the extension of the downloaded audio file.
    audio_format: str = "mp3"
    # Passed to the same post-processor as 'preferredquality'.
    audio_quality: str = "192"
    # Generated base filenames are truncated to this many characters.
    max_filename_length: int = 100

# Setup logging
# force=True (Python 3.8+) replaces any handlers already attached to the root
# logger. Notebook runtimes commonly pre-configure the root logger, in which
# case a plain basicConfig() call is silently ignored and the INFO progress
# messages emitted during processing never show up.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    force=True,
)

# Create configuration
config = AppConfig()
# parents=True so that a customised, nested output path does not fail with
# FileNotFoundError when intermediate directories are missing.
config.output_directory.mkdir(parents=True, exist_ok=True)

print(f"Configuration set up. Output directory: {config.output_directory}")

In [None]:
# Core classes (same as main script but adapted for Colab)
class YouTubeAudioExtractor:
    """Handles downloading audio from YouTube videos.

    Thin wrapper around ``yt_dlp.YoutubeDL`` that looks up video metadata,
    derives a filesystem-safe name from a title, and downloads a video's
    audio track converted to the configured audio format.
    """

    # Used when a title contains no characters that survive sanitising.
    _FALLBACK_FILENAME = "untitled"

    def __init__(self, config: "AppConfig"):
        """Keep a reference to the application configuration.

        Args:
            config: Settings object; ``audio_format``, ``audio_quality`` and
                ``max_filename_length`` are read from it.
        """
        self.config = config

    def get_video_info(self, video_url: str) -> Dict[str, Any]:
        """Retrieves video information without downloading.

        Args:
            video_url: URL handed to yt-dlp.

        Returns:
            The info dictionary produced by ``YoutubeDL.extract_info`` with
            ``download=False``.
        """
        ydl_opts = {
            'quiet': True,
            'no_warnings': True,
            'extract_flat': True,
        }
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            return ydl.extract_info(video_url, download=False)

    def create_safe_filename(self, video_title: str) -> str:
        """Creates a safe filename from video title.

        Keeps alphanumeric characters, spaces, ``-`` and ``_``; spaces become
        underscores and the result is truncated to
        ``config.max_filename_length`` characters.

        Args:
            video_title: Raw title; ``None`` is treated as an empty title.

        Returns:
            The sanitised name, or ``"untitled"`` when nothing is left after
            sanitising. An empty name would make the output path collapse to
            the output directory itself.
        """
        allowed_extra = {' ', '-', '_'}
        kept = "".join(
            ch for ch in (video_title or "")
            if ch.isalnum() or ch in allowed_extra
        )
        safe_name = kept.replace(' ', '_')[:self.config.max_filename_length]
        return safe_name or self._FALLBACK_FILENAME

    def download_audio(self, video_url: str, output_path: Path) -> Path:
        """Downloads audio from YouTube video.

        Args:
            video_url: URL of the video to download.
            output_path: Destination path *without* extension.

        Returns:
            ``output_path`` with ``.<audio_format>`` appended. The file's
            existence is not checked here.
            NOTE(review): this assumes the post-processor's file extension
            equals the configured codec name, which holds for the default
            ``mp3``; confirm before configuring other codecs.
        """
        output_path = Path(output_path)
        # Give yt-dlp an explicit extension placeholder so the name of the
        # converted file is predictable. '%' is the template escape character,
        # so literal percent signs in the path must be doubled.
        output_template = str(output_path).replace('%', '%%') + '.%(ext)s'
        ydl_opts = {
            'format': 'bestaudio/best',
            'postprocessors': [{
                'key': 'FFmpegExtractAudio',
                'preferredcodec': self.config.audio_format,
                'preferredquality': self.config.audio_quality,
            }],
            'outtmpl': output_template,
            'quiet': True,
        }

        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([video_url])

        # Append rather than with_suffix(): with_suffix would replace
        # everything after the last dot of a name that already contains one.
        return output_path.with_name(
            f"{output_path.name}.{self.config.audio_format}"
        )

class BatchSubtitleGenerator:
    """Enhanced subtitle generator with batch processing capabilities.

    Loads one Whisper model at construction time and reuses it for every
    video. Transcripts are written as ``.txt`` files into
    ``config.output_directory``; batch results can also be exported as CSV
    or JSON.
    """

    # Hosts accepted by validate_youtube_url (their sub-domains are accepted too).
    _YOUTUBE_DOMAINS = ("youtube.com", "youtu.be")
    # Column order of the CSV export.
    _CSV_FIELDS = ('title', 'url', 'duration', 'transcript', 'filename', 'processed_at')

    def __init__(self, config: "AppConfig"):
        """Load the configured Whisper model and prepare the output directory.

        Args:
            config: Settings object; ``default_model`` and
                ``output_directory`` are read from it.
        """
        self.config = config
        self.model = whisper.load_model(config.default_model.value)
        self.youtube_extractor = YouTubeAudioExtractor(config)
        # parents=True: a nested custom output path must not fail on a
        # missing parent directory.
        self.config.output_directory.mkdir(parents=True, exist_ok=True)

    def validate_youtube_url(self, url: str) -> bool:
        """Validates if the provided URL is a YouTube link.

        The check is made on the URL's host name rather than on the raw
        string, so a foreign URL that merely mentions ``youtube.com`` in its
        path or query is rejected. URLs without a scheme
        (``youtube.com/watch?v=...``) are still accepted.

        Args:
            url: Candidate URL; ``None`` or blank input is invalid.

        Returns:
            True when the host is ``youtube.com``, ``youtu.be`` or one of
            their sub-domains.
        """
        from urllib.parse import urlsplit  # local import keeps the class self-contained

        candidate = (url or "").strip()
        if not candidate:
            return False
        if "://" not in candidate:
            # urlsplit only recognises a host when a scheme (or '//') is present.
            candidate = f"https://{candidate}"
        try:
            host = (urlsplit(candidate).hostname or "").lower()
        except ValueError:
            # Raised for malformed authorities, e.g. unbalanced IPv6 brackets.
            return False
        return any(
            host == domain or host.endswith(f".{domain}")
            for domain in self._YOUTUBE_DOMAINS
        )

    def generate_single_subtitle(self, video_url: str) -> Optional[Dict[str, Any]]:
        """Generates subtitles for a single YouTube video.

        Downloads the audio, transcribes it with the loaded Whisper model,
        writes the transcript to ``<output_directory>/<safe title>.txt`` and
        removes the temporary audio file whether or not transcription
        succeeded.

        Args:
            video_url: URL of the video to process.

        Returns:
            A dict with the keys ``url``, ``title``, ``transcript``,
            ``filename``, ``text_file``, ``duration`` and ``processed_at``,
            or ``None`` when the URL is invalid or any step fails (the
            failure is logged, not raised).
        """
        final_audio_path: Optional[Path] = None
        try:
            if not self.validate_youtube_url(video_url):
                logging.error(f"Invalid YouTube URL: {video_url}")
                return None

            video_info = self.youtube_extractor.get_video_info(video_url)
            title = video_info['title']
            # An empty name would turn the output path into the directory
            # itself, so fall back to the video id and then to a constant.
            safe_filename = (
                self.youtube_extractor.create_safe_filename(title)
                or str(video_info.get('id') or 'untitled')
            )

            audio_path = self.config.output_directory / safe_filename
            text_path = audio_path.with_name(f"{safe_filename}.txt")

            logging.info(f"Processing: {title}")
            final_audio_path = self.youtube_extractor.download_audio(video_url, audio_path)

            if not final_audio_path.exists():
                raise FileNotFoundError(f"Audio file not found at: {final_audio_path}")

            result = self.model.transcribe(str(final_audio_path))
            transcript = result["text"]

            # Save text file
            text_path.write_text(transcript, encoding='utf-8')

            return {
                'url': video_url,
                'title': title,
                'transcript': transcript,
                'filename': safe_filename,
                'text_file': str(text_path),
                'duration': video_info.get('duration', 'Unknown'),
                'processed_at': datetime.now().isoformat()
            }

        except Exception as e:
            # Deliberate catch-all: one bad video must not abort a whole batch.
            logging.error(f"Failed to generate subtitles for {video_url}: {str(e)}")
            return None
        finally:
            # Clean up the audio file on success and on failure alike.
            if final_audio_path is not None and final_audio_path.exists():
                try:
                    final_audio_path.unlink()
                except OSError as cleanup_error:
                    logging.warning(
                        f"Could not remove audio file {final_audio_path}: {cleanup_error}"
                    )

    def generate_batch_subtitles(self, video_urls: List[str]) -> List[Dict[str, Any]]:
        """Generates subtitles for multiple YouTube videos.

        Videos are processed one after another; progress is printed per
        video.

        Args:
            video_urls: URLs to process; surrounding whitespace is stripped.

        Returns:
            The result dicts of the videos that succeeded, in input order.
            Failed videos are reported on stdout and omitted.
        """
        results = []
        total = len(video_urls)

        for i, url in enumerate(video_urls, 1):
            print(f"\nProcessing video {i}/{total}...")
            result = self.generate_single_subtitle(url.strip())
            if result:
                results.append(result)
                print(f"✓ Completed: {result['title'][:50]}...")
            else:
                print(f"✗ Failed to process: {url}")

        return results

    def save_results_csv(self, results: List[Dict[str, Any]], filename: str = "subtitles_results.csv") -> Path:
        """Saves results to CSV file.

        The header row is always written, so an empty ``results`` list still
        produces a well-formed CSV file. Keys that are not export columns
        (for example ``text_file``) are ignored.

        Args:
            results: Result dicts as returned by ``generate_single_subtitle``.
            filename: Name of the file created inside the output directory.

        Returns:
            Path of the written CSV file.
        """
        csv_path = self.config.output_directory / filename

        with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(
                csvfile,
                fieldnames=list(self._CSV_FIELDS),
                extrasaction='ignore',
            )
            writer.writeheader()
            writer.writerows(results)

        return csv_path

    def save_results_json(self, results: List[Dict[str, Any]], filename: str = "subtitles_results.json") -> Path:
        """Saves results to JSON file.

        Args:
            results: Result dicts as returned by ``generate_single_subtitle``.
            filename: Name of the file created inside the output directory.

        Returns:
            Path of the written JSON file (UTF-8, non-ASCII text kept as-is).
        """
        json_path = self.config.output_directory / filename

        with open(json_path, 'w', encoding='utf-8') as jsonfile:
            json.dump(results, jsonfile, indent=2, ensure_ascii=False)

        return json_path

print("Classes defined successfully!")

## Single Video Processing

Process a single YouTube video:

In [None]:
# Single video processing
generator = BatchSubtitleGenerator(config)


def _make_download_link(file_path, label):
    """Return an HTML download link with the file embedded as a data URI.

    A plain ``href`` to a path on the notebook VM (``/content/...``) cannot
    be fetched by the browser, so the file content is base64-encoded into
    the link itself. Intended for small files such as transcripts.

    Args:
        file_path: File to embed (``str`` or ``Path``).
        label: Visible link text.

    Returns:
        An ``IPython.display.HTML`` object ready for ``display``.
    """
    import base64
    import html
    import mimetypes

    file_path = Path(file_path)
    mime_type = mimetypes.guess_type(file_path.name)[0] or "application/octet-stream"
    payload = base64.b64encode(file_path.read_bytes()).decode("ascii")
    return HTML(
        f'<a href="data:{mime_type};base64,{payload}" '
        f'download="{html.escape(file_path.name, quote=True)}">{html.escape(label)}</a>'
    )


# Input widget for video URL
video_url_widget = widgets.Text(
    value='',
    placeholder='Enter YouTube video URL',
    description='Video URL:',
    style={'description_width': 'initial'},
    layout=widgets.Layout(width='500px')
)

process_button = widgets.Button(
    description='Generate Subtitles',
    button_style='primary',
    layout=widgets.Layout(width='200px')
)

# All output of the click handler is captured in this widget.
output_widget = widgets.Output()


def process_single_video(button):
    """Click handler: transcribe the URL in the text box and show the result.

    Prints a status message, a preview of the first 500 transcript
    characters and a download link for the transcript file.

    Args:
        button: The clicked button (supplied by ipywidgets, unused).
    """
    with output_widget:
        output_widget.clear_output()

        video_url = video_url_widget.value.strip()
        if not video_url:
            print("Please enter a YouTube video URL")
            return

        print("Processing video...")
        result = generator.generate_single_subtitle(video_url)

        if not result:
            print("❌ Failed to process video")
            return

        print(f"\n✓ Successfully processed: {result['title']}")
        print(f"📁 Text file saved: {result['text_file']}")
        print(f"\n📝 First 500 characters of transcript:")
        print(f"{result['transcript'][:500]}...")

        # Provide download link
        display(_make_download_link(result['text_file'], 'Download transcript file'))


process_button.on_click(process_single_video)

display(video_url_widget)
display(process_button)
display(output_widget)

## Batch Processing

Process multiple YouTube videos at once:

In [None]:
# Batch processing
def _make_download_link(file_path, label):
    """Return an HTML download link with the file embedded as a data URI.

    A plain ``href`` to a path on the notebook VM (``/content/...``) cannot
    be fetched by the browser, so the file content is base64-encoded into
    the link itself. Intended for small files such as transcripts.

    Args:
        file_path: File to embed (``str`` or ``Path``).
        label: Visible link text.

    Returns:
        An ``IPython.display.HTML`` object ready for ``display``.
    """
    import base64
    import html
    import mimetypes

    file_path = Path(file_path)
    mime_type = mimetypes.guess_type(file_path.name)[0] or "application/octet-stream"
    payload = base64.b64encode(file_path.read_bytes()).decode("ascii")
    return HTML(
        f'<a href="data:{mime_type};base64,{payload}" '
        f'download="{html.escape(file_path.name, quote=True)}">{html.escape(label)}</a>'
    )


batch_urls_widget = widgets.Textarea(
    value='',
    placeholder='Enter YouTube URLs (one per line)',
    description='Video URLs:',
    style={'description_width': 'initial'},
    layout=widgets.Layout(width='500px', height='150px')
)

output_format_widget = widgets.Dropdown(
    options=[('Text files only', 'txt'), ('CSV file', 'csv'), ('JSON file', 'json'), ('All formats', 'all')],
    value='all',
    description='Output format:',
    style={'description_width': 'initial'}
)

batch_process_button = widgets.Button(
    description='Process Batch',
    button_style='success',
    layout=widgets.Layout(width='200px')
)

# All output of the click handler is captured in this widget.
batch_output_widget = widgets.Output()


def process_batch_videos(button):
    """Click handler: transcribe every URL in the text area.

    Individual ``.txt`` transcripts are always written by the generator;
    CSV and/or JSON summaries are added according to the selected output
    format. Download links and a per-video summary are shown afterwards.

    Args:
        button: The clicked button (supplied by ipywidgets, unused).
    """
    with batch_output_widget:
        batch_output_widget.clear_output()

        urls_text = batch_urls_widget.value.strip()
        if not urls_text:
            print("Please enter YouTube video URLs")
            return

        # Split URLs by lines and filter empty lines
        video_urls = [line.strip() for line in urls_text.split('\n') if line.strip()]

        if not video_urls:
            print("No valid URLs found")
            return

        print(f"Processing {len(video_urls)} videos...")
        results = generator.generate_batch_subtitles(video_urls)

        if not results:
            print("❌ No videos were successfully processed")
            return

        print(f"\n✅ Successfully processed {len(results)}/{len(video_urls)} videos")

        output_format = output_format_widget.value

        # Save in requested formats; each entry is (link label, file path).
        download_links = []

        if output_format in ('csv', 'all'):
            csv_path = generator.save_results_csv(results)
            download_links.append(('CSV file', csv_path))
            print(f"📊 CSV file saved: {csv_path}")

        if output_format in ('json', 'all'):
            json_path = generator.save_results_json(results)
            download_links.append(('JSON file', json_path))
            print(f"📋 JSON file saved: {json_path}")

        print(f"\n📁 Individual text files saved in: {config.output_directory}")

        # Show download links
        if download_links:
            print("\n📥 Download files:")
            for name, path in download_links:
                display(_make_download_link(path, name))

        # Show summary
        print("\n📊 Processing Summary:")
        for i, result in enumerate(results, 1):
            print(f"{i}. {result['title'][:60]}... - {len(result['transcript'])} characters")


batch_process_button.on_click(process_batch_videos)

display(batch_urls_widget)
display(output_format_widget)
display(batch_process_button)
display(batch_output_widget)

## File Management

View and download generated files:

In [None]:
# List all generated files
def _make_download_link(file_path, label):
    """Return an HTML download link with the file embedded as a data URI.

    A plain ``href`` to a path on the notebook VM (``/content/...``) cannot
    be fetched by the browser, so the file content is base64-encoded into
    the link itself. Intended for small files such as transcripts.

    Args:
        file_path: File to embed (``str`` or ``Path``).
        label: Visible link text.

    Returns:
        An ``IPython.display.HTML`` object ready for ``display``.
    """
    import base64
    import html
    import mimetypes

    file_path = Path(file_path)
    mime_type = mimetypes.guess_type(file_path.name)[0] or "application/octet-stream"
    payload = base64.b64encode(file_path.read_bytes()).decode("ascii")
    return HTML(
        f'<a href="data:{mime_type};base64,{payload}" '
        f'download="{html.escape(file_path.name, quote=True)}">{html.escape(label)}</a>'
    )


def list_generated_files():
    """Print every entry of the output directory with its size.

    A download link is shown for ``.txt``, ``.csv`` and ``.json`` files.

    Returns:
        The list of paths found, or ``None`` when the directory is missing
        or empty.
    """
    output_dir = config.output_directory
    if not output_dir.exists():
        print("No output directory found")
        return

    files = list(output_dir.glob('*'))
    if not files:
        print("No files generated yet")
        return

    print(f"Generated files in {output_dir}:")
    print("=" * 50)

    downloadable_suffixes = ('.txt', '.csv', '.json')
    for file_path in sorted(files):
        file_size_mb = file_path.stat().st_size / (1024 * 1024)
        print(f"📄 {file_path.name} ({file_size_mb:.2f} MB)")
        # is_file() guards against a directory whose name ends in one of the suffixes.
        if file_path.suffix in downloadable_suffixes and file_path.is_file():
            display(_make_download_link(file_path, f'Download {file_path.name}'))

    return files


# Create a button to refresh file list
refresh_button = widgets.Button(
    description='Refresh File List',
    button_style='info'
)

file_list_output = widgets.Output()


def refresh_files(button):
    """Click handler: clear the output area and list the files again."""
    with file_list_output:
        file_list_output.clear_output()
        list_generated_files()


refresh_button.on_click(refresh_files)

display(refresh_button)
display(file_list_output)

# Initial file list
with file_list_output:
    list_generated_files()

In [None]:
# Create a zip file with all generated files for easy download
def _make_download_link(file_path, label):
    """Return an HTML download link with the file embedded as a data URI.

    A plain ``href`` to a path on the notebook VM (``/content/...``) cannot
    be fetched by the browser, so the file content is base64-encoded into
    the link itself. Intended for small files such as transcripts.

    Args:
        file_path: File to embed (``str`` or ``Path``).
        label: Visible link text.

    Returns:
        An ``IPython.display.HTML`` object ready for ``display``.
    """
    import base64
    import html
    import mimetypes

    file_path = Path(file_path)
    mime_type = mimetypes.guess_type(file_path.name)[0] or "application/octet-stream"
    payload = base64.b64encode(file_path.read_bytes()).decode("ascii")
    return HTML(
        f'<a href="data:{mime_type};base64,{payload}" '
        f'download="{html.escape(file_path.name, quote=True)}">{html.escape(label)}</a>'
    )


def create_zip_download():
    """Bundle all generated files into one timestamped zip archive.

    Earlier zip archives in the output directory are not included. When
    nothing else is present, no archive is created.

    Returns:
        Path of the new archive, or ``None`` when there was nothing to zip.
    """
    output_dir = config.output_directory
    # Filter before the emptiness check so that a directory holding only old
    # archives does not yield an empty zip file.
    members = [
        path for path in sorted(output_dir.glob('*'))
        if path.is_file() and not path.name.endswith('.zip')
    ]

    if not members:
        print("No files to zip")
        return

    zip_path = output_dir / f"subtitles_batch_{datetime.now().strftime('%Y%m%d_%H%M%S')}.zip"

    # ZIP_DEFLATED: the default (ZIP_STORED) would not compress the text files.
    with zipfile.ZipFile(zip_path, 'w', compression=zipfile.ZIP_DEFLATED) as zipf:
        for file_path in members:
            zipf.write(file_path, file_path.name)

    print(f"✅ Created zip file: {zip_path.name}")
    display(_make_download_link(zip_path, 'Download All Files (ZIP)'))
    return zip_path


zip_button = widgets.Button(
    description='Create ZIP Download',
    button_style='warning'
)

zip_output = widgets.Output()


def create_zip(button):
    """Click handler: clear the output area and build a fresh archive."""
    with zip_output:
        zip_output.clear_output()
        create_zip_download()


zip_button.on_click(create_zip)

display(zip_button)
display(zip_output)

## Tips and Best Practices

1. **For better accuracy**: Use a larger Whisper model (`medium`/`large`) by setting `config.default_model` (e.g. `WhisperModel.MEDIUM`) before the generator is created
2. **For faster processing**: Use a smaller model (`tiny`/`base`)
3. **Batch processing**: Process videos in smaller batches to avoid timeouts
4. **File management**: Download your files regularly as Colab sessions are temporary
5. **URL formats**: Both youtube.com and youtu.be URLs are supported

## Troubleshooting

- If a video fails to process, check if the URL is valid and the video is publicly accessible
- For very long videos, consider using a smaller Whisper model to reduce processing time
- If you encounter memory issues, restart the runtime and process fewer videos at once