<a href="https://colab.research.google.com/github/meizhong986/WhisperJAV/blob/main/notebook/WhisperJAV_colab_edition_expert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# WhisperJAV Colab Edition v1.7.4 (Expert)

| User Mode | What it does | Speed |
|------|--------------|-------|
| **Standard** | Processes your video once | Faster |
| **Two-Step** | Processes twice and combines for better accuracy | Slower |

| Option | What it controls |
|--------|------------------|
| **Scene Detection** | How to split audio into chunks (auditok, silero, semantic) |
| **Speech Segmenter** | How to detect speech in audio (silero, ten) |
| **Speech Enhancer** | Audio cleanup for noisy sources (ffmpeg-dsp, clearvoice, etc.) |
| **Model** | Which AI model to use (large-v2, large-v3, turbo, kotoba) |

---
<div style="font-size: 8px; line-height: 1.0;">
1. Upload your videos to <code>Google Drive/WhisperJAV/</code><br>
2. Run <b>Step 1: Settings</b> (required)<br>
3. Run <b>Step 1.5: Expert Options</b> (optional - skip if unsure)<br>
4. Run <b>Step 2: Transcribe</b> and wait for completion<br>
5. Run <b>Step 3: AI Translation</b> (only needed if you selected "English (AI translate)" in Step 1)
</div>

<small>The notebook will automatically disconnect when finished to save your GPU credits.</small>

In [None]:
#@title Step 1: Settings { display-mode: "form" }

# Colab form cell. Every assignment below that carries a `#@param` comment is a
# user-editable setting; the chosen values are validated and collected into
# WHISPERJAV_CONFIG later in this cell.

#@markdown **Transcription**
# Pass-1 settings. In Standard mode these are the only transcription settings used.
quality = "balanced" #@param ["faster", "fast", "balanced", "fidelity", "transformers"]
speech_detection = "aggressive" #@param ["conservative", "balanced", "aggressive"]
speech_segmenter = "automatic" #@param ["automatic", "silero", "ten", "none"]
model = "automatic" #@param ["automatic", "large-v2", "large-v3", "turbo", "kotoba-bilingual", "kotoba-v2.0", "kotoba-v2.1", "kotoba-v2.2"]

#@markdown ---
#@markdown **Two-Step Processing** *(slower but more accurate)*
# The secondpass_* values and merge_method are only sent to the CLI when
# use_two_step is True (the `--ensemble` branch in Step 2).
use_two_step = False #@param {type:"boolean"}
secondpass_quality = "transformers" #@param ["faster", "fast", "balanced", "fidelity", "transformers"]
secondpass_sensitivity = "aggressive" #@param ["conservative", "balanced", "aggressive"]
secondpass_speech_segmenter = "automatic" #@param ["automatic", "silero", "ten", "none"]
secondpass_model = "automatic" #@param ["automatic", "large-v2", "large-v3", "turbo", "kotoba-bilingual", "kotoba-v2.0", "kotoba-v2.1", "kotoba-v2.2"]
merge_method = "prefer first step" #@param ["automatic", "keep all", "prefer first step", "prefer second step"]

#@markdown ---
#@markdown **Files & Output**
# folder_name is resolved under /content/drive/MyDrive/ in Step 2.
folder_name = "WhisperJAV" #@param {type:"string"}
subtitle_language = "Japanese" #@param ["Japanese", "English (auto-translate)", "English (AI translate)"]

#@markdown ---
#@markdown **AI Translation** *(if selected "English (AI translate)")*
# api_key is exported as the provider's environment variable in Step 3.
translation_service = "deepseek" #@param ["deepseek", "openrouter", "gemini", "claude", "gpt"]
api_key = "" #@param {type:"string"}
translation_style = "standard" #@param ["standard", "explicit"]

#@markdown ---
#@markdown **Credits**
# Non-empty credit strings are written into every newly created SRT in Step 2.
opening_credit = "" #@param {type:"string"}
closing_credit = "Subs by WhisperJAV" #@param {type:"string"}

#@markdown ---
#@markdown **Session**
auto_disconnect = True #@param {type:"boolean"}
#@markdown ☝️ Auto-disconnect when done (saves GPU credits)
# Lookup tables: form label -> value handed to the WhisperJAV command line.

# Two-step merge strategy.
combine_map = {
    "automatic": "smart_merge",
    "keep all": "full_merge",
    "prefer first step": "pass1_primary",
    "prefer second step": "pass2_primary",
}

# Subtitle output language.
language_map = {
    "Japanese": "native",
    "English (auto-translate)": "direct-to-english",
    "English (AI translate)": "llm",
}

# AI translation tone.
tone_map = {
    "standard": "standard",
    "explicit": "pornify",
}

# Speech segmenter: "automatic" becomes None so the pipeline default is used;
# every other label is passed through unchanged.
segmenter_map = {
    "automatic": None,
    **{label: label for label in ("silero", "ten", "none")},
}

# Model: "automatic" becomes None so the pipeline default is used.
model_map = {"automatic": None}
model_map.update({
    "large-v2": "large-v2",
    "large-v3": "large-v3",
    "turbo": "large-v3-turbo",
})
model_map.update({
    "kotoba-bilingual": "kotoba-tech/kotoba-whisper-bilingual-v1.0",
    "kotoba-v2.0": "kotoba-tech/kotoba-whisper-v2.0",
    "kotoba-v2.1": "kotoba-tech/kotoba-whisper-v2.1",
    "kotoba-v2.2": "kotoba-tech/kotoba-whisper-v2.2",
})

# Compatibility rules enforced below:
# - Kotoba models (HuggingFace) run ONLY on the "transformers" pipeline.
# - Legacy models (large-v2/v3/turbo) run on every pipeline.
KOTOBA_MODELS = {"kotoba-bilingual", "kotoba-v2.0", "kotoba-v2.1", "kotoba-v2.2"}
LEGACY_PIPELINES = {"faster", "fast", "balanced", "fidelity"}

# One message per incompatible combination that was silently fixed.
warnings_list = []

# Pass 1: a Kotoba model on a legacy pipeline is switched to "transformers".
if model in KOTOBA_MODELS:
    if quality in LEGACY_PIPELINES:
        warnings_list.append(f"Pass 1: {model} requires 'transformers' pipeline. Auto-correcting from '{quality}' to 'transformers'.")
        quality = "transformers"

# Pass 2: same rule, but only when two-step processing is switched on.
if use_two_step:
    if secondpass_model in KOTOBA_MODELS and secondpass_quality in LEGACY_PIPELINES:
        warnings_list.append(f"Pass 2: {secondpass_model} requires 'transformers' pipeline. Auto-correcting from '{secondpass_quality}' to 'transformers'.")
        secondpass_quality = "transformers"

# Settings shared with the later cells. Keys without a leading underscore hold
# the values passed to the CLI; keys with one keep the original form labels for
# status messages.
WHISPERJAV_CONFIG = dict(
    use_two_step=use_two_step,
    # Pass 1
    pass1_pipeline=quality,
    pass1_sensitivity=speech_detection,
    pass1_speech_segmenter=segmenter_map[speech_segmenter],
    pass1_model=model_map[model],
    # Pass 2
    pass2_pipeline=secondpass_quality,
    pass2_sensitivity=secondpass_sensitivity,
    pass2_speech_segmenter=segmenter_map[secondpass_speech_segmenter],
    pass2_model=model_map[secondpass_model],
    merge_strategy=combine_map[merge_method],
    # Files, translation, credits, session
    folder_name=folder_name,
    subtitle_language=language_map[subtitle_language],
    translation_service=translation_service,
    api_key=api_key,
    translation_style=tone_map[translation_style],
    opening_credit=opening_credit,
    closing_credit=closing_credit,
    auto_disconnect=auto_disconnect,
    # Form labels, kept for display only
    _quality=quality,
    _speech_detection=speech_detection,
    _speech_segmenter=speech_segmenter,
    _model=model,
    _secondpass_quality=secondpass_quality,
    _secondpass_sensitivity=secondpass_sensitivity,
    _secondpass_speech_segmenter=secondpass_speech_segmenter,
    _secondpass_model=secondpass_model,
    _merge_method=merge_method,
    _subtitle_language=subtitle_language,
    _translation_style=translation_style,
)

from IPython.display import display, HTML

# Local stdlib import: folder_name is free text typed by the user, so it is
# HTML-escaped before being embedded in the status banner below (a name
# containing "<" or "&" would otherwise corrupt the rendered banner).
from html import escape as _escape_html

# One yellow banner per auto-correction recorded in warnings_list.
for warning in warnings_list:
    display(HTML(f'<div style="padding:6px 10px;background:#fef9c3;border-radius:4px;font-size:10px;margin-bottom:4px"><b>⚠ Auto-corrected:</b> {warning}</div>'))


def _join_labels(base, *optional):
    """Join labels with '/', skipping optional ones still set to "automatic"."""
    return "/".join([base, *(label for label in optional if label != "automatic")])


# Build the one-line settings summary.
if use_two_step:
    mode_text = "Two-Step"
    p1_info = _join_labels(quality, speech_segmenter, model)
    p2_info = _join_labels(secondpass_quality, secondpass_speech_segmenter, secondpass_model)
    details = f"{p1_info} → {p2_info}"
else:
    mode_text = "Standard"
    details = _join_labels(f"{quality}/{speech_detection}", speech_segmenter, model)

display(HTML(f'<div style="padding:6px 10px;background:#f0f9ff;border-radius:4px;font-size:10px"><b>Settings:</b> {mode_text} ({details}) | Folder: {_escape_html(folder_name)} | Output: {subtitle_language}</div>'))

In [None]:
#@title Step 1.5: Expert Options (Optional) { display-mode: "form" }
#@markdown <font color="gray">*Skip this cell if unsure. Default settings work well for most videos.*</font>
#@markdown
#@markdown <font color="orange">⚠️ **Memory Notice:** Speech enhancement uses additional GPU memory. If you encounter OOM errors, use `none` or `ffmpeg-dsp`.</font>

# Colab form cell. The pass1_* / pass2_* values chosen here are collected into
# WHISPERJAV_EXPERT_CONFIG later in this cell. Step 2 treats that config as
# optional and falls back to the Step 1 settings when this cell was not run.

#@markdown ---
#@markdown ## Pass 1 Settings

#@markdown **Scene Detection** *(how to split audio into chunks)*
pass1_scene_detector = "automatic" #@param ["automatic", "auditok", "silero", "semantic", "none"]
#@markdown <font size="1">auditok=energy-based (fast), silero=VAD-based, semantic=texture clustering (best for complex audio)</font>

#@markdown **Speech Segmenter** *(how to detect speech within chunks)*
# A non-"automatic" choice here replaces the Step 1 speech segmenter in Step 2.
pass1_speech_segmenter = "automatic" #@param ["automatic", "silero", "ten", "none"]

#@markdown **Speech Enhancer** *(audio cleanup for noisy sources)*
pass1_speech_enhancer = "none" #@param ["none", "ffmpeg-dsp", "clearvoice", "zipenhancer", "bs-roformer"]
#@markdown <font size="1">none=skip, ffmpeg-dsp=filters (no GPU), clearvoice=denoise (48kHz), zipenhancer=lightweight, bs-roformer=vocal isolation</font>

#@markdown **FFmpeg DSP Filters** *(only applies when ffmpeg-dsp selected above)*
# The four switches are combined into one comma-separated filter list below.
pass1_ffmpeg_amplify = True #@param {type:"boolean"}
#@markdown ↳ Amplify quiet audio (recommended)
pass1_ffmpeg_loudnorm = False #@param {type:"boolean"}
#@markdown ↳ Loudness normalization
pass1_ffmpeg_compress = False #@param {type:"boolean"}
#@markdown ↳ Dynamic range compression
pass1_ffmpeg_highpass = False #@param {type:"boolean"}
#@markdown ↳ Remove low rumble (<80Hz)

#@markdown ---
#@markdown ## Pass 2 Settings *(only applies when Two-Step is enabled in Step 1)*

#@markdown **Scene Detection**
pass2_scene_detector = "automatic" #@param ["automatic", "auditok", "silero", "semantic", "none"]

#@markdown **Speech Segmenter**
pass2_speech_segmenter = "automatic" #@param ["automatic", "silero", "ten", "none"]

#@markdown **Speech Enhancer**
pass2_speech_enhancer = "none" #@param ["none", "ffmpeg-dsp", "clearvoice", "zipenhancer", "bs-roformer"]

#@markdown **FFmpeg DSP Filters** *(only applies when ffmpeg-dsp selected above)*
pass2_ffmpeg_amplify = True #@param {type:"boolean"}
#@markdown ↳ Amplify quiet audio (recommended)
pass2_ffmpeg_loudnorm = False #@param {type:"boolean"}
#@markdown ↳ Loudness normalization
pass2_ffmpeg_compress = False #@param {type:"boolean"}
#@markdown ↳ Dynamic range compression
pass2_ffmpeg_highpass = False #@param {type:"boolean"}
#@markdown ↳ Remove low rumble (<80Hz)

# ═══════════════════════════════════════════
# BUILD EXPERT CONFIG
# ═══════════════════════════════════════════

def build_ffmpeg_filters(amplify, loudnorm, compress, highpass):
    """Return the enabled filter names joined by commas, or None if all are off.

    The names always appear in the fixed order amplify, loudnorm, compress,
    highpass, regardless of which switches are set.
    """
    switches = (
        ("amplify", amplify),
        ("loudnorm", loudnorm),
        ("compress", compress),
        ("highpass", highpass),
    )
    enabled = [label for label, is_on in switches if is_on]
    if not enabled:
        return None
    return ",".join(enabled)

# "automatic" means "no explicit choice": it becomes None so that the CLI falls
# back to the pipeline default. Any other value is returned untouched.
def map_value(val):
    if val == "automatic":
        return None
    return val

# Expert settings shared with Step 2. Keys without a leading underscore hold CLI
# values (None = option not set); keys with one keep the raw form labels for
# status messages. The ffmpeg filter list is only computed for "ffmpeg-dsp".
WHISPERJAV_EXPERT_CONFIG = dict(
    # Pass 1
    pass1_scene_detector=map_value(pass1_scene_detector),
    pass1_speech_segmenter=map_value(pass1_speech_segmenter),
    pass1_speech_enhancer=(
        pass1_speech_enhancer if pass1_speech_enhancer != "none" else None
    ),
    pass1_ffmpeg_filters=(
        None if pass1_speech_enhancer != "ffmpeg-dsp"
        else build_ffmpeg_filters(
            pass1_ffmpeg_amplify,
            pass1_ffmpeg_loudnorm,
            pass1_ffmpeg_compress,
            pass1_ffmpeg_highpass,
        )
    ),
    # Pass 2
    pass2_scene_detector=map_value(pass2_scene_detector),
    pass2_speech_segmenter=map_value(pass2_speech_segmenter),
    pass2_speech_enhancer=(
        pass2_speech_enhancer if pass2_speech_enhancer != "none" else None
    ),
    pass2_ffmpeg_filters=(
        None if pass2_speech_enhancer != "ffmpeg-dsp"
        else build_ffmpeg_filters(
            pass2_ffmpeg_amplify,
            pass2_ffmpeg_loudnorm,
            pass2_ffmpeg_compress,
            pass2_ffmpeg_highpass,
        )
    ),
    # Form labels, kept for display only
    _pass1_scene_detector=pass1_scene_detector,
    _pass1_speech_segmenter=pass1_speech_segmenter,
    _pass1_speech_enhancer=pass1_speech_enhancer,
    _pass2_scene_detector=pass2_scene_detector,
    _pass2_speech_segmenter=pass2_speech_segmenter,
    _pass2_speech_enhancer=pass2_speech_enhancer,
)

from IPython.display import display, HTML

# Advisory messages about risky option combinations.
warnings = []

# Enhancers flagged as memory-hungry: selecting one for BOTH passes triggers
# the out-of-memory advisory.
heavy_enhancers = {'clearvoice', 'bs-roformer', 'zipenhancer'}
if all(choice in heavy_enhancers for choice in (pass1_speech_enhancer, pass2_speech_enhancer)):
    warnings.append("Using GPU-based enhancement on both passes may cause OOM on T4 GPU. Consider using 'none' or 'ffmpeg-dsp' for one pass.")

# Render each advisory as a yellow banner.
for w in warnings:
    banner = f'<div style="padding:4px 8px;background:#fef9c3;border-radius:4px;font-size:9px;margin:2px 0"><b>⚠️</b> {w}</div>'
    display(HTML(banner))

# Build summary
def summarize_pass(scene, seg, enh, ffmpeg_filters):
    """Describe one pass's non-default expert options as a short string.

    Options left on "automatic" (scene, seg) or "none" (enh) are omitted.
    Returns "defaults" when nothing was changed. For the "ffmpeg-dsp" enhancer
    the filter list, if any, is appended in parentheses.
    """
    labels = [
        f"{tag}:{choice}"
        for tag, choice in (("scene", scene), ("seg", seg))
        if choice != "automatic"
    ]
    if enh != "none":
        filter_note = f"({ffmpeg_filters})" if (enh == "ffmpeg-dsp" and ffmpeg_filters) else ""
        labels.append(f"enh:{enh}{filter_note}")
    return ", ".join(labels) or "defaults"

# Per-pass summaries; the ffmpeg filter list comes from the config built above.
p1_summary = summarize_pass(
    pass1_scene_detector,
    pass1_speech_segmenter,
    pass1_speech_enhancer,
    WHISPERJAV_EXPERT_CONFIG['pass1_ffmpeg_filters'],
)
p2_summary = summarize_pass(
    pass2_scene_detector,
    pass2_speech_segmenter,
    pass2_speech_enhancer,
    WHISPERJAV_EXPERT_CONFIG['pass2_ffmpeg_filters'],
)

# Show both summaries in one banner, then tell the user what to run next.
display(HTML(
    f'<div style="padding:6px 10px;background:#f5f3ff;border-radius:4px;font-size:10px"><b>Expert Settings:</b> Pass 1: {p1_summary} | Pass 2: {p2_summary}</div>'
))
print("\n✓ Expert options configured. Run Step 2 to transcribe.")

In [None]:
#@title Step 2: Transcribe { display-mode: "form" }
#@markdown Connect Drive → Install → Transcribe all media files → Add credits

import os, sys, subprocess, shlex, time
from pathlib import Path
from IPython.display import display, HTML, clear_output

def status(msg, ok=True):
    """Print *msg* prefixed with a check mark, or with a cross when *ok* is falsy."""
    marker = "✗" if not ok else "✓"
    print(f"{marker} {msg}")

def section(title):
    """Print a blank line, then *title* framed by two 40-character rules."""
    rule = '─' * 40
    print(f"\n{rule}\n{title}\n{rule}")

# Step 1 must have run in this session; without its config nothing can proceed.
if 'WHISPERJAV_CONFIG' not in globals():
    display(HTML('<div style="padding:8px;background:#fef2f2;border-radius:4px;color:#991b1b;font-size:10px"><b>Error:</b> Run Step 1 first</div>'))
    raise SystemExit()
cfg = WHISPERJAV_CONFIG

# Step 1.5 is optional: `expert` stays None when that cell was never run.
expert = globals().get('WHISPERJAV_EXPERT_CONFIG')

# ═══════════════════════════════════════════
# CONNECT GOOGLE DRIVE
# ═══════════════════════════════════════════
section("CONNECTING GOOGLE DRIVE")
try:
    from google.colab import drive
    drive.mount('/content/drive', force_remount=False)
    # Working folder inside the user's Drive; created on first use.
    folder_path = Path(f"/content/drive/MyDrive/{cfg['folder_name']}")
    folder_path.mkdir(parents=True, exist_ok=True)
except Exception as mount_error:
    # Any failure (import, mount, mkdir) stops the cell.
    status(f"Failed to connect: {mount_error}", False)
    raise SystemExit()
else:
    status(f"Connected: {folder_path}")

# ═══════════════════════════════════════════
# CHECK GPU
# ═══════════════════════════════════════════
section("CHECKING GPU")
# Ask nvidia-smi for "name, total memory"; a non-zero exit or empty output
# is treated as "no GPU".
gpu_check = subprocess.run(
    "nvidia-smi --query-gpu=name,memory.total --format=csv,noheader",
    shell=True,
    capture_output=True,
    text=True,
)
if gpu_check.returncode == 0 and gpu_check.stdout.strip():
    status(f"GPU: {gpu_check.stdout.strip()}")
else:
    status("No GPU detected. Go to Runtime → Change runtime type → T4 GPU", False)
    raise SystemExit()

# ═══════════════════════════════════════════
# INSTALL WHISPERJAV
# ═══════════════════════════════════════════
section("INSTALLING (2-3 min)")
install_start = time.time()


def _run_install_step(command):
    """Run one shell install command.

    Returns (succeeded, detail), where detail is the last few lines of the
    command's stderr (or stdout when stderr is empty), so that a failure can
    be diagnosed instead of being reported with no context.
    """
    result = subprocess.run(command, shell=True, capture_output=True, text=True)
    captured = (result.stderr or result.stdout or "").strip()
    detail = "\n".join(captured.splitlines()[-15:])
    return result.returncode == 0, detail


# Base installation steps: (shell command, label shown to the user).
steps = [
    ("apt-get update -qq && apt-get install -y -qq ffmpeg portaudio19-dev libc++1 libc++abi1 > /dev/null 2>&1", "System tools"),
    ("pip install -q tqdm numba tiktoken ffmpeg-python soundfile auditok numpy scipy pysrt srt aiofiles jsonschema Pillow colorama librosa matplotlib pyloudnorm requests faster-whisper transformers optimum accelerate huggingface-hub pydantic ten-vad silero-vad pydub regex modelscope addict", "Python packages"),
    ("pip install -q --no-deps git+https://github.com/openai/whisper.git@main", "Whisper"),
    ("pip install -q --no-deps git+https://github.com/meizhong986/stable-ts-fix-setup.git@main", "Stable-TS"),
    ("pip install -q git+https://github.com/meizhong986/WhisperJAV.git@main", "WhisperJAV")
]

# Every base step is mandatory: the first failure stops the cell.
for step_cmd, step_name in steps:
    step_ok, step_detail = _run_install_step(step_cmd)
    if not step_ok:
        status(f"{step_name} failed", False)
        if step_detail:
            print(step_detail)
        raise SystemExit()
    status(step_name)

# Conditional installation of speech enhancer dependencies.
if expert:
    # Enhancer name -> extra pip package. zipenhancer is absent on purpose:
    # it uses modelscope, which is already installed above.
    enhancer_packages = {'clearvoice': 'clearvoice', 'bs-roformer': 'bs-roformer-infer'}
    selected_enhancers = (expert.get('pass1_speech_enhancer'), expert.get('pass2_speech_enhancer'))
    # Sorted so the pip command and the status line are deterministic.
    extra_packages = sorted({enhancer_packages[name] for name in selected_enhancers if name in enhancer_packages})

    if extra_packages:
        extras_ok, extras_detail = _run_install_step(f"pip install -q {' '.join(extra_packages)}")
        if extras_ok:
            status(f"Speech enhancer packages ({', '.join(extra_packages)})")
        else:
            # Best effort: a missing enhancer package does not stop the run.
            status("Speech enhancer packages failed (continuing anyway)", False)
            if extras_detail:
                print(extras_detail)

status(f"Installation complete ({time.time()-install_start:.0f}s)")

# ═══════════════════════════════════════════
# FIND MEDIA FILES
# ═══════════════════════════════════════════
section("SCANNING FILES")
video_types = {'.mp4', '.mkv', '.avi', '.mov', '.wmv', '.flv', '.webm', '.m4v', '.mp3', '.wav', '.flac', '.m4a'}
# Regular files only (a directory named e.g. "clips.mp4" is not media), sorted
# by name so the preview below is stable between runs. Top level only.
videos = sorted(
    (f for f in folder_path.iterdir() if f.is_file() and f.suffix.lower() in video_types),
    key=lambda f: f.name.lower(),
)

if not videos:
    status(f"No media files in {cfg['folder_name']}/", False)
    raise SystemExit()

status(f"Found {len(videos)} file(s)")
# Preview at most five names.
for v in videos[:5]:
    print(f"  • {v.name}")
if len(videos) > 5:
    print(f"  ... and {len(videos)-5} more")

# Snapshot of the SRT files already present, used after transcription to tell
# newly created subtitles from old ones.
existing_srts = set(folder_path.glob('*.srt'))

# ═══════════════════════════════════════════
# TRANSCRIBE
# ═══════════════════════════════════════════
section("TRANSCRIBING")


def _override_option(argv, flag, value):
    """Append ``flag value`` to *argv*, removing an earlier ``flag value`` pair first."""
    if flag in argv:
        pos = argv.index(flag)
        del argv[pos:pos + 2]  # drop the flag and the value that follows it
    argv.extend([flag, value])


def _describe_pass(base, basic_segmenter, model_label, pass_key, show_enhancer=True):
    """Return the status-line description of one pass.

    base            -- leading text (pipeline, optionally with sensitivity)
    basic_segmenter -- Step 1 segmenter label ("automatic" if unset)
    model_label     -- Step 1 model label ("automatic" if unset)
    pass_key        -- "pass1" or "pass2"; selects the expert settings to read
    show_enhancer   -- include the [enh:...] tag (False when the enhancer is
                       not forwarded to the CLI)

    The segmenter shown is the one actually sent to the CLI: the expert choice
    when set, otherwise the Step 1 choice.
    """
    expert_cfg = expert or {}
    segmenter = expert_cfg.get(f'{pass_key}_speech_segmenter') or basic_segmenter
    text = base
    if segmenter != 'automatic':
        text += f" + {segmenter}"
    if model_label != 'automatic':
        text += f" ({model_label})"
    if expert_cfg:
        scene = expert_cfg.get(f'_{pass_key}_scene_detector')
        if scene != 'automatic':
            text += f" [scene:{scene}]"
        enhancer = expert_cfg.get(f'_{pass_key}_speech_enhancer')
        if show_enhancer and enhancer != 'none':
            text += f" [enh:{enhancer}]"
    return text


# Build command - always transcribe to native (Japanese) first
# For "direct-to-english", use Whisper's built-in translation
# For "llm", transcribe to Japanese and translate in Step 3
cmd = ['whisperjav', str(folder_path), '--output-dir', str(folder_path)]

# NOTE(review): expert['pass1_ffmpeg_filters'] / ['pass2_ffmpeg_filters'] are
# computed in Step 1.5 but never forwarded to the CLI in either mode. The flag
# (if any) that accepts them is not visible here - confirm against the
# whisperjav CLI and wire it in.

if cfg['use_two_step']:
    # Two-step ensemble mode
    cmd.extend(['--ensemble',
        '--pass1-pipeline', cfg['pass1_pipeline'],
        '--pass1-sensitivity', cfg['pass1_sensitivity'],
        '--pass2-pipeline', cfg['pass2_pipeline'],
        '--pass2-sensitivity', cfg['pass2_sensitivity'],
        '--merge-strategy', cfg['merge_strategy']])

    # Step 1 segmenters and models; None means "automatic", so no flag is sent.
    for _flag, _value in (
        ('--pass1-speech-segmenter', cfg['pass1_speech_segmenter']),
        ('--pass2-speech-segmenter', cfg['pass2_speech_segmenter']),
        ('--pass1-model', cfg['pass1_model']),
        ('--pass2-model', cfg['pass2_model']),
    ):
        if _value:
            cmd.extend([_flag, _value])

    # Expert options (Step 1.5), applied per pass in a fixed order.
    if expert:
        for _pass in ('pass1', 'pass2'):
            if expert.get(f'{_pass}_scene_detector'):
                cmd.extend([f'--{_pass}-scene-detector', expert[f'{_pass}_scene_detector']])
            if expert.get(f'{_pass}_speech_segmenter'):
                # Expert speech segmenter overrides the Step 1 choice.
                _override_option(cmd, f'--{_pass}-speech-segmenter', expert[f'{_pass}_speech_segmenter'])
            if expert.get(f'{_pass}_speech_enhancer'):
                cmd.extend([f'--{_pass}-speech-enhancer', expert[f'{_pass}_speech_enhancer']])

    # Display mode info
    p1_info = _describe_pass(cfg['_quality'], cfg['_speech_segmenter'], cfg['_model'], 'pass1')
    p2_info = _describe_pass(cfg['_secondpass_quality'], cfg['_secondpass_speech_segmenter'],
                             cfg['_secondpass_model'], 'pass2')

    print(f"Mode: Two-Step")
    print(f"  Pass 1: {p1_info}")
    print(f"  Pass 2: {p2_info}")
    print(f"  Merge: {cfg['_merge_method']}")
else:
    # Single-pass mode
    cmd.extend(['--mode', cfg['pass1_pipeline'], '--sensitivity', cfg['pass1_sensitivity']])

    # Add speech segmenter from basic config if not automatic
    if cfg['pass1_speech_segmenter']:
        cmd.extend(['--speech-segmenter', cfg['pass1_speech_segmenter']])

    # Add model if specified
    if cfg['pass1_model']:
        cmd.extend(['--model', cfg['pass1_model']])

    # Add expert options if provided (single-pass uses pass1 settings)
    if expert:
        if expert.get('pass1_scene_detector'):
            cmd.extend(['--scene-detection-method', expert['pass1_scene_detector']])
        if expert.get('pass1_speech_segmenter'):
            # Expert speech segmenter overrides the Step 1 choice.
            _override_option(cmd, '--speech-segmenter', expert['pass1_speech_segmenter'])
        # Note: Single-pass speech enhancer support depends on CLI implementation

    # No enhancer flag is added above, so the status line must not claim one.
    mode_info = _describe_pass(f"{cfg['_quality']}/{cfg['_speech_detection']}",
                               cfg['_speech_segmenter'], cfg['_model'], 'pass1',
                               show_enhancer=False)
    print(f"Mode: Standard ({mode_info})")
    if expert and expert.get('pass1_speech_enhancer'):
        print(f"Note: speech enhancer '{expert['pass1_speech_enhancer']}' is not applied in Standard mode "
              f"(it is only passed to the CLI in Two-Step mode)")

# Set subtitle language for transcription
if cfg['subtitle_language'] == 'direct-to-english':
    cmd.extend(['--subs-language', 'direct-to-english'])
    print(f"Output: English (Whisper auto-translate)")
else:
    # For both 'native' and 'llm', transcribe to Japanese first
    cmd.extend(['--subs-language', 'native'])
    if cfg['subtitle_language'] == 'llm':
        print(f"Output: Japanese (AI translation will follow in Step 3)")
    else:
        print(f"Output: Japanese")

print(f"Input: {folder_path}\n")

# Run the CLI directly from the argument list (no shell, so no re-quoting of
# paths) and stream its combined stdout/stderr to the notebook line by line.
try:
    process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, bufsize=1, universal_newlines=True)
except OSError as launch_error:
    # E.g. the whisperjav executable is missing from PATH.
    status(f"Transcription failed: could not start whisperjav ({launch_error})", False)
    raise SystemExit()

# The context manager closes the pipe and waits for the process on exit.
with process:
    for line in process.stdout:
        print(line, end='')

if process.returncode != 0:
    status(f"Transcription failed (exit code {process.returncode})", False)
    raise SystemExit()

# ═══════════════════════════════════════════
# IDENTIFY NEW SRT FILES
# ═══════════════════════════════════════════
# Any SRT present now that was absent from the pre-run snapshot counts as new.
all_srts = set(folder_path.glob('*.srt'))
new_srts = sorted(all_srts - existing_srts, key=lambda srt: srt.name)

# Hand the results over to Step 3 through notebook globals.
WHISPERJAV_NEW_SRTS, WHISPERJAV_FOLDER_PATH = new_srts, folder_path

status(f"Created {len(new_srts)} new subtitle file(s)")

# ═══════════════════════════════════════════
# ADD CREDITS
# ═══════════════════════════════════════════
section("ADDING CREDITS")


def _with_credits(srt_text, opening, closing):
    """Return *srt_text* with the optional opening/closing credit cues added.

    The opening cue (index 0, 00:00:00,000-00:00:00,500) is prepended. The
    closing cue (index 9999, 23:59:58,000-23:59:59,000) is appended and is
    always separated from the previous cue by exactly one blank line, whether
    or not the original text ended with a newline. A leading UTF-8 BOM is kept
    at the very start of the result.
    """
    bom = '\ufeff' if srt_text.startswith('\ufeff') else ''
    body = srt_text[len(bom):]
    if opening:
        body = f"0\n00:00:00,000 --> 00:00:00,500\n{opening}\n\n" + body
    if closing:
        body = body.rstrip("\r\n")
        separator = "\n\n" if body else ""  # no leading blank lines in an empty file
        body += f"{separator}9999\n23:59:58,000 --> 23:59:59,000\n{closing}\n"
    return bom + body


if cfg['opening_credit'] or cfg['closing_credit']:
    credits_count = 0
    for srt_file in new_srts:
        # Best effort per file: one unreadable subtitle must not stop the rest.
        try:
            content = srt_file.read_text(encoding='utf-8')
            srt_file.write_text(
                _with_credits(content, cfg['opening_credit'], cfg['closing_credit']),
                encoding='utf-8',
            )
            credits_count += 1
        except Exception as e:
            print(f"  Warning: Could not add credits to {srt_file.name}: {e}")
    status(f"Credits added to {credits_count} file(s)")
else:
    status("No credits configured")

# ═══════════════════════════════════════════
# TRANSCRIPTION COMPLETE
# ═══════════════════════════════════════════
section("TRANSCRIPTION COMPLETE")

if cfg['subtitle_language'] == 'llm' and cfg['api_key']:
    # AI translation follows in Step 3, so the runtime is kept alive here.
    display(HTML(f'<div style="padding:8px 10px;background:#fef9c3;border-radius:4px;border-left:2px solid #ca8a04;font-size:10px"><b>✓ Transcription done!</b> {len(new_srts)} file(s). AI Translation will start next...</div>'))
else:
    display(HTML(f'<div style="padding:8px 10px;background:#f0fdf4;border-radius:4px;border-left:2px solid #16a34a;font-size:10px"><b>✓ Done!</b> {len(new_srts)} subtitle(s) saved to Google Drive/{cfg["folder_name"]}/</div>'))
    if cfg['subtitle_language'] == 'llm' and not cfg['api_key']:
        print("Note: AI translation skipped (no API key provided)")

    # Auto-disconnect if no AI translation needed
    if cfg['auto_disconnect']:
        print("\nAuto-disconnecting in 10s to save GPU credits...")
        time.sleep(10)
        try:
            from google.colab import runtime
            runtime.unassign()
        except Exception as disconnect_error:
            # Best effort: report the failure instead of hiding it, so the user
            # knows the GPU session is still running.
            print(f"Could not auto-disconnect ({disconnect_error}). Please disconnect manually.")

In [None]:
#@title Step 3: AI Translation (if selected) { display-mode: "form" }
#@markdown Translate each subtitle file using AI (only runs if "English (AI translate)" selected)

import os, sys, subprocess, shlex, time
from pathlib import Path
from IPython.display import display, HTML

def status(msg, ok=True):
    """Print *msg* prefixed with a check mark, or with a cross when *ok* is falsy."""
    marker = "✗" if not ok else "✓"
    print(f"{marker} {msg}")

def section(title):
    """Print a blank line, then *title* framed by two 40-character rules."""
    rule = '─' * 40
    print(f"\n{rule}\n{title}\n{rule}")

# ═══════════════════════════════════════════
# CHECK PREREQUISITES
# ═══════════════════════════════════════════
def _show_error(message):
    """Render *message* in the red error banner used by this cell."""
    display(HTML(f'<div style="padding:8px;background:#fef2f2;border-radius:4px;color:#991b1b;font-size:10px"><b>Error:</b> {message}</div>'))


# Steps 1 and 2 publish their results as notebook globals; both must exist.
if 'WHISPERJAV_CONFIG' not in globals():
    _show_error('Run Step 1 first')
    raise SystemExit()

if 'WHISPERJAV_NEW_SRTS' not in globals():
    _show_error('Run Step 2 first')
    raise SystemExit()

cfg, new_srts, folder_path = WHISPERJAV_CONFIG, WHISPERJAV_NEW_SRTS, WHISPERJAV_FOLDER_PATH

# ═══════════════════════════════════════════
# CHECK IF AI TRANSLATION IS NEEDED
# ═══════════════════════════════════════════
# Not an error: the user simply chose another output language.
if cfg['subtitle_language'] != 'llm':
    display(HTML('<div style="padding:8px 10px;background:#f0f9ff;border-radius:4px;border-left:2px solid #3b82f6;font-size:10px"><b>ℹ Skipped:</b> AI translation not selected</div>'))
    raise SystemExit()

if not cfg['api_key']:
    _show_error('No API key provided for AI translation')
    raise SystemExit()

if not new_srts:
    _show_error('No subtitle files to translate')
    raise SystemExit()

# ═══════════════════════════════════════════
# SET UP API KEY
# ═══════════════════════════════════════════
# Translation service -> name of the environment variable that receives the key.
env_map = dict(
    deepseek="DEEPSEEK_API_KEY",
    openrouter="OPENROUTER_API_KEY",
    gemini="GEMINI_API_KEY",
    claude="ANTHROPIC_API_KEY",
    gpt="OPENAI_API_KEY",
)
# Unknown services fall back to the generic "API_KEY" variable.
os.environ.update({env_map.get(cfg['translation_service'], "API_KEY"): cfg['api_key']})

# ═══════════════════════════════════════════
# TRANSLATE EACH SRT FILE
# ═══════════════════════════════════════════
# Local stdlib import: the cell's top-level import line is left untouched.
import tempfile

section("AI TRANSLATION")
print(f"Provider: {cfg['translation_service']}")
print(f"Style: {cfg['_translation_style']}")
print(f"Files to translate: {len(new_srts)}\n")

translated_files = []  # output paths reported by successful runs
failed_files = []      # input SRTs whose translation failed or raised

for i, srt_file in enumerate(new_srts, 1):
    print(f"[{i}/{len(new_srts)}] Translating: {srt_file.name}")

    # Build whisperjav-translate command
    translate_cmd = [
        'whisperjav-translate',
        '-i', str(srt_file),
        '--provider', cfg['translation_service'],
        '-t', 'english',
        '--tone', cfg['translation_style'],
        '--stream'
    ]

    try:
        # stdout goes to a temporary file instead of a pipe: the loop below
        # only drains stderr, and an unread stdout pipe that fills up would
        # block the child forever (deadlock).
        with tempfile.TemporaryFile(mode='w+', encoding='utf-8', errors='replace') as stdout_sink:
            # Argument list, no shell: nothing has to be re-quoted.
            with subprocess.Popen(
                translate_cmd,
                stdout=stdout_sink,
                stderr=subprocess.PIPE,
                bufsize=1,
                universal_newlines=True
            ) as process:
                # Progress is streamed live from stderr.
                for line in process.stderr:
                    print(f"    {line}", end='')
            # The Popen context manager has waited for the process by now.
            stdout_sink.seek(0)
            stdout_output = stdout_sink.read()

        if process.returncode == 0:
            # NOTE(review): stdout is expected to carry the output path. If
            # several lines are printed, the last non-empty one is used -
            # confirm against whisperjav-translate.
            stdout_lines = [ln.strip() for ln in stdout_output.splitlines() if ln.strip()]
            if stdout_lines:
                translated_files.append(Path(stdout_lines[-1]))
            status(f"Completed: {srt_file.name}")
        else:
            status(f"Failed: {srt_file.name}", False)
            failed_files.append(srt_file)

    except Exception as e:
        # Includes a missing whisperjav-translate executable; continue with the next file.
        status(f"Error translating {srt_file.name}: {e}", False)
        failed_files.append(srt_file)

    print()  # Blank line between files

# ═══════════════════════════════════════════
# TRANSLATION COMPLETE
# ═══════════════════════════════════════════
section("COMPLETE")

# Yellow banner when at least one file failed, green banner otherwise.
if failed_files:
    display(HTML(f'<div style="padding:8px 10px;background:#fef9c3;border-radius:4px;border-left:2px solid #ca8a04;font-size:10px"><b>⚠ Partially done!</b> {len(translated_files)}/{len(new_srts)} translated. {len(failed_files)} failed.</div>'))
else:
    display(HTML(f'<div style="padding:8px 10px;background:#f0fdf4;border-radius:4px;border-left:2px solid #16a34a;font-size:10px"><b>✓ All done!</b> {len(new_srts)} Japanese + {len(translated_files)} English subtitle(s) in Google Drive/{cfg["folder_name"]}/</div>'))

# ═══════════════════════════════════════════
# AUTO-DISCONNECT
# ═══════════════════════════════════════════
if cfg['auto_disconnect']:
    print("\nAuto-disconnecting in 10s to save GPU credits...")
    time.sleep(10)
    try:
        from google.colab import runtime
        runtime.unassign()
    except Exception as disconnect_error:
        # Best effort: report the failure instead of hiding it, so the user
        # knows the GPU session is still running.
        print(f"Could not auto-disconnect ({disconnect_error}). Please disconnect manually.")
else:
    print("\nRemember to disconnect manually to save GPU credits.")