<a href="https://colab.research.google.com/github/meizhong986/WhisperJAV/blob/main/notebook/WhisperJAV_colab_edition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# WhisperJAV

**Automatically create Japanese subtitles for your videos.**

| Mode | What it does | Speed |
|------|--------------|-------|
| **Standard** | Processes your video once | Faster |
| **Two-Step** | Processes twice and combines for better accuracy | Slower |

---
### How to Use
1. Upload your videos to `Google Drive/WhisperJAV/`
2. Run **Step 1** to set up
3. Adjust **Step 2** settings if needed
4. Run **Step 3** to create subtitles

In [None]:
#@title Step 1: Set Up { display-mode: "form" }
#@markdown ### Click the Play button to install (takes 2-3 minutes)

import os
import sys
import subprocess
import time
from IPython.display import display, HTML, clear_output

def show_progress(msg, icon, color):
    """Render a one-line status banner in the notebook output.

    Args:
        msg: Text shown in bold inside the banner.
        icon: Short glyph (e.g. an emoji) shown before the message.
        color: CSS color used for the left border and the gradient tint;
            "22" is appended to it for the gradient start, so a 6-digit
            hex value such as "#3498db" is expected.
    """
    box_style = (
        "padding:10px 15px;margin:5px 0;border-radius:8px;"
        f"background:linear-gradient(90deg,{color}22,transparent);"
        f"border-left:4px solid {color}"
    )
    icon_html = f'<span style="font-size:1.2em">{icon}</span>'
    banner = f'<div style="{box_style}">{icon_html} <b>{msg}</b></div>'
    display(HTML(banner))

def run_cmd(cmd, name):
    """Run one installation shell command and report whether it succeeded.

    A progress banner is shown before the command starts. The command's
    output is captured so that a successful install stays quiet; when the
    command fails, the tail of that captured output is printed so the user
    can see why.

    Args:
        cmd: Shell command line, executed with ``shell=True``.
        name: Friendly name of what is being installed, used in the banner.

    Returns:
        True if the command exited with status 0, False otherwise.
    """
    show_progress(f"Installing {name}...", "📥", "#3498db")
    result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
    if result.returncode == 0:
        return True

    # Without this the captured output is thrown away and the failure
    # cannot be diagnosed. Only the tail is shown to keep the cell readable.
    captured = (result.stderr or result.stdout or "").strip()
    if captured:
        print("\n".join(captured.splitlines()[-20:]))
    return False

print("Setting up WhisperJAV...\n")
start = time.time()

# Installation plan, executed in order. Each entry is
# (shell command, friendly name shown in the progress banner).
# NOTE(review): whisper and stable-ts are installed with --no-deps,
# presumably because their dependencies are already covered by the
# "required packages" step -- confirm before changing the package list.
steps = [
    ("apt-get update -qq && apt-get install -y -qq ffmpeg portaudio19-dev > /dev/null 2>&1", "system tools"),
    ("pip install -q tqdm numba tiktoken ffmpeg-python soundfile auditok numpy scipy pysrt srt aiofiles jsonschema Pillow colorama librosa matplotlib pyloudnorm requests faster-whisper transformers optimum accelerate huggingface-hub pydantic", "required packages"),
    ("pip install -q --no-deps git+https://github.com/openai/whisper.git@main", "speech recognition"),
    ("pip install -q --no-deps git+https://github.com/meizhong986/stable-ts-fix-setup.git@main", "timing tools"),
    ("pip install -q git+https://github.com/meizhong986/WhisperJAV.git@main", "WhisperJAV")
]

# Stop at the first failing step; later steps depend on earlier ones.
for cmd, name in steps:
    if not run_cmd(cmd, name):
        show_progress(f"Failed: {name}", "❌", "#e74c3c")
        sys.exit(1)

# Everything installed: wipe the per-step banners before the summary card.
clear_output()
elapsed = time.time() - start

import torch

# Describe the accelerator for the summary card; without CUDA the card
# shows a warning-colored "not available" message instead.
if torch.cuda.is_available():
    device_name = torch.cuda.get_device_name(0)
    total_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    gpu_info = f"{device_name} ({total_gb:.1f}GB)"
    gpu_color = "#2ecc71"
else:
    gpu_info = "Not available (will be slower)"
    gpu_color = "#f39c12"

# Marker checked by Step 3 to confirm that this cell has been run.
WHISPERJAV_INSTALLED = True

ready_card = f'''
<div style="background:linear-gradient(135deg,#1a1a2e,#16213e);padding:25px;border-radius:15px;color:white">
    <h2 style="margin:0 0 15px 0;color:#00d4aa">Ready to Go!</h2>
    <div style="display:grid;grid-template-columns:1fr 1fr;gap:15px">
        <div style="background:#ffffff15;padding:12px;border-radius:8px">
            <div style="color:#888;font-size:0.85em">Setup Time</div>
            <div style="font-size:1.3em;font-weight:bold">{elapsed:.0f} seconds</div>
        </div>
        <div style="background:#ffffff15;padding:12px;border-radius:8px">
            <div style="color:#888;font-size:0.85em">Graphics Card</div>
            <div style="font-size:1em;font-weight:bold;color:{gpu_color}">{gpu_info}</div>
        </div>
    </div>
</div>
'''
display(HTML(ready_card))

In [None]:
#@title Step 2: Choose Your Settings { display-mode: "form" }
#@markdown ---
#@markdown ## Transcription Settings
#@markdown *These control how your video is processed*
quality = "balanced" #@param ["faster", "fast", "balanced", "fidelity", "transformers"]
speech_detection = "aggressive" #@param ["conservative", "balanced", "aggressive"]

#@markdown ---
use_two_step = False #@param {type:"boolean"}
#@markdown ☝️ **Use Two-Step Processing** — Takes longer but more accurate


#@markdown *Only used when Two-Step Processing is enabled*
# These names must not start with a digit: "2ndpass_quality" is not a valid
# Python identifier and makes the whole cell fail with a SyntaxError.
pass2_quality = "transformers" #@param ["faster", "fast", "balanced", "fidelity", "transformers"]
pass2_sensitivity = "aggressive" #@param ["conservative", "balanced", "aggressive"]
merge_method = "prefer first step" #@param ["automatic", "keep all", "prefer first step", "prefer second step"]

#@markdown ---
#@markdown ## Where Are Your Files?
folder_name = "WhisperJAV" #@param {type:"string"}
#@markdown ☝️ Folder name in your Google Drive

#@markdown ---
#@markdown ## Subtitle Language
subtitle_language = "Japanese" #@param ["Japanese", "English (auto-translate)", "English (AI translate)"]

#@markdown ---
#@markdown ## AI Translation Settings
#@markdown *Only needed if you chose "English (AI translate)" above*
translation_service = "deepseek" #@param ["deepseek", "openrouter", "gemini", "claude", "gpt"]
api_key = "" #@param {type:"string"}
translation_style = "standard" #@param ["standard", "explicit"]

#@markdown ---

# Map user-friendly names to internal values (only for non-WhisperJAV terms)
combine_map = {"automatic": "smart_merge", "keep all": "full_merge",
               "prefer first step": "pass1_primary",
               "prefer second step": "pass2_primary"}
language_map = {"Japanese": "native", "English (auto-translate)": "direct-to-english",
                "English (AI translate)": "llm"}

# Settings consumed by Step 3. The key names (including the "_2ndpass_*"
# display keys) are what Step 3 reads, so they are kept as-is.
WHISPERJAV_CONFIG = {
    'use_two_step': use_two_step,
    'pass1_pipeline': quality,
    'pass1_sensitivity': speech_detection,
    'pass2_pipeline': pass2_quality,
    'pass2_sensitivity': pass2_sensitivity,
    'merge_strategy': combine_map[merge_method],
    'folder_name': folder_name,
    'subtitle_language': language_map[subtitle_language],
    'translation_service': translation_service,
    'api_key': api_key,
    'translation_style': translation_style,
    # Keep display values for summary
    '_quality': quality,
    '_speech_detection': speech_detection,
    '_2ndpass_quality': pass2_quality,
    '_2ndpass_sensitivity': pass2_sensitivity,
    '_merge_method': merge_method,
    '_subtitle_language': subtitle_language,
}

from IPython.display import display, HTML

# Build the one-line description and badge color for the summary card.
if use_two_step:
    mode_text = "Two-Step Processing"
    details = f"{quality} → {pass2_quality} → {merge_method}"
    mode_color = "#8b5cf6"
else:
    mode_text = "Standard Processing"
    details = f"{quality} quality, {speech_detection} detection"
    mode_color = "#3b82f6"

# AI translation needs an API key; without one Step 3 produces Japanese
# subtitles only, so the summary says so up front.
lang_display = subtitle_language
if subtitle_language == "English (AI translate)":
    lang_display = f"English via {translation_service}" if api_key else "Japanese (no API key provided)"

display(HTML(f'''
<div style="background:linear-gradient(135deg,#1e293b,#334155);padding:20px;border-radius:12px;color:white">
    <div style="display:flex;align-items:center;gap:10px;margin-bottom:15px">
        <span style="background:{mode_color};padding:4px 12px;border-radius:20px;font-size:0.85em">{mode_text}</span>
        <span style="color:#94a3b8">Settings saved</span>
    </div>
    <table style="color:white;border-collapse:collapse;width:100%">
        <tr><td style="padding:6px 0;color:#64748b;width:100px">Settings:</td><td>{details}</td></tr>
        <tr><td style="padding:6px 0;color:#64748b">Folder:</td><td>Google Drive/{folder_name}/</td></tr>
        <tr><td style="padding:6px 0;color:#64748b">Subtitles:</td><td>{lang_display}</td></tr>
    </table>
</div>
'''))

In [None]:
#@title Step 3: Create Subtitles { display-mode: "form" }
#@markdown ### Click Play to process your videos

import os
import sys
import shlex
from pathlib import Path
from google.colab import drive
from IPython.display import display, HTML

# Steps 1 and 2 each leave a marker variable in the notebook namespace.
# If one is missing, show which step to run and stop this cell.
missing_step_notices = {
    'WHISPERJAV_INSTALLED': '<div style="padding:15px;background:#fef2f2;border-radius:8px;border-left:4px solid #ef4444;color:#991b1b"><b>Please run Step 1 first</b></div>',
    'WHISPERJAV_CONFIG': '<div style="padding:15px;background:#fef2f2;border-radius:8px;border-left:4px solid #ef4444;color:#991b1b"><b>Please run Step 2 first</b></div>',
}
for marker_name, notice_html in missing_step_notices.items():
    if marker_name not in dir():
        display(HTML(notice_html))
        raise SystemExit()

cfg = WHISPERJAV_CONFIG

# Mount Google Drive and make sure the working folder exists.
drive.mount('/content/drive', force_remount=False)
folder_path = Path(f"/content/drive/MyDrive/{cfg['folder_name']}")
folder_path.mkdir(parents=True, exist_ok=True)

# Collect media files directly inside the folder (no recursion),
# matching on the lower-cased file extension.
video_types = {'.mp4', '.mkv', '.avi', '.mov', '.wmv', '.flv', '.webm', '.m4v', '.mp3', '.wav', '.flac', '.m4a'}
videos = [entry for entry in folder_path.iterdir() if entry.suffix.lower() in video_types]

if not videos:
    no_videos_html = f'''
    <div style="padding:15px;background:#fefce8;border-radius:8px;border-left:4px solid #eab308;color:#854d0e">
        <b>No videos found</b><br>
        Please upload your videos to: <b>Google Drive/{cfg['folder_name']}/</b>
    </div>
    '''
    display(HTML(no_videos_html))
    raise SystemExit()

print(f"Found {len(videos)} video(s) in Google Drive/{cfg['folder_name']}/")

# Assemble the whisperjav command line as an argument list.
# Subtitles are written next to the input files (same folder).
cmd = ['whisperjav', str(folder_path), '--output-dir', str(folder_path)]

if cfg['use_two_step']:
    # Two passes whose results are merged by the chosen strategy.
    cmd += [
        '--ensemble',
        '--pass1-pipeline', cfg['pass1_pipeline'],
        '--pass1-sensitivity', cfg['pass1_sensitivity'],
        '--pass2-pipeline', cfg['pass2_pipeline'],
        '--pass2-sensitivity', cfg['pass2_sensitivity'],
        '--merge-strategy', cfg['merge_strategy'],
    ]
    print("\nUsing Two-Step Processing:")
    print(f"  Step 1: {cfg['_quality']} quality, {cfg['_speech_detection']} detection")
    print(f"  Step 2: {cfg['_2ndpass_quality']} quality, {cfg['_2ndpass_sensitivity']} detection")
    print(f"  Combining: {cfg['_merge_method']}")
else:
    # Single pass: only the first-pass settings apply.
    cmd += ['--mode', cfg['pass1_pipeline'], '--sensitivity', cfg['pass1_sensitivity']]
    print("\nUsing Standard Processing:")
    print(f"  Quality: {cfg['_quality']}")
    print(f"  Speech Detection: {cfg['_speech_detection']}")

# Subtitle language: direct English, Japanese plus AI translation
# (only when an API key was supplied), or plain Japanese as the fallback.
if cfg['subtitle_language'] == 'direct-to-english':
    cmd += ['--subs-language', 'direct-to-english']
    print("  Subtitles: English (auto-translate)")
elif cfg['subtitle_language'] == 'llm' and cfg['api_key']:
    # The key is handed over through the provider-specific environment variable.
    env_map = {"deepseek": "DEEPSEEK_API_KEY", "openrouter": "OPENROUTER_API_KEY", "gemini": "GEMINI_API_KEY", "claude": "ANTHROPIC_API_KEY", "gpt": "OPENAI_API_KEY"}
    os.environ[env_map.get(cfg['translation_service'], "API_KEY")] = cfg['api_key']
    cmd += [
        '--subs-language', 'native',
        '--translate',
        '--translate-provider', cfg['translation_service'],
        '--translate-tone', cfg['translation_style'],
    ]
    print(f"  Subtitles: Japanese + English ({cfg['translation_service']})")
else:
    cmd += ['--subs-language', 'native']
    print("  Subtitles: Japanese")

rule = "=" * 50
print("\n" + rule)
print("CREATING SUBTITLES")
print(rule + "\n")

!{shlex.join(cmd)}

subtitle_count = len(list(folder_path.glob("*.srt")))
display(HTML(f'''
<div style="background:linear-gradient(135deg,#14532d,#166534);padding:20px;border-radius:12px;color:white;margin-top:20px">
    <h3 style="margin:0 0 8px 0">All Done!</h3>
    <p style="margin:0;color:#bbf7d0">Your subtitles are in: <b>Google Drive/{cfg['folder_name']}/</b></p>
    <p style="margin:5px 0 0 0;color:#86efac">{subtitle_count} subtitle file(s) created</p>
</div>
'''))