# 🈺 Real-Time Japanese-English S2ST (Phase 2)
Pipeline: streaming ASR (speech recognition) → incremental NMT (machine translation) → streaming TTS (speech synthesis)

In [None]:
!pip install -r requirements.txt

In [None]:
# Imports
from asr.streaming_whisper_asr import StreamingWhisperASR
from nmt.incremental_mt5_translation import IncrementalMT5Translator
from tts.streaming_tts import StreamingTTS
from utils.audio_io import AudioStreamHandler, play_audio
from utils.japanese_utils import tokenize_japanese, detect_honorific
import time, tempfile, soundfile as sf


In [None]:
# Build the pipeline stages in order (ASR, then NMT, then TTS), followed by
# the audio stream handler.
asr_options = {"model_size": "medium", "compute_type": "int8"}
asr = StreamingWhisperASR(**asr_options)
translator = IncrementalMT5Translator()
tts = StreamingTTS()
audio_handler = AudioStreamHandler()


In [None]:
# 🔁 File-based inference (used in place of a real-time mic loop on Colab):
# runs one audio file through ASR → register detection → NMT → TTS, times the
# run, and stores the outcome in `results` as a one-element list of dicts.
import time
import tempfile
import soundfile as sf

# 📂 Path to your uploaded or generated test audio file
audio_chunk_fp = "/content/sample_jp.wav"  # <-- Change if needed

# 🧠 Run the pipeline
# perf_counter() is a monotonic, high-resolution clock, so the measured
# interval cannot be skewed by wall-clock adjustments the way time.time() can.
start_time = time.perf_counter()

# The transcription output is joined with single spaces into one string.
# NOTE(review): presumably stream_transcribe yields text segments; Japanese is
# normally written without spaces — confirm the separator is intended.
jp_texts = asr.stream_transcribe(audio_chunk_fp)
jp_text = " ".join(jp_texts)
print("📝 JP:", jp_text)

register = detect_honorific(jp_text)
print("📛 Register:", register)

en_text = translator.translate_incremental(jp_text)
print("🌐 EN:", en_text)

# 🗣️ TTS synthesis
# Reserve a .wav path and close the handle immediately: delete=False keeps the
# file on disk, and closing it avoids holding an open descriptor while the
# synthesizer is handed the same path.
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_wav:
    out_path = tmp_wav.name
tts.stream_synthesize(en_text, out_path)
play_audio(out_path)

# Elapsed time in milliseconds, rounded to two decimals.
# NOTE(review): the timed span includes play_audio(); if that call blocks
# until playback ends, this figure includes playback time — confirm intent.
latency = round((time.perf_counter() - start_time) * 1000, 2)
print(f"⚡ Latency: {latency} ms")

# One record per processed file; the metrics cell reads these keys.
results = [{
    "jp": jp_text, "en": en_text, "latency_ms": latency, "register": register
}]

print("✅ Done.")


In [None]:
# ⏱️ Metrics summary: report the mean latency over `results`, then list each
# source/translation pair with its individual latency.
from statistics import mean

latencies = [entry['latency_ms'] for entry in results]
average_latency = mean(latencies)
print(f"🔁 Average Latency: {average_latency:.2f} ms")

print("Translation Outputs:")
for r in results:
    source_text, target_text, elapsed_ms = r['jp'], r['en'], r['latency_ms']
    print(f"🈶 JP: {source_text} → 🗣 EN: {target_text} ({elapsed_ms} ms)")


In [None]:
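# [Optional] Translation metric evaluation — add reference/hypothesis pairs if available.
# The scores get their own name so the `results` list built by the pipeline cell
# (and read by the metrics summary) is not overwritten.
# from evaluate import load
# metric = load("bleu")
# ref = ["Hello everyone"]
# hyp = ["Hi all"]
# bleu_scores = metric.compute(predictions=hyp, references=[[r] for r in ref])
# print("BLEU:", bleu_scores['bleu'])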


## 🔄 Future: Direct S2ST with Translatotron 2
- ESPnet or Google Research repo
- Requires pre-trained speech-to-speech model
- Integration work planned for Phase 3
