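# 🈺 Real-Time Japanese-English Speech-to-Speech Translation (S2ST) — Phase 2
Pipeline: streaming ASR (speech recognition) → incremental NMT (machine translation) → streaming TTS (speech synthesis)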

In [None]:
!pip install -r requirements.txt

In [None]:
# Imports
from asr.streaming_whisper_asr import StreamingWhisperASR
from nmt.incremental_mt5_translation import IncrementalMT5Translator
from tts.streaming_tts import StreamingTTS
from utils.audio_io import AudioStreamHandler, play_audio
from utils.japanese_utils import tokenize_japanese, detect_honorific
import time, tempfile, soundfile as sf


In [None]:
# Initialize ASR, NMT, TTS
# Each stage is constructed once here; the streaming cell reuses these
# module-level objects for every audio chunk it processes.
# Speech recognizer; later called via asr.stream_transcribe(<wav path>).
asr = StreamingWhisperASR(model_size="medium")
# Translator; later called via translator.translate_incremental(<text>).
translator = IncrementalMT5Translator()
# Synthesizer; later called via tts.stream_synthesize(<text>, <output wav path>).
tts = StreamingTTS()
# Microphone handler; later supplies the input stream, audio chunks and sample rate.
audio_handler = AudioStreamHandler()


In [None]:
# Real-time streaming loop
from contextlib import contextmanager
import numpy as np

@contextmanager
def mic_stream():
    """Keep the microphone input stream open for the duration of a ``with`` block.

    A stream is obtained from the module-level ``audio_handler`` and started
    before control passes to the ``with`` body. Nothing is yielded, so the
    body reads audio through ``audio_handler`` rather than through the stream
    object itself.

    On exit, whether normal or via an exception, the stream is stopped and
    then closed. ``close()`` also runs when ``start()`` or ``stop()`` raises,
    so a failed start or stop does not leave the stream open.
    """
    stream = audio_handler.start_input_stream()
    try:
        # NOTE(review): the handler method is called ``start_input_stream`` yet
        # the stream is started explicitly here — confirm against
        # AudioStreamHandler that this second start is required.
        stream.start()
        try:
            yield
        finally:
            # Only reached once start() has succeeded.
            stream.stop()
    finally:
        # Always release the stream, including when start() or stop() failed.
        stream.close()

print("🎙️ Speak in Japanese — Streaming will begin...")

# Number of microphone chunks pushed through the pipeline in this demo run.
# NOTE(review): the audio duration this covers depends on the chunk size used
# by AudioStreamHandler, which is not visible here — confirm before quoting a
# total duration.
NUM_CHUNKS = 5


def _new_wav_path(directory):
    """Return the path of a fresh, empty, already-closed ``.wav`` file in *directory*."""
    # delete=False keeps the file on disk once the handle is closed, so other
    # components can reopen it by name; the caller's scratch directory is
    # responsible for removing it.
    with tempfile.NamedTemporaryFile(
        delete=False, suffix=".wav", dir=directory
    ) as handle:
        return handle.name


# One dict per processed chunk: source text, translation, latency and register.
results = []

# All intermediate WAV files live in a single scratch directory that is removed
# when the block exits, so repeated runs do not accumulate temp files.
# The microphone context is entered second and therefore exits first.
# NOTE(review): this assumes play_audio() has finished with its file by the
# time the loop ends — confirm that it blocks or loads the audio up front.
with tempfile.TemporaryDirectory() as scratch_dir, mic_stream():
    for _ in range(NUM_CHUNKS):
        # Capture one chunk and persist it, because the ASR stage takes a file path.
        audio_chunk = audio_handler.get_audio_chunk().flatten()
        audio_chunk_fp = _new_wav_path(scratch_dir)
        sf.write(audio_chunk_fp, audio_chunk, audio_handler.samplerate)

        # perf_counter() is monotonic and high-resolution, so the measured
        # interval cannot be skewed by system clock adjustments.
        start_time = time.perf_counter()
        jp_texts = asr.stream_transcribe(audio_chunk_fp)
        jp_text = " ".join(jp_texts)
        print("📝 JP:", jp_text)

        # Honorific register detection
        register = detect_honorific(jp_text)
        print("📛 Register:", register)

        # Translate
        en_text = translator.translate_incremental(jp_text)
        print("🌐 EN:", en_text)

        # TTS: synthesize to a scratch file, then play it back.
        out_path = _new_wav_path(scratch_dir)
        tts.stream_synthesize(en_text, out_path)
        play_audio(out_path)

        # The timed span covers transcription, register detection, translation,
        # synthesis and the play_audio() call; capture and WAV writing are excluded.
        latency = round((time.perf_counter() - start_time) * 1000, 2)
        print(f"⚡ Latency: {latency} ms\n")
        results.append({
            "jp": jp_text, "en": en_text, "latency_ms": latency, "register": register
        })

print("✅ Done.")


In [None]:
# ⏱️ Metrics Summary
# Reports the mean per-chunk latency and lists every JP → EN pair collected in
# ``results`` by the streaming cell.
from statistics import mean

latencies = [entry["latency_ms"] for entry in results]
if latencies:
    print(f"🔁 Average Latency: {mean(latencies):.2f} ms")
else:
    # mean() raises StatisticsError on an empty sequence; this happens when the
    # streaming cell stopped before finishing its first chunk.
    print("🔁 Average Latency: n/a (no chunks processed)")
print("Translation Outputs:")
for entry in results:
    print(f"🈶 JP: {entry['jp']} → 🗣 EN: {entry['en']} ({entry['latency_ms']} ms)")


In [None]:
# [Optional] Translation Metric Evaluation — Add Ref/Hyp Pairs if Available
# from evaluate import load
# metric = load("bleu")
# ref = ["Hello everyone"]
# hyp = ["Hi all"]
# bleu_scores = metric.compute(predictions=hyp, references=[[r] for r in ref])  # own name so `results` from the streaming cell is not overwritten
# print("BLEU:", bleu_scores['bleu'])


## 🔄 Future: Direct S2ST with Translatotron 2
- Candidate implementations: ESPnet or the Google Research repo
- Requires pre-trained speech-to-speech model
- Integration work planned for Phase 3
