4211   4.2.1_L’utilisation des différentes fonctionnalités - 1

In [1]:
import os
import re
import ffmpeg
import pysrt
import time
from pydub import AudioSegment
from moviepy.editor import VideoFileClip, AudioFileClip
from faster_whisper import WhisperModel
from shutil import which
import nest_asyncio
from datetime import datetime
import tempfile
import asyncio
import edge_tts
import aiohttp
import ssl
import random
from pydub.silence import detect_nonsilent

nest_asyncio.apply()

# ----- Configuration -----
ffmpeg_path = which("ffmpeg")
if not ffmpeg_path:
    raise RuntimeError("ffmpeg not found. Please install ffmpeg first.")
print(f"✅ ffmpeg found at: {ffmpeg_path}")

input_video       = "to translate/4.2.1_L’utilisation des différentes fonctionnalités - 1.mp4"
base_name         = os.path.splitext(os.path.basename(input_video))[0]
timestamp         = datetime.now().strftime("%Y%m%d_%H%M%S")
output_dir        = f"{base_name}_run_{timestamp}"
model_size        = "small"
update_existing   = True
USE_EDGE_TTS      = True  # still using Edge TTS for re-synthesis/timing

os.makedirs(output_dir, exist_ok=True)
input_name        = os.path.splitext(os.path.basename(input_video))[0]
extracted_audio   = os.path.join(output_dir, f"{input_name}-extracted-audio.wav")
subtitle_file_fr  = os.path.join(output_dir, f"{input_name}-french.srt")
translated_audio  = os.path.join(output_dir, f"{input_name}-french.wav")
output_video      = os.path.join(output_dir, f"{input_name}-french.mp4")
review_file       = os.path.join(output_dir, "translation_review.txt")
debug_log_file    = os.path.join(output_dir, "translation_debug_log.txt")

# ===== Helper: Extract audio =====
def extract_audio():
    (ffmpeg
       .input(input_video)
       .output(extracted_audio, ac=1, ar=16000)
       .overwrite_output()
       .run(capture_stdout=True, capture_stderr=True)
    )
    return extracted_audio

# ===== Helper: Transcribe (assumes French audio) =====
def transcribe(audio_path):
    model = WhisperModel(model_size, device="cpu", compute_type="int8")
    segments, info = model.transcribe(audio_path, beam_size=5)
    print(f"Detected language: {info.language}")
    return info.language, [
        {"start": s.start, "end": s.end, "text": s.text.strip()}
        for s in segments
    ]

# ===== Helper: SubRip time conversion & file generation =====
def time_to_subrip(seconds: float) -> pysrt.SubRipTime:
    h = int(seconds // 3600)
    seconds %= 3600
    m = int(seconds // 60)
    seconds %= 60
    ms = int((seconds - int(seconds)) * 1000)
    return pysrt.SubRipTime(hours=h, minutes=m, seconds=int(seconds), milliseconds=ms)

def generate_subtitle_file(segments, out_path):
    subs = pysrt.SubRipFile()
    for i, seg in enumerate(segments, 1):
        subs.append(pysrt.SubRipItem(
            index=i,
            start=time_to_subrip(seg["start"]),
            end=time_to_subrip(seg["end"]),
            text=seg["text"]
        ))
    subs.save(out_path, encoding="utf-8")
    return out_path

# ===== Review-override parsing (no translation fallback) =====
def parse_review_file(path):
    text   = open(path, encoding="utf-8").read()
    blocks = [b.strip() for b in re.split(r"(?m)^-{3,}\s*$", text) if b.strip()]
    segments = []
    hdr = re.compile(r"Segment\s+\d+\s+\(start:\s*([0-9.]+)s,\s*end:\s*([0-9.]+)s\)")
    for blk in blocks:
        m = hdr.search(blk)
        if not m: continue
        start_s, end_s = float(m.group(1)), float(m.group(2))
        ft, vs, pre, post, soffs, eoffs = None, "+0%", 0.0, 0.0, 0, 0
        phrases, inter = [], []
        for line in blk.splitlines():
            line=line.strip()
            if line.startswith("**Final Translation:**"):
                ft = line.split("**Final Translation:**",1)[1].strip()
            elif line.startswith("**Voice Speed:**"):
                vs = line.split("**Voice Speed:**",1)[1].strip()
            elif line.startswith("**Pre-Silence:**"):
                pre = float(line.split("**Pre-Silence:**",1)[1])
            elif line.startswith("**Post-Silence:**"):
                post = float(line.split("**Post-Silence:**",1)[1])
            elif line.startswith("**Start-Offset:**"):
                soffs = int(line.split("**Start-Offset:**",1)[1])
            elif line.startswith("**End-Offset:**"):
                eoffs = int(line.split("**End-Offset:**",1)[1])
            elif line.startswith("**Inter-Phrase-Silence:**"):
                parts = line.split("**Inter-Phrase-Silence:**",1)[1].strip()
                inter = [max(0,int(x)) for x in parts.split(",")] if parts else []
            elif line.startswith("- "):
                phrases.append(line[2:].strip())
        segments.append({
            "start_s": start_s, "end_s": end_s,
            "final_translation": ft or "",   # no translation fallback
            "voice_speed": vs,
            "pre_silence": pre,
            "post_silence": post,
            "start_offset_ms": soffs,
            "end_offset_ms": eoffs,
            "phrases": phrases,
            "inter_phrase_silences": inter
        })
    print(f"✅ Parsed {len(segments)} review segments.")
    return segments

# ===== TTS & sync =====
async def robust_synthesize_phrase(phrase, out_mp3, voice="fr-FR-DeniseNeural", rate="+0%", max_retries=5):
    for attempt in range(1, max_retries+1):
        try:
            async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=30)) as sess:
                await edge_tts.Communicate(text=phrase, voice=voice, rate=rate).save(out_mp3)
                return
        except Exception as e:
            await asyncio.sleep(2**attempt + random.random())
    raise RuntimeError(f"TTS failed for phrase: {phrase[:30]}…")

def adjust_audio_duration(audio: AudioSegment, target_s: float) -> AudioSegment:
    tgt_ms = int(target_s*1000)
    curr  = len(audio)
    if curr > tgt_ms:   return audio[:tgt_ms]
    if curr < tgt_ms:   return audio + AudioSegment.silent(duration=(tgt_ms-curr))
    return audio

async def async_generate_audio_with_reviewOLD(
    subs_path, out_wav, debug_path, review_path
):
    # 1) Auto-generate a review file (simply echo transcript as final)
    subs = pysrt.open(subs_path)
    with open(review_path, "w", encoding="utf-8") as f:
        f.write("Translation Review File\n")
        f.write("Adjust only timing/speed. Keep phrases as-is.\n")
        f.write("----------------------------------------------------------------\n\n")
        for i, sub in enumerate(subs, 1):
            start_s = sub.start.ordinal/1000
            end_s   = sub.end.ordinal/1000
            text    = sub.text.replace("\n"," ")
            f.write(f"Segment {i} (start: {start_s:.2f}s, end: {end_s:.2f}s)\n")
            f.write(f"**Original:** {text}\n")
            f.write(f"**Auto Translated:** {text}\n")
            f.write(f"**Final Translation:** {text}\n")
            f.write("**Voice Speed:** +0%\n")
            f.write("**Pre-Silence:** 0\n")
            f.write("**Post-Silence:** 100\n")
            f.write("**Start-Offset:** 0\n")
            f.write("**End-Offset:** 0\n")
            f.write("**Inter-Phrase-Silence:** \n")
            f.write(f"- {text}\n\n")
            f.write("----------------------------------------------------------------\n\n")
    # 2) Parse overrides, then synthesize each segment
    segments = parse_review_file(review_path)
    combined = AudioSegment.silent(duration=0)
    debug    = []

    for idx, seg in enumerate(segments):
        start_s, end_s = seg["start_s"], seg["end_s"]
        duration_ms    = int((end_s - start_s)*1000)
        text           = seg["final_translation"]
        rate           = seg["voice_speed"]
        pre_ms, post_ms= seg["pre_silence"], seg["post_silence"]
        soffs, eoffs   = seg["start_offset_ms"], seg["end_offset_ms"]

        # synthesize one clip
        tmp_mp3 = os.path.join(tempfile.gettempdir(), f"tmp_{idx}.mp3")
        await robust_synthesize_phrase(text, tmp_mp3, rate=rate)
        aud = AudioSegment.from_mp3(tmp_mp3)
        os.remove(tmp_mp3)
        aud = adjust_audio_duration(aud, (duration_ms - pre_ms - post_ms)/1000.0)

        # assemble with silences
        seg_audio = AudioSegment.silent(duration=pre_ms) + aud + AudioSegment.silent(duration=post_ms)
        if eoffs>0: seg_audio += AudioSegment.silent(duration=eoffs)
        elif eoffs<0: seg_audio = seg_audio[:eoffs]

        # position on timeline
        start_ms = int(start_s*1000) + soffs
        if len(combined) < start_ms:
            combined += AudioSegment.silent(duration=(start_ms - len(combined)))
        combined = combined[:start_ms] + seg_audio

        debug.append(f"Segment {idx+1}: start={start_ms}ms, dur={len(seg_audio)}ms\n")

    # write out
    with open(debug_path, "w", encoding="utf-8") as df:
        df.write("Debug Log\n\n"); df.writelines(debug)
    combined.export(out_wav, format="wav")
    return out_wav



async def async_generate_audio_with_review(
    subs_path: str,
    out_wav: str,
    debug_path: str,
    review_path: str
):
    """
    1) Generate a review file from the French subtitles.
    2) Pause to allow manual edits of timing, text/phrases, silences, etc.
    3) Parse the (potentially edited) review file.
    4) Synthesize each segment via Edge TTS with the reviewed settings.
    5) Export the combined audio and write a debug log.
    """
    import pysrt
    from pydub import AudioSegment
    import tempfile
    import os
    import random
    import aiohttp
    import edge_tts

    # --- 1) Write out the review file ---
    subs = pysrt.open(subs_path)
    with open(review_path, "w", encoding="utf-8") as f:
        f.write("Translation Review File\n")
        f.write("Edit any **Final Translation**, **Voice Speed**, **Pre/Post-Silence**,\n")
        f.write("**Start-Offset**, **End-Offset**, **Inter-Phrase-Silence** as needed.\n")
        f.write("----------------------------------------------------------------\n\n")
        for i, sub in enumerate(subs, 1):
            start_s = sub.start.ordinal / 1000.0
            end_s   = sub.end.ordinal / 1000.0
            text    = sub.text.replace("\n", " ")
            f.write(f"Segment {i} (start: {start_s:.2f}s, end: {end_s:.2f}s)\n")
            f.write(f"**Original:** {text}\n")
            f.write(f"**Auto Translated:** {text}\n")
            f.write(f"**Final Translation:** {text}\n")
            f.write("**Voice Speed:** +0%\n")
            f.write("**Pre-Silence:** 0\n")
            f.write("**Post-Silence:** 100\n")
            f.write("**Start-Offset:** 0\n")
            f.write("**End-Offset:** 0\n")
            f.write("**Inter-Phrase-Silence:** \n")
            f.write(f"- {text}\n\n")
            f.write("----------------------------------------------------------------\n\n")
    print(f"✅ Review file created at: {review_path}")

    # --- 2) Pause for manual edits ---
    input("🖊️  Please open and edit the review file as needed, then press Enter to continue…\n")

    # --- 3) Parse the (possibly edited) review file ---
    segments = parse_review_file(review_path)

    combined = AudioSegment.silent(duration=0)
    debug    = []

    # Helper to synthesize a single phrase robustly
    async def synthesize(phrase: str, tmp_mp3: str, rate: str):
        max_retries = 5
        for attempt in range(1, max_retries + 1):
            try:
                async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=30)) as sess:
                    await edge_tts.Communicate(text=phrase, voice="fr-FR-DeniseNeural", rate=rate).save(tmp_mp3)
                    return
            except Exception:
                await asyncio.sleep(2 ** attempt + random.random())
        raise RuntimeError(f"TTS failed for phrase: {phrase[:30]}…")

    # --- 4) Synthesize each segment & assemble timeline ---
    for idx, seg in enumerate(segments):
        start_s = seg["start_s"]
        end_s   = seg["end_s"]
        duration_ms = int((end_s - start_s) * 1000)

        text    = seg["final_translation"]
        rate    = seg["voice_speed"]
        pre_ms  = seg["pre_silence"]
        post_ms = seg["post_silence"]
        soffs   = seg["start_offset_ms"]
        eoffs   = seg["end_offset_ms"]

        # Synthesize one clip for the entire segment text
        tmp_mp3 = os.path.join(tempfile.gettempdir(), f"tmp_seg{idx}.mp3")
        await synthesize(text, tmp_mp3, rate)
        aud = AudioSegment.from_mp3(tmp_mp3)
        os.remove(tmp_mp3)

        # Adjust to fit allocated content time (minus pre/post)
        content_s = (duration_ms - pre_ms - post_ms) / 1000.0
        if content_s > 0:
            aud = adjust_audio_duration(aud, content_s)

        # Build segment audio with pre/post-silence and end-offset
        seg_audio = AudioSegment.silent(duration=pre_ms) + aud + AudioSegment.silent(duration=post_ms)
        if eoffs > 0:
            seg_audio += AudioSegment.silent(duration=eoffs)
        elif eoffs < 0:
            seg_audio = seg_audio[:eoffs]

        # Position on timeline with start-offset
        start_ms = int(start_s * 1000) + soffs
        if len(combined) < start_ms:
            combined += AudioSegment.silent(duration=(start_ms - len(combined)))
        combined = combined[:start_ms] + seg_audio

        debug.append(f"Segment {idx+1}: start_at={start_ms}ms, seg_duration={len(seg_audio)}ms\n")

    # --- 5) Export debug log and full WAV ---
    with open(debug_path, "w", encoding="utf-8") as df:
        df.write("Translation Debug Log\n\n")
        df.writelines(debug)

    combined.export(out_wav, format="wav")
    print(f"✅ Generated audio file at: {out_wav}")
    return out_wav





# ===== Merge back into the video =====
def merge_audio_video():
    video = VideoFileClip(input_video)
    audio = AudioFileClip(translated_audio)
    if audio.duration < video.duration:
        pad = AudioSegment.silent(duration=(video.duration-audio.duration)*1000)
        tmp = os.path.join(output_dir, "tmp_full.wav")
        (AudioSegment.from_file(translated_audio) + pad).export(tmp, format="wav")
        audio = AudioFileClip(tmp)
    final = video.set_audio(audio)
    final.write_videofile(output_video, codec="libx264", audio_codec="aac",
                          temp_audiofile="tmp-audio.m4a", remove_temp=True, threads=4)
    video.close(); audio.close()

# ===== Main =====
async def async_main():
    print("1) Extracting audio…")
    extract_audio()
    print("2) Transcribing to generate French subtitles…")
    _, segments = transcribe(extracted_audio)
    generate_subtitle_file(segments, subtitle_file_fr)
    print("3) Synthesizing adjusted French audio…")
    await async_generate_audio_with_review(subtitle_file_fr, translated_audio, debug_log_file, review_file)
    print("4) Merging into output video…")
    merge_audio_video()
    print(f"🎉 Done! Output video at: {output_video}")

if __name__ == "__main__":
    asyncio.run(async_main())


✅ ffmpeg found at: C:\ffmpeg\bin\ffmpeg.EXE
1) Extracting audio…
2) Transcribing to generate French subtitles…
Detected language: fr
3) Synthesizing adjusted French audio…
✅ Parsed 118 review segments.
4) Merging into output video…
Moviepy - Building video 4.2.1_L’utilisation des différentes fonctionnalités - 1_run_20250523_122935\4.2.1_L’utilisation des différentes fonctionnalités - 1-french.mp4.
MoviePy - Writing audio in tmp-audio.m4a


                                                                        

MoviePy - Done.
Moviepy - Writing video 4.2.1_L’utilisation des différentes fonctionnalités - 1_run_20250523_122935\4.2.1_L’utilisation des différentes fonctionnalités - 1-french.mp4



                                                                  

Moviepy - Done !
Moviepy - video ready 4.2.1_L’utilisation des différentes fonctionnalités - 1_run_20250523_122935\4.2.1_L’utilisation des différentes fonctionnalités - 1-french.mp4
🎉 Done! Output video at: 4.2.1_L’utilisation des différentes fonctionnalités - 1_run_20250523_122935\4.2.1_L’utilisation des différentes fonctionnalités - 1-french.mp4


In [1]:
import os
import re
import ffmpeg
import pysrt
import time
from pydub import AudioSegment
from moviepy.editor import VideoFileClip, AudioFileClip
from faster_whisper import WhisperModel
from shutil import which
import nest_asyncio
from datetime import datetime
import tempfile
import asyncio
import edge_tts
import aiohttp
import ssl
import random
from pydub.silence import detect_nonsilent

nest_asyncio.apply()

# ----- Configuration -----
ffmpeg_path = which("ffmpeg")
if not ffmpeg_path:
    raise RuntimeError("ffmpeg not found. Please install ffmpeg first.")
print(f"✅ ffmpeg found at: {ffmpeg_path}")

input_video       = "to translate/4.2.1_L’utilisation des différentes fonctionnalités - 1.mp4"
base_name         = os.path.splitext(os.path.basename(input_video))[0]
timestamp         = datetime.now().strftime("%Y%m%d_%H%M%S")
output_dir        = f"{base_name}_run_{timestamp}"
model_size        = "small"
update_existing   = True
USE_EDGE_TTS      = True  # still using Edge TTS for re-synthesis/timing

os.makedirs(output_dir, exist_ok=True)
input_name        = os.path.splitext(os.path.basename(input_video))[0]
extracted_audio   = os.path.join(output_dir, f"{input_name}-extracted-audio.wav")
subtitle_file_fr  = os.path.join(output_dir, f"{input_name}-french.srt")
translated_audio  = os.path.join(output_dir, f"{input_name}-french.wav")
output_video      = os.path.join(output_dir, f"{input_name}-french.mp4")
review_file       = os.path.join(output_dir, "translation_review.txt")
debug_log_file    = os.path.join(output_dir, "translation_debug_log.txt")

# ===== Helper: Extract audio =====
def extract_audio():
    (ffmpeg
       .input(input_video)
       .output(extracted_audio, ac=1, ar=16000)
       .overwrite_output()
       .run(capture_stdout=True, capture_stderr=True)
    )
    return extracted_audio

# ===== Helper: Transcribe (assumes French audio) =====
def transcribe(audio_path):
    model = WhisperModel(model_size, device="cpu", compute_type="int8")
    segments, info = model.transcribe(audio_path, beam_size=5)
    print(f"Detected language: {info.language}")
    return info.language, [
        {"start": s.start, "end": s.end, "text": s.text.strip()}
        for s in segments
    ]

# ===== Helper: SubRip time conversion & file generation =====
def time_to_subripOLD(seconds: float) -> pysrt.SubRipTime:
    h = int(seconds // 3600)
    seconds %= 3600
    m = int(seconds // 60)
    seconds %= 60
    ms = int((seconds - int(seconds)) * 1000)
    return pysrt.SubRipTime(hours=h, minutes=m, seconds=int(seconds), milliseconds=ms)

def generate_subtitle_file(segments, out_path):
    subs = pysrt.SubRipFile()
    for i, seg in enumerate(segments, 1):
        subs.append(pysrt.SubRipItem(
            index=i,
            start=time_to_subrip(seg["start"]),
            end=time_to_subrip(seg["end"]),
            text=seg["text"]
        ))
    subs.save(out_path, encoding="utf-8")
    return out_path

# ===== Review-override parsing (no translation fallback) =====
def parse_review_file(path):
    text   = open(path, encoding="utf-8").read()
    blocks = [b.strip() for b in re.split(r"(?m)^-{3,}\s*$", text) if b.strip()]
    segments = []
    hdr = re.compile(r"Segment\s+\d+\s+\(start:\s*([0-9.]+)s,\s*end:\s*([0-9.]+)s\)")
    for blk in blocks:
        m = hdr.search(blk)
        if not m: continue
        start_s, end_s = float(m.group(1)), float(m.group(2))
        ft, vs, pre, post, soffs, eoffs = None, "+0%", 0.0, 0.0, 0, 0
        phrases, inter = [], []
        for line in blk.splitlines():
            line=line.strip()
            if line.startswith("**Final Translation:**"):
                ft = line.split("**Final Translation:**",1)[1].strip()
            elif line.startswith("**Voice Speed:**"):
                vs = line.split("**Voice Speed:**",1)[1].strip()
            elif line.startswith("**Pre-Silence:**"):
                pre = float(line.split("**Pre-Silence:**",1)[1])
            elif line.startswith("**Post-Silence:**"):
                post = float(line.split("**Post-Silence:**",1)[1])
            elif line.startswith("**Start-Offset:**"):
                soffs = int(line.split("**Start-Offset:**",1)[1])
            elif line.startswith("**End-Offset:**"):
                eoffs = int(line.split("**End-Offset:**",1)[1])
            elif line.startswith("**Inter-Phrase-Silence:**"):
                parts = line.split("**Inter-Phrase-Silence:**",1)[1].strip()
                inter = [max(0,int(x)) for x in parts.split(",")] if parts else []
            elif line.startswith("- "):
                phrases.append(line[2:].strip())
        segments.append({
            "start_s": start_s, "end_s": end_s,
            "final_translation": ft or "",   # no translation fallback
            "voice_speed": vs,
            "pre_silence": pre,
            "post_silence": post,
            "start_offset_ms": soffs,
            "end_offset_ms": eoffs,
            "phrases": phrases,
            "inter_phrase_silences": inter
        })
    print(f"✅ Parsed {len(segments)} review segments.")
    return segments

# ===== TTS & sync =====
async def robust_synthesize_phrase(phrase, out_mp3, voice="fr-FR-DeniseNeural", rate="+0%", max_retries=5):
    for attempt in range(1, max_retries+1):
        try:
            async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=30)) as sess:
                await edge_tts.Communicate(text=phrase, voice=voice, rate=rate).save(out_mp3)
                return
        except Exception as e:
            await asyncio.sleep(2**attempt + random.random())
    raise RuntimeError(f"TTS failed for phrase: {phrase[:30]}…")

def adjust_audio_duration(audio: AudioSegment, target_s: float) -> AudioSegment:
    tgt_ms = int(target_s*1000)
    curr  = len(audio)
    if curr > tgt_ms:   return audio[:tgt_ms]
    if curr < tgt_ms:   return audio + AudioSegment.silent(duration=(tgt_ms-curr))
    return audio

async def async_generate_audio_with_reviewOLD(
    subs_path, out_wav, debug_path, review_path
):
    # 1) Auto-generate a review file (simply echo transcript as final)
    subs = pysrt.open(subs_path)
    with open(review_path, "w", encoding="utf-8") as f:
        f.write("Translation Review File\n")
        f.write("Adjust only timing/speed. Keep phrases as-is.\n")
        f.write("----------------------------------------------------------------\n\n")
        for i, sub in enumerate(subs, 1):
            start_s = sub.start.ordinal/1000
            end_s   = sub.end.ordinal/1000
            text    = sub.text.replace("\n"," ")
            f.write(f"Segment {i} (start: {start_s:.2f}s, end: {end_s:.2f}s)\n")
            f.write(f"**Original:** {text}\n")
            f.write(f"**Auto Translated:** {text}\n")
            f.write(f"**Final Translation:** {text}\n")
            f.write("**Voice Speed:** +0%\n")
            f.write("**Pre-Silence:** 0\n")
            f.write("**Post-Silence:** 100\n")
            f.write("**Start-Offset:** 0\n")
            f.write("**End-Offset:** 0\n")
            f.write("**Inter-Phrase-Silence:** \n")
            f.write(f"- {text}\n\n")
            f.write("----------------------------------------------------------------\n\n")
    # 2) Parse overrides, then synthesize each segment
    segments = parse_review_file(review_path)
    combined = AudioSegment.silent(duration=0)
    debug    = []

    for idx, seg in enumerate(segments):
        start_s, end_s = seg["start_s"], seg["end_s"]
        duration_ms    = int((end_s - start_s)*1000)
        text           = seg["final_translation"]
        rate           = seg["voice_speed"]
        pre_ms, post_ms= seg["pre_silence"], seg["post_silence"]
        soffs, eoffs   = seg["start_offset_ms"], seg["end_offset_ms"]

        # synthesize one clip
        tmp_mp3 = os.path.join(tempfile.gettempdir(), f"tmp_{idx}.mp3")
        await robust_synthesize_phrase(text, tmp_mp3, rate=rate)
        aud = AudioSegment.from_mp3(tmp_mp3)
        os.remove(tmp_mp3)
        aud = adjust_audio_duration(aud, (duration_ms - pre_ms - post_ms)/1000.0)

        # assemble with silences
        seg_audio = AudioSegment.silent(duration=pre_ms) + aud + AudioSegment.silent(duration=post_ms)
        if eoffs>0: seg_audio += AudioSegment.silent(duration=eoffs)
        elif eoffs<0: seg_audio = seg_audio[:eoffs]

        # position on timeline
        start_ms = int(start_s*1000) + soffs
        if len(combined) < start_ms:
            combined += AudioSegment.silent(duration=(start_ms - len(combined)))
        combined = combined[:start_ms] + seg_audio

        debug.append(f"Segment {idx+1}: start={start_ms}ms, dur={len(seg_audio)}ms\n")

    # write out
    with open(debug_path, "w", encoding="utf-8") as df:
        df.write("Debug Log\n\n"); df.writelines(debug)
    combined.export(out_wav, format="wav")
    return out_wav


async def async_generate_audio_with_reviewEXISTING(
    subs_path: str,
    out_wav: str,
    debug_path: str,
    review_path: str
):
    """
    1) Generate a review file from the French subtitles.
    2) Pause to allow manual edits of timing, text/phrases, silences, etc.
    3) Parse the (potentially edited) review file.
    4) Synthesize each segment via Edge TTS with the reviewed settings.
    5) Export the combined audio and write a debug log.
    """
    import pysrt
    from pydub import AudioSegment
    import tempfile
    import os
    import random
    import aiohttp
    import edge_tts

    # --- 1) Write out the review file ---
    subs = pysrt.open(subs_path)
    with open(review_path, "w", encoding="utf-8") as f:
        f.write("Translation Review File\n")
        f.write("Edit any **Final Translation**, **Voice Speed**, **Pre/Post-Silence**,\n")
        f.write("**Start-Offset**, **End-Offset**, **Inter-Phrase-Silence** as needed.\n")
        f.write("----------------------------------------------------------------\n\n")
        for i, sub in enumerate(subs, 1):
            start_s = sub.start.ordinal / 1000.0
            end_s   = sub.end.ordinal / 1000.0
            text    = sub.text.replace("\n", " ")
            f.write(f"Segment {i} (start: {start_s:.2f}s, end: {end_s:.2f}s)\n")
            f.write(f"**Original:** {text}\n")
            f.write(f"**Auto Translated:** {text}\n")
            f.write(f"**Final Translation:** {text}\n")
            f.write("**Voice Speed:** +0%\n")
            f.write("**Pre-Silence:** 0\n")
            f.write("**Post-Silence:** 100\n")
            f.write("**Start-Offset:** 0\n")
            f.write("**End-Offset:** 0\n")
            f.write("**Inter-Phrase-Silence:** \n")
            f.write(f"- {text}\n\n")
            f.write("----------------------------------------------------------------\n\n")
    print(f"✅ Review file created at: {review_path}")

    # --- 2) Pause for manual edits ---
    input("🖊️  Please open and edit the review file as needed, then press Enter to continue…\n")

    # --- 3) Parse the (possibly edited) review file ---
    segments = parse_review_file(review_path)

    combined = AudioSegment.silent(duration=0)
    debug    = []

    # Helper to synthesize a single phrase robustly
    async def synthesize(phrase: str, tmp_mp3: str, rate: str):
        max_retries = 5
        for attempt in range(1, max_retries + 1):
            try:
                async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=30)) as sess:
                    await edge_tts.Communicate(text=phrase, voice="fr-FR-DeniseNeural", rate=rate).save(tmp_mp3)
                    return
            except Exception:
                await asyncio.sleep(2 ** attempt + random.random())
        raise RuntimeError(f"TTS failed for phrase: {phrase[:30]}…")

    # --- 4) Synthesize each segment & assemble timeline ---
    for idx, seg in enumerate(segments):
        start_s = seg["start_s"]
        end_s   = seg["end_s"]
        duration_ms = int((end_s - start_s) * 1000)

        text    = seg["final_translation"]
        rate    = seg["voice_speed"]
        pre_ms  = seg["pre_silence"]
        post_ms = seg["post_silence"]
        soffs   = seg["start_offset_ms"]
        eoffs   = seg["end_offset_ms"]

        # Synthesize one clip for the entire segment text
        tmp_mp3 = os.path.join(tempfile.gettempdir(), f"tmp_seg{idx}.mp3")
        await synthesize(text, tmp_mp3, rate)
        aud = AudioSegment.from_mp3(tmp_mp3)
        os.remove(tmp_mp3)

        # Adjust to fit allocated content time (minus pre/post)
        content_s = (duration_ms - pre_ms - post_ms) / 1000.0
        if content_s > 0:
            aud = adjust_audio_duration(aud, content_s)

        # Build segment audio with pre/post-silence and end-offset
        seg_audio = AudioSegment.silent(duration=pre_ms) + aud + AudioSegment.silent(duration=post_ms)
        if eoffs > 0:
            seg_audio += AudioSegment.silent(duration=eoffs)
        elif eoffs < 0:
            seg_audio = seg_audio[:eoffs]

        # Position on timeline with start-offset
        start_ms = int(start_s * 1000) + soffs
        if len(combined) < start_ms:
            combined += AudioSegment.silent(duration=(start_ms - len(combined)))
        combined = combined[:start_ms] + seg_audio

        debug.append(f"Segment {idx+1}: start_at={start_ms}ms, seg_duration={len(seg_audio)}ms\n")

    # --- 5) Export debug log and full WAV ---
    with open(debug_path, "w", encoding="utf-8") as df:
        df.write("Translation Debug Log\n\n")
        df.writelines(debug)

    combined.export(out_wav, format="wav")
    print(f"✅ Generated audio file at: {out_wav}")
    return out_wav

async def async_generate_audio_with_reviewandramoavyeo(
    subs_path: str,
    out_wav: str,
    debug_path: str,
    review_path: str
):
    """
    1) Generate a review file from the French subtitles.
    2) Pause to allow manual edits of timing, text/phrases, silences, etc.
    3) Parse the (possibly edited) review file.
    4) Extract the natural pause durations from the original French audio.
    5) Synthesize each segment via Edge TTS, inserting the original pauses.
    6) Export the combined audio and write a debug log.
    """
    import pysrt
    from pydub import AudioSegment
    from pydub.silence import detect_silence, detect_nonsilent
    import tempfile
    import os
    import random
    import aiohttp
    import edge_tts

    # --- Load original French audio once ---
    # 'extracted_audio' should be the path to your original French WAV
    orig_audio = AudioSegment.from_file(extracted_audio, format="wav")

    # --- 1) Write out the review file ---
    subs = pysrt.open(subs_path)
    with open(review_path, "w", encoding="utf-8") as f:
        f.write("Translation Review File\n")
        f.write("Edit any **Final Translation**, **Voice Speed**, **Pre/Post-Silence**,\n")
        f.write("**Start-Offset**, **End-Offset**, **Inter-Phrase-Silence** as needed.\n")
        f.write("----------------------------------------------------------------\n\n")
        for i, sub in enumerate(subs, 1):
            start_s = sub.start.ordinal / 1000.0
            end_s   = sub.end.ordinal / 1000.0
            text    = sub.text.replace("\n", " ")
            f.write(f"Segment {i} (start: {start_s:.2f}s, end: {end_s:.2f}s)\n")
            f.write(f"**Original:** {text}\n")
            f.write(f"**Auto Translated:** {text}\n")
            f.write(f"**Final Translation:** {text}\n")
            f.write("**Voice Speed:** +0%\n")
            f.write("**Pre-Silence:** 0\n")
            f.write("**Post-Silence:** 100\n")
            f.write("**Start-Offset:** 0\n")
            f.write("**End-Offset:** 0\n")
            f.write("**Inter-Phrase-Silence:** \n")
            f.write(f"- {text}\n\n")
            f.write("----------------------------------------------------------------\n\n")
    print(f"✅ Review file created at: {review_path}")

    # --- 2) Pause for manual edits ---
    input("🖊️  Please open and edit the review file as needed, then press Enter to continue…\n")

    # --- 3) Parse the (possibly edited) review file ---
    segments = parse_review_file(review_path)

    combined = AudioSegment.silent(duration=0)
    debug    = []

    # Helper to synthesize a single phrase robustly
    async def synthesize(phrase: str, tmp_mp3: str, rate: str):
        max_retries = 5
        for attempt in range(1, max_retries + 1):
            try:
                async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=30)):
                    await edge_tts.Communicate(
                        text=phrase,
                        voice="fr-FR-DeniseNeural",
                        rate=rate
                    ).save(tmp_mp3)
                    return
            except Exception:
                await asyncio.sleep(2 ** attempt + random.random())
        raise RuntimeError(f"TTS failed for phrase: {phrase[:30]}…")

    # --- 4) Extract natural pauses & synthesize each segment ---
    for idx, seg in enumerate(segments):
        start_s = seg["start_s"]
        end_s   = seg["end_s"]
        duration_ms = int((end_s - start_s) * 1000)

        # 4a) Extract the matching slice from the original
        slice_start = int(start_s * 1000)
        slice_end   = int(end_s   * 1000)
        orig_slice  = orig_audio[slice_start:slice_end]

        # 4b) Detect silent intervals (natural pauses)
        raw_silences = detect_silence(
            orig_slice,
            min_silence_len=50,
            silence_thresh=orig_slice.dBFS - 16
        )
        # Convert to durations
        seg["inter_phrase_silences"] = [
            end - start for start, end in raw_silences
        ]

        # 4c) Prepare TTS settings
        text    = seg["final_translation"]
        rate    = seg["voice_speed"]
        pre_ms  = seg["pre_silence"]
        post_ms = seg["post_silence"]
        soffs   = seg["start_offset_ms"]
        eoffs   = seg["end_offset_ms"]

        # 4d) Synthesize the full-segment text
        tmp_mp3 = os.path.join(tempfile.gettempdir(), f"tmp_seg{idx}.mp3")
        await synthesize(text, tmp_mp3, rate)
        aud = AudioSegment.from_mp3(tmp_mp3)
        os.remove(tmp_mp3)

        # 4e) Trim any TTS trailing silence
        nons = detect_nonsilent(aud, min_silence_len=50, silence_thresh=aud.dBFS - 16)
        if nons:
            aud = aud[nons[0][0]:nons[-1][1]]

        # 4f) Adjust to fit allocated speech time
        speech_time_ms = duration_ms - pre_ms - post_ms
        if speech_time_ms > 0:
            aud = adjust_audio_duration(aud, speech_time_ms / 1000.0)

        # 4g) Assemble segment audio with original pauses
        seg_audio = AudioSegment.silent(duration=pre_ms)
        phrases = [text]  # using full text; if you split into sub-phrases, loop here
        # here we treat the whole text as one phrase; for multiple phrases, you'd split and loop:
        seg_audio += aud
        for pause in seg["inter_phrase_silences"]:
            seg_audio += AudioSegment.silent(duration=pause)
        seg_audio += AudioSegment.silent(duration=post_ms)

        # Apply end-offset
        if eoffs > 0:
            seg_audio += AudioSegment.silent(duration=eoffs)
        elif eoffs < 0:
            seg_audio = seg_audio[:eoffs]

        # 4h) Position on the combined timeline
        start_ms = int(start_s * 1000) + soffs
        if len(combined) < start_ms:
            combined += AudioSegment.silent(duration=(start_ms - len(combined)))
        combined = combined[:start_ms] + seg_audio

        debug.append(
            f"Segment {idx+1}: start_at={start_ms}ms, duration={len(seg_audio)}ms, "
            f"pauses={seg['inter_phrase_silences']}\n"
        )

    # --- 5) Export debug log and final WAV ---
    with open(debug_path, "w", encoding="utf-8") as df:
        df.write("Translation Debug Log\n\n")
        df.writelines(debug)

    combined.export(out_wav, format="wav")
    print(f"✅ Generated audio file at: {out_wav}")
    return out_wav


# ====

import re
import pysrt
from pydub import AudioSegment
from pydub.silence import detect_silence, detect_nonsilent
import tempfile
import os
import random
import aiohttp
import edge_tts
import asyncio

# ——— Helper: convert seconds → SubRipTime —————————————————————————————
def time_to_subrip(seconds: float) -> pysrt.SubRipTime:
    hours = int(seconds // 3600)
    seconds %= 3600
    minutes = int(seconds // 60)
    seconds %= 60
    milliseconds = int((seconds - int(seconds)) * 1000)
    return pysrt.SubRipTime(hours=hours, minutes=minutes,
                            seconds=int(seconds), milliseconds=milliseconds)

# ——— Helper: split Whisper segments into safe-duration groups ————————
def split_long_groups(groups, max_group_duration_secs):
    new = []
    for group in groups:
        start_s = group[0].start
        end_s   = group[-1].end
        total   = end_s - start_s
        if total <= max_group_duration_secs:
            new.append(group)
            continue

        temp = []
        temp_start = start_s
        last_safe = None
        for idx, item in enumerate(group):
            temp.append(item)
            if re.search(r"[.,!?]$", item.text.strip()):
                last_safe = idx
            current_end = item.end
            if (current_end - temp_start) >= max_group_duration_secs:
                if last_safe is not None:
                    new.append(temp[: last_safe+1])
                    temp = temp[last_safe+1 :]
                else:
                    new.append(temp)
                    temp = []
                temp_start = temp[0].start if temp else current_end
                last_safe = None

        if temp:
            new.append(temp)
    return new

# ——— Helper: ensure groups end in punctuation ——————————————————
def enforce_punctuation_boundaries(groups):
    i = 0
    safe_p = re.compile(r"[.!?,;:]$")
    while i < len(groups):
        last_text = groups[i][-1].text.strip()
        if not safe_p.search(last_text):
            if i+1 < len(groups):
                groups[i] += groups.pop(i+1)
                continue
            else:
                groups[i][-1].text += "."
        i += 1
    return groups

# ——— Updated async_generate_audio_with_review ———————————————————————
async def async_generate_audio_with_review(
    subtitle_path: str,
    out_wav: str,
    debug_path: str,
    review_path: str,
    max_group_duration: float = 25.0
):
    """
    1) Re-run Whisper on the already-extracted French audio.
    2) Group into sentence-based segments (max duration X s).
    3) Write & pause for manual review (review_path).
    4) Parse review file.
    5) Extract original pauses from global 'extracted_audio'.
    6) TTS synth per segment with DeniseNeural + original pauses.
    7) Export combined WAV to out_wav and debug log to debug_path.
    """
    import re
    import pysrt
    from faster_whisper import WhisperModel
    from pydub import AudioSegment
    from pydub.silence import detect_silence, detect_nonsilent
    import aiohttp, edge_tts, tempfile, os, random

    # 1) Transcribe original French audio again
    model = WhisperModel(model_size, device="cpu", compute_type="int8")
    raw_segments, _ = model.transcribe(extracted_audio, beam_size=5)

    # Wrap for grouping
    class Seg:
        def __init__(self, start, end, text):
            self.start = start
            self.end   = end
            self.text  = text.strip()

    pys = [Seg(s.start, s.end, s.text) for s in raw_segments]

    # 2) Group into sentence chunks
    sentence_end = re.compile(r"[.!?]\s*$")
    groups, cur = [], []
    for s in pys:
        cur.append(s)
        if sentence_end.search(s.text):
            groups.append(cur); cur = []
    if cur: groups.append(cur)

    # split & enforce
    groups = split_long_groups(groups, max_group_duration)
    groups = enforce_punctuation_boundaries(groups)

    # 3) Write review file for manual edits
    with open(review_path, "w", encoding="utf-8") as f:
        f.write("Review File — ajustez textes, silences, offsets…\n")
        f.write("------------------------------------------------\n\n")
        for i, group in enumerate(groups, 1):
            start_s = group[0].start
            end_s   = group[-1].end
            original = " ".join(g.text for g in group)
            f.write(f"Segment {i} (start: {start_s:.2f}s, end: {end_s:.2f}s)\n")
            f.write(f"**Original:** {original}\n")
            f.write(f"**Final Translation:** {original}\n")
            f.write("**Voice Speed:** +0%\n")
            f.write("**Pre-Silence:** 0\n")
            f.write("**Post-Silence:** 100\n")
            f.write("**Start-Offset:** 0\n")
            f.write("**End-Offset:** 0\n")
            f.write("**Inter-Phrase-Silence:** \n")
            # default phrase list
            phrases = re.split(r"(?<=[.!?])\s+", original)
            for ph in phrases:
                f.write(f"- {ph.strip()}\n")
            f.write("\n------------------------------------------------\n\n")
    print(f"✅ Review file at: {review_path}")
    input("🖊️ Éditez la review file puis appuyez sur Entrée pour continuer…")

    # 4) Parse edited review
    segments = parse_review_file(review_path)

    # 5) Load original for pause detection
    orig_audio = AudioSegment.from_file(extracted_audio, format="wav")
    combined   = AudioSegment.silent(duration=0)
    debug      = []

    # synth helper
    async def synth(phrase, tmp, rate):
        for _ in range(5):
            try:
                async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=30)):
                    await edge_tts.Communicate(text=phrase,
                                               voice="fr-FR-DeniseNeural",
                                               rate=rate).save(tmp)
                    return
            except:
                await asyncio.sleep(0.5 + random.random())
        raise RuntimeError("TTS failed")

    # 6) Build each segment
    for idx, seg in enumerate(segments):
        s_s, e_s = seg["start_s"], seg["end_s"]
        dur_ms    = int((e_s - s_s) * 1000)
        slice_ms  = (int(s_s*1000), int(e_s*1000))
        orig_slice = orig_audio[slice_ms[0]:slice_ms[1]]

        # natural pauses
        raw_sil = detect_silence(orig_slice,
                                 min_silence_len=50,
                                 silence_thresh=orig_slice.dBFS - 16)
        seg["inter_phrase_silences"] = [e - s for s, e in raw_sil]

        # synth
        tmp_mp3 = os.path.join(tempfile.gettempdir(), f"tmp{idx}.mp3")
        await synth(seg["final_translation"], tmp_mp3, seg["voice_speed"])
        aud = AudioSegment.from_mp3(tmp_mp3); os.remove(tmp_mp3)

        # trim
        nons = detect_nonsilent(aud, min_silence_len=50, silence_thresh=aud.dBFS - 16)
        if nons:
            aud = aud[nons[0][0]:nons[-1][1]]

        # fit speech window
        speech_ms = max(0, dur_ms - seg["pre_silence"] - seg["post_silence"])
        aud = adjust_audio_duration(aud, speech_ms/1000.0)

        # assemble with pauses
        seg_a = AudioSegment.silent(duration=seg["pre_silence"]) + aud
        for p in seg["inter_phrase_silences"]:
            seg_a += AudioSegment.silent(duration=p)
        seg_a += AudioSegment.silent(duration=seg["post_silence"])

        # end-offset
        eo = seg["end_offset_ms"]
        seg_a = seg_a if eo>=0 else seg_a[:eo]
        if eo>0: seg_a += AudioSegment.silent(duration=eo)

        # place on timeline
        start_ms = int(s_s*1000) + seg["start_offset_ms"]
        if len(combined) < start_ms:
            combined += AudioSegment.silent(duration=(start_ms - len(combined)))
        combined = combined[:start_ms] + seg_a

        debug.append(f"Seg {idx+1}: start={start_ms}ms, dur={len(seg_a)}ms, pauses={seg['inter_phrase_silences']}\n")

    # 7) Export
    with open(debug_path, "w", encoding="utf-8") as df:
        df.write("Debug Log\n\n"); df.writelines(debug)
    combined.export(out_wav, format="wav")
    print(f"✅ Generated audio at: {out_wav}")
    return out_wav

# ====


# ===== Merge back into the video =====
def merge_audio_video():
    video = VideoFileClip(input_video)
    audio = AudioFileClip(translated_audio)
    if audio.duration < video.duration:
        pad = AudioSegment.silent(duration=(video.duration-audio.duration)*1000)
        tmp = os.path.join(output_dir, "tmp_full.wav")
        (AudioSegment.from_file(translated_audio) + pad).export(tmp, format="wav")
        audio = AudioFileClip(tmp)
    final = video.set_audio(audio)
    final.write_videofile(output_video, codec="libx264", audio_codec="aac",
                          temp_audiofile="tmp-audio.m4a", remove_temp=True, threads=4)
    video.close(); audio.close()

# ===== Main =====
async def async_main():
    print("1) Extracting audio…")
    extract_audio()
    print("2) Transcribing to generate French subtitles…")
    _, segments = transcribe(extracted_audio)
    generate_subtitle_file(segments, subtitle_file_fr)
    print("3) Synthesizing adjusted French audio…")
    await async_generate_audio_with_review(subtitle_file_fr, translated_audio, debug_log_file, review_file)
    print("4) Merging into output video…")
    merge_audio_video()
    print(f"🎉 Done! Output video at: {output_video}")

if __name__ == "__main__":
    asyncio.run(async_main())


✅ ffmpeg found at: C:\ffmpeg\bin\ffmpeg.EXE
1) Extracting audio…
2) Transcribing to generate French subtitles…
Detected language: fr
3) Synthesizing adjusted French audio…
✅ Review file at: 4.2.1_L’utilisation des différentes fonctionnalités - 1_run_20250524_014131\translation_review.txt
✅ Parsed 78 review segments.
✅ Generated audio at: 4.2.1_L’utilisation des différentes fonctionnalités - 1_run_20250524_014131\4.2.1_L’utilisation des différentes fonctionnalités - 1-french.wav
4) Merging into output video…
Moviepy - Building video 4.2.1_L’utilisation des différentes fonctionnalités - 1_run_20250524_014131\4.2.1_L’utilisation des différentes fonctionnalités - 1-french.mp4.
MoviePy - Writing audio in tmp-audio.m4a


                                                                        

MoviePy - Done.
Moviepy - Writing video 4.2.1_L’utilisation des différentes fonctionnalités - 1_run_20250524_014131\4.2.1_L’utilisation des différentes fonctionnalités - 1-french.mp4



                                                                   

Moviepy - Done !
Moviepy - video ready 4.2.1_L’utilisation des différentes fonctionnalités - 1_run_20250524_014131\4.2.1_L’utilisation des différentes fonctionnalités - 1-french.mp4
🎉 Done! Output video at: 4.2.1_L’utilisation des différentes fonctionnalités - 1_run_20250524_014131\4.2.1_L’utilisation des différentes fonctionnalités - 1-french.mp4


LAST LAST VERSION 424

In [None]:
import os
import re
import ffmpeg
import pysrt
import time
from deep_translator import GoogleTranslator
from pydub import AudioSegment
from moviepy.editor import VideoFileClip, AudioFileClip
from faster_whisper import WhisperModel
from shutil import which
import nest_asyncio
from datetime import datetime
import tempfile
import asyncio
import edge_tts
import aiohttp
import ssl
import random
from pydub.silence import detect_nonsilent
from pydub.silence import detect_silence

# (Assurez-vous que robust_synthesize_phrase est importé et synchrone ici,
#  ou adaptez-le en appelant un TTS synchrone de votre choix.)

nest_asyncio.apply()

# ----- Configuration -----
ffmpeg_path = which("ffmpeg")
if not ffmpeg_path:
    raise RuntimeError("ffmpeg not found. Please install ffmpeg first.")
print(f"✅ ffmpeg found at: {ffmpeg_path}")

input_video = "to translate/4.2.4_Configuration de la solution_Avr_10_Latest.mp4"
base_name = os.path.splitext(os.path.basename(input_video))[0]
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_dir = f"{base_name}_run_{timestamp}"
model_size = "small"
update_existing = True

# For this version we rely on cloud-based Edge TTS.
USE_EDGE_TTS = True

# Files and paths
os.makedirs(output_dir, exist_ok=True)
input_video_name = os.path.splitext(os.path.basename(input_video))[0]
extracted_audio = os.path.join(output_dir, f"{input_video_name}-extracted-audio.wav")
subtitle_file_en = os.path.join(output_dir, f"{input_video_name}-english.srt")
translated_audio = os.path.join(output_dir, f"{input_video_name}-french.wav")
output_video = os.path.join(output_dir, f"{input_video_name}-french.mp4")
review_file = os.path.join(output_dir, "translation_review.txt")
debug_log_file = os.path.join(output_dir, "translation_debug_log.txt")

# ============== Helper Functions (extract_audio, transcribe, etc.) ==============
def extract_audio():
    try:
        (ffmpeg
         .input(input_video)
         .output(extracted_audio, ac=1, ar=16000)
         .overwrite_output()
         .run(capture_stdout=True, capture_stderr=True)
        )
        return extracted_audio
    except ffmpeg.Error as e:
        print("STDOUT:", e.stdout.decode("utf8"))
        print("STDERR:", e.stderr.decode("utf8"))
        raise

def transcribe(audio_path):
    model = WhisperModel(model_size, device="cpu", compute_type="int8")
    segments, info = model.transcribe(audio_path, beam_size=5)
    language = info.language
    print(f"Detected language: {language}")
    transcript_segments = []
    for segment in segments:
        transcript_segments.append({
            "start": segment.start,
            "end": segment.end,
            "text": segment.text.strip()
        })
    return language, transcript_segments

def time_to_subrip(seconds: float) -> pysrt.SubRipTime:
    hours = int(seconds // 3600)
    seconds %= 3600
    minutes = int(seconds // 60)
    seconds %= 60
    milliseconds = int((seconds - int(seconds)) * 1000)
    return pysrt.SubRipTime(hours=hours, minutes=minutes, seconds=int(seconds), milliseconds=milliseconds)

def generate_subtitle_file(segments, output_path):
    subs = pysrt.SubRipFile()
    for i, segment in enumerate(segments, 1):
        sub = pysrt.SubRipItem(
            index=i,
            start=time_to_subrip(segment["start"]),
            end=time_to_subrip(segment["end"]),
            text=segment["text"]
        )
        subs.append(sub)
    subs.save(output_path, encoding="utf-8")
    return output_path

# ============== Translation & Review Functions ==============

def split_long_groups(groups, max_group_duration_secs):
    """
    For each group (list of SubRipItems), if its duration > max_group_duration_secs,
    split it at the *last* subtitle in that group whose text ends in punctuation
    (.,!? or comma) before the duration threshold.
    Falls back to a simple split if no such “safe” break exists.
    """
    new_groups = []
    for group in groups:
        start_s = group[0].start.ordinal / 1000
        end_s   = group[-1].end.ordinal   / 1000
        total   = end_s - start_s

        # if already shorter than threshold, keep it
        if total <= max_group_duration_secs:
            new_groups.append(group)
            continue

        # otherwise walk through, tracking safe_breaks
        temp = []
        temp_start = start_s
        last_safe_idx = None
        for idx, item in enumerate(group):
            temp.append(item)
            # mark this idx if it ends in punctuation or comma
            if re.search(r"[.,!?]$", item.text.strip()):
                last_safe_idx = idx

            current_end = item.end.ordinal / 1000
            if (current_end - temp_start) >= max_group_duration_secs:
                # if we have a safe break before or at idx, split there
                if last_safe_idx is not None:
                    # emit group up through last_safe_idx
                    safe_group = temp[: last_safe_idx+1 ]
                    new_groups.append(safe_group)
                    # restart temp from the items after safe_idx
                    temp = temp[last_safe_idx+1 :]
                    temp_start = temp[0].start.ordinal / 1000 if temp else current_end
                else:
                    # no safe break—just split at current idx
                    new_groups.append(temp)
                    temp = []
                    temp_start = current_end

                # reset safe marker
                last_safe_idx = None

        # anything left over
        if temp:
            new_groups.append(temp)

    return new_groups


def validate_audio_duration(original_segment, translated_audio):
    """Compares original video duration with generated audio"""
    video_dur = original_segment['end'] - original_segment['start']
    audio_dur = translated_audio.duration_seconds
    
    if abs(video_dur - audio_dur) > 0.5:  # 500ms tolerance
        compensation = (video_dur - audio_dur) * 1000  # ms
        if compensation > 0:
            return AudioSegment.silent(duration=compensation)
        else:
            return translated_audio[:int(compensation*1000)]  # ms to samples
    return translated_audio

def generate_phrase_audio(text, voice_speed):
    raw_audio = edge_tts.Communicate(text).audio
    processed = apply_speed_adjustment(raw_audio, voice_speed)
    
    # Detect and preserve natural phrase endings
    non_silent = detect_nonsilent(processed, min_silence_len=50, silence_thresh=-40)
    if non_silent:
        end_pad = 150  # Minimum ending padding
        new_end = max(non_silent[-1][1] + end_pad, len(processed))
        return processed[:new_end]
    return processed


def apply_speed_adjustment(raw_audio, speed_setting):
    speed_factor = 1 + (int(speed_setting.strip('%')) / 100)
    sped_up = raw_audio.speedup(
        playback_speed=speed_factor,
        chunk_size=150,
        crossfade=25
    )
    
    # Calculate duration difference
    original_dur = len(raw_audio)
    new_dur = len(sped_up)
    compensation = original_dur - new_dur
    
    if compensation > 0:
        return sped_up + AudioSegment.silent(duration=compensation)
    return sped_up


def parse_review_overrides(review_file_path):
    text   = open(review_file_path, "r", encoding="utf-8").read()
    # split on any line of 3+ hyphens
    blocks = re.split(r"(?m)^-{3,}\s*$", text)

    overrides = []
    for idx, blk in enumerate(blocks, start=1):
        blk = blk.strip()
        if not blk or blk.startswith("Translation Review File"):
            continue

        # defaults
        ft       = None
        vs       = "+0%"
        pre_ms   = 0.0
        post_ms  = 100.0
        inter_ms = []

        for line in blk.splitlines():
            if line.startswith("**Final Translation:**"):
                ft = line.split("**Final Translation:**",1)[1].strip()
            elif line.startswith("**Voice Speed:**"):
                vs = line.split("**Voice Speed:**",1)[1].strip()
            elif line.startswith("**Pre-Silence:**"):
                try: pre_ms = float(line.split("**Pre-Silence:**",1)[1])
                except: print(f"[Warn] Seg {idx}: bad Pre-Silence")
            elif line.startswith("**Post-Silence:**"):
                try: post_ms = float(line.split("**Post-Silence:**",1)[1])
                except: print(f"[Warn] Seg {idx}: bad Post-Silence")
            elif line.startswith("**Inter-Phrase-Silence:**"):
                            parts = line.split("**Inter-Phrase-Silence:**",1)[1].strip()
                            if parts:
                                try:
                                    # Force negative values to 0 and limit to 5000ms max
                                    raw = [float(x) for x in parts.split(",")]
                                    inter_ms = [ max(0, min(x, 5000)) for x in raw ]
                                except ValueError:
                                    print(f"[Warning] Segment {idx}: invalid Inter-Phrase-Silence list")
                                    inter_ms = []

        if ft is None:
            print(f"[Warn] Seg {idx}: no Final Translation—will use source text.")

        overrides.append({
            "final_translation":      ft,
            "voice_speed":            vs,
            "pre_silence":            pre_ms,
            "post_silence":           post_ms,
            "inter_phrase_silences":  inter_ms
        })

    print("Parsed review overrides:")
    for i,o in enumerate(overrides,1):
        print(f"  Seg {i}: final={'OK' if o['final_translation'] else '<none>'}, "
              f"speed={o['voice_speed']}, pre={o['pre_silence']}ms, post={o['post_silence']}ms, "
              f"inter={o['inter_phrase_silences']}")
    return overrides


def enforce_punctuation_boundaries(groups):
    """Ensure groups end with proper punctuation"""
    i = 0
    safe_punctuation = r"[.!?,;:]$"
    while i < len(groups):
        last_text = groups[i][-1].text.strip()
        if not re.search(safe_punctuation, last_text):
            if i+1 < len(groups):
                groups[i] += groups.pop(i+1)
            else:  # Add artificial pause for final group
                groups[i][-1].text += "."
        else:
            i += 1
    return groups



# ============== Audio Synchronization Functions ==============


def adjust_audio_duration(audio: AudioSegment, target_secs: float) -> AudioSegment:
    """
     Ajuste TTS clip pour qu'il tienne **exactement** dans target_secs :
     - Si l'audio est trop long, on le **tronque**.  
     - S'il est trop court, on ajoute du silence.  
    """
    target_ms = int(target_secs * 1000)
    curr_ms   = len(audio)
    if curr_ms > target_ms:
        # on coupe précisément à la durée allouée
        return audio[:target_ms]
    elif curr_ms < target_ms:
            # on complète par du silence
        return audio + AudioSegment.silent(duration=(target_ms - curr_ms))
    return audio


# ============== French Phrase Alignment Functions ==============
def split_french_phrases(text):
    phrases = re.split(r"(?<=[.!?])\s+(?=[A-Z])", text)
    return [phrase.strip() for phrase in phrases if phrase.strip()]

def calculate_phrase_weights(original_text, translated_phrases):
    fr_phrase_word_counts = [len(phrase.split()) for phrase in translated_phrases]
    total_fr_words = sum(fr_phrase_word_counts)
    if total_fr_words == 0:
        return [1 / len(translated_phrases)] * len(translated_phrases)
    return [count / total_fr_words for count in fr_phrase_word_counts]

# ============== TTS Functions: Edge TTS Only with Debug Logging ==============


def change_playback_speed(sound, speed=1.0):
    new_frame_rate = int(sound.frame_rate * speed)
    altered_sound = sound._spawn(sound.raw_data, overrides={"frame_rate": new_frame_rate})
    return altered_sound.set_frame_rate(sound.frame_rate)

# ============== Updated Async Audio Generation Function ==============


def validate_audio_timing(original_duration, translated_segment):
    total_audio_time = (
        translated_segment["pre_silence"] 
        + sum(translated_segment["inter_phrase_silences"]) 
        + translated_segment["post_silence"] 
        + (translated_segment["audio"].duration_seconds * 1000)
    )
    
    if total_audio_time > original_duration * 1000:
        raise ValueError(f"Audio overflow: {total_audio_time}ms vs {original_duration*1000}ms")
    elif total_audio_time < original_duration * 1000 * 0.95:
        print(f"Warning: Audio underflow by {original_duration*1000 - total_audio_time}ms")



def adjust_review_file_based_on_debug_log(debug_log_path: str, review_file_path: str):
    """
    Pour chaque segment i :
      - Si décal_end est négatif de D ms, on ajoute D ms à post_silence
      - Si décal_start est positif de D ms, on ajoute D ms à pre_silence
    On réécrit ensuite le review_file avec ces nouvelles valeurs.
    """
    # 1) Parse le debug log
    decalages = {}  # idx -> (d_start, d_end)
    pattern = re.compile(r"Segment (\d+).*décal_start=(-?\d+)ms, décal_end=(-?\d+)ms")
    for line in open(debug_log_path, encoding="utf-8"):
        m = pattern.search(line)
        if m:
            idx = int(m.group(1))
            d_start, d_end = int(m.group(2)), int(m.group(3))
            decalages[idx] = (d_start, d_end)

    # 2) Lit tout le review file en mémoire
    text = open(review_file_path, encoding="utf-8").read()
    blocks = re.split(r"(?m)^-{3,}\s*$", text)

    out = []
    for blk in blocks:
        if not blk.strip() or blk.startswith("Translation Review File"):
            out.append(blk)
            continue

        # trouve le segment
        header = re.search(r"Segment\s+(\d+)\s+\(", blk)
        if not header:
            out.append(blk); continue
        idx = int(header.group(1))
        d_start, d_end = decalages.get(idx, (0, 0))

        # remplace les lignes Pre-Silence / Post-Silence
        def repl_pre(m):
            old = float(m.group(1))
            new = max(0.0, old + d_start)
            return f"**Pre-Silence:** {new:.0f}"
        blk = re.sub(r"\*\*Pre-Silence:\*\*\s*([0-9.]+)", repl_pre, blk)

        def repl_post(m):
            old = float(m.group(1))
            # si d_end<0, audio est trop long => il a fallu tronquer => on ne réduit pas post
            # si d_end>0, audio trop court => on ajoute
            new = max(0.0, old + d_end)
            return f"**Post-Silence:** {new:.0f}"
        blk = re.sub(r"\*\*Post-Silence:\*\*\s*([0-9.]+)", repl_post, blk)

        out.append(blk)

    # 3) Réécriture du fichier
    with open(review_file_path, "w", encoding="utf-8") as f:
        f.write("\n---\n".join(out))
    print(f"✅ Review file ajusté selon {debug_log_path}")



def generate_translation_review_file(
    source_path, review_file_path,
    from_lang="en", to_lang="fr",
    max_group_duration_secs: float = 25.0
):
    """
    1) On regroupe et on split/merge les sous-titres exactement
       comme le fera l'audio.
    2) On écrit un review file où l'on affiche :
       - phrase par phrase (la liste exacte via "- ")
       - pre / post silence
       - voice speed
       - start/end offset
       - inter-phrase silences (N–1 valeurs pour N phrases)
       - measured-inter-phrase silences (durées réelles TTS)
    """
    translator = GoogleTranslator(source=from_lang, target=to_lang)
    subs = pysrt.open(source_path)

    # 1) Regrouper par phrase (détection ponctuation en fin de sous-titre)
    sentence_end = re.compile(r"[.!?]\s*$")
    groups, cur = [], []
    for sub in subs:
        cur.append(sub)
        if sentence_end.search(sub.text):
            groups.append(cur); cur = []
    if cur:
        groups.append(cur)

    # 2) Éclatement des groupes trop longs
    def split_long(gs, max_s):
        out = []
        for g in gs:
            start, end = g[0].start.ordinal/1000, g[-1].end.ordinal/1000
            if end - start <= max_s:
                out.append(g)
            else:
                mid = len(g)//2
                out.extend([g[:mid], g[mid:]])
        return out
    groups = split_long(groups, max_group_duration_secs)

    # 3) Forcer ponctuation de fin de groupe
    i = 0
    safe_punct = re.compile(r"[.!?,;:]$")
    while i < len(groups):
        if not safe_punct.search(groups[i][-1].text.strip()):
            if i+1 < len(groups):
                groups[i] += groups.pop(i+1)
                continue
            else:
                groups[i][-1].text += "."
        i += 1

    # 4) Écriture du fichier de review
    with open(review_file_path, "w", encoding="utf-8") as f:
        f.write("Translation Review File\n")
        f.write("Le découpage en phrases ci-dessous est **celui utilisé** en TTS.\n")
        f.write("Ajustez si besoin **Final Translation**, **Voice Speed**, **Pre/Post-Silence**, "
                "**Start-Offset:**, **End-Offset:**, **Inter-Phrase-Silence:**\n")
        f.write("Le champ **Measured-Inter-Phrase-Silence:** indique la pause réelle TTS détectée.\n")
        f.write("mais **ne touchez pas** la liste des phrases (lignes qui commencent par '- ').\n")
        f.write("----------------------------------------------------------------\n\n")

        for idx, group in enumerate(groups, 1):
            start_s = group[0].start.ordinal / 1000
            end_s   = group[-1].end.ordinal   / 1000

            original = " ".join(s.text for s in group)
            auto_tr  = translator.translate(text=original)

            # découpage phrases
            phrases = re.split(r"(?<=[.!?])\s+(?=[A-ZÀÂÉÈÊËÎÏÔŒÙÛÜ])", auto_tr)
            phrases = [p.strip() for p in phrases if p.strip()]

            # silences internes par défaut
            n = len(phrases)
            inter_silences = ",".join("0" for _ in range(max(0, n-1)))

            # valeurs par défaut
            pre_ms, post_ms = 0, 100
            start_offset, end_offset = 0, 0
            voice_speed = "+0%"

            f.write(f"Segment {idx} (start: {start_s:.2f}s, end: {end_s:.2f}s)\n")
            f.write(f"**Original:** {original}\n")
            f.write(f"**Auto Translated:** {auto_tr}\n")
            f.write(f"**Final Translation:** {auto_tr}\n")
            f.write(f"**Voice Speed:** {voice_speed}\n")
            f.write(f"**Pre-Silence:** {pre_ms}\n")
            f.write(f"**Post-Silence:** {post_ms}\n")
            f.write(f"**Start-Offset:** {start_offset}\n")
            f.write(f"**End-Offset:** {end_offset}\n")
            f.write(f"**Inter-Phrase-Silence:** {inter_silences}\n")
            for ph in phrases:
                f.write(f"- {ph}\n")

            # ——— Mesure des silences “naturels” entre phrases ———
            try:
                # 1) Synthèse rapide du segment complet
                tmp_all = os.path.join(tempfile.gettempdir(), f"tts_segment_{idx}.mp3")
                asyncio.run(robust_synthesize_phrase(auto_tr, tmp_all, rate=voice_speed))
                raw = AudioSegment.from_mp3(tmp_all)
                os.remove(tmp_all)

                # 2) Détection des silences ≥ 50 ms (on capte tous)
                raw_sil = detect_silence(
                    raw,
                    min_silence_len=200,
                    silence_thresh=raw.dBFS - 16
                )
                # 3) On ne garde que les N–1 plus longues, dans l’ordre chronologique
                #    pour correspondre aux N phrases
                n_breaks = max(0, len(phrases) - 1)
                # calculer (durée, position) pour chaque pause
                durs_pos = [(end - start, start) for start, end in raw_sil]
                # sélectionner les N–1 plus longues
                longest = sorted(durs_pos, key=lambda x: x[0], reverse=True)[:n_breaks]
                # remettre dans l’ordre temporel
                longest.sort(key=lambda x: x[1])
                measured = [dur for dur, _ in longest]

                f.write(f"**Measured-Inter-Phrase-Silence:** {','.join(str(x) for x in measured)}\n")
            except Exception:
                f.write(f"**Measured-Inter-Phrase-Silence:** \n")
            f.write("\n----------------------------------------------------------------\n\n")

    print(f"✅ Review file créé : {review_file_path} ({len(groups)} segments)")
    input("Tapez 'Y' pour continuer…")



# ============== TTS Functions: Edge TTS Only with Debug Logging ==============
async def robust_synthesize_phrase(
    phrase: str,
    output_path: str,
    voice: str = "fr-FR-DeniseNeural",
    rate: str = "+0%",
    max_retries: int = 10
):
    """
    Synthesize speech using Edge TTS with robust retry logic.
    Detailed debug messages are printed for each attempt.
    """
    for attempt in range(1, max_retries+1):
        try:
            async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=30)) as session:
                communicate = edge_tts.Communicate(
                    text=phrase,
                    voice=voice,
                    rate=rate
                )
                print(f"[Debug] Attempt {attempt}/{max_retries}: Synthesizing phrase: '{phrase[:30]}…'")
                await communicate.save(output_path)
                print(f"[Debug] Phrase synthesized successfully to {output_path}")
                return
        except Exception as e:
            wait_time = 2 ** attempt + random.random()
            print(f"[Error] Attempt {attempt}/{max_retries} failed for phrase: '{phrase[:30]}…'. Exception: {e}")
            if attempt < max_retries:
                print(f"[Debug] Retrying in {wait_time:.1f}s…")
                await asyncio.sleep(wait_time)
    raise RuntimeError(f"Failed to synthesize phrase after {max_retries} attempts: {phrase[:30]}…")

async def synthesize_phrase_edge_hybrid(
    phrase: str,
    output_path: str,
    voice: str = "fr-FR-DeniseNeural",
    rate: str = "+0%"
):
    # Pour compatibilité, on redirige vers le robust_synthesize
    await robust_synthesize_phrase(phrase, output_path, voice, rate)


def merge_short_phrases(phrases, weights, min_chars=40, max_chars=None):
    new_ph, new_wt = [], []
    buf_ph, buf_wt = "", 0.0
    for ph, wt in zip(phrases, weights):
        if not buf_ph:
            buf_ph, buf_wt = ph, wt
        else:
            if len(buf_ph) < min_chars or len(ph) < min_chars:
                cand = buf_ph + " " + ph
                # si pas de max_chars défini, on fusionne sans condition
                cond = True if max_chars is None else (len(cand) <= max_chars)
                if cond:
                    buf_ph = cand
                    buf_wt += wt
                else:
                    new_ph.append(buf_ph)
                    new_wt.append(buf_wt)
                    buf_ph, buf_wt = ph, wt
            else:
                new_ph.append(buf_ph)
                new_wt.append(buf_wt)
                buf_ph, buf_wt = ph, wt
    if buf_ph:
        new_ph.append(buf_ph)
        new_wt.append(buf_wt)
    return new_ph, new_wt


def parse_review_file(review_file_path):
    """
    Lit le review file et renvoie une liste de dicts avec :
      - start_s, end_s, original, final_translation, voice_speed
      - pre_silence, post_silence, start_offset_ms, end_offset_ms
      - phrases (list de phrases) et inter_phrase_silences (liste de silences internes)
    """
    text = open(review_file_path, encoding="utf-8").read()
    blocks = [b.strip() for b in re.split(r"(?m)^-{3,}\s*$", text) if b.strip()]
    segments = []
    header = re.compile(r"Segment\s+\d+\s+\(start:\s*([0-9.]+)s,\s*end:\s*([0-9.]+)s\)")

    for blk in blocks:
        m = header.search(blk)
        if not m or blk.startswith("Translation Review File"): 
            continue
        start_s, end_s = float(m.group(1)), float(m.group(2))

        # valeurs par défaut
        ft, vs = None, "+0%"
        pre, post = 0.0, 0.0
        soffs, eoffs = 0, 0
        phrases = []
        inter = []

        for line in blk.splitlines():
            line = line.strip()
            if line.startswith("**Final Translation:**"):
                ft = line.split("**Final Translation:**",1)[1].strip()
            elif line.startswith("**Voice Speed:**"):
                vs = line.split("**Voice Speed:**",1)[1].strip()
            elif line.startswith("**Pre-Silence:**"):
                pre = float(line.split("**Pre-Silence:**",1)[1])
            elif line.startswith("**Post-Silence:**"):
                post = float(line.split("**Post-Silence:**",1)[1])
            elif line.startswith("**Start-Offset:**"):
                soffs = int(line.split("**Start-Offset:**",1)[1])
            elif line.startswith("**End-Offset:**"):
                eoffs = int(line.split("**End-Offset:**",1)[1])
            elif line.startswith("**Inter-Phrase-Silence:**"):
                parts = line.split("**Inter-Phrase-Silence:**",1)[1].strip()
                if parts:
                    inter = [max(0, int(x)) for x in parts.split(",")]
            elif line.startswith("- "):
                phrases.append(line[2:].strip())

        segments.append({
            "start_s": start_s,
            "end_s": end_s,
            "final_translation": ft or "",
            "voice_speed": vs,
            "pre_silence": pre,
            "post_silence": post,
            "start_offset_ms": soffs,
            "end_offset_ms": eoffs,
            "phrases": phrases,
            "inter_phrase_silences": inter
        })

    print(f"✅ Parsed {len(segments)} segments depuis le review file.")
    return segments


async def async_generate_translated_audio_with_sync_using_review(
    subtitle_source_path, output_audio_path,
    debug_log_path, review_file_path
):
    # 1) (Re)génération systématique du review file
    generate_translation_review_file(
        subtitle_source_path,
        review_file_path,
        max_group_duration_secs=25.0
    )
    print("✅ Review file régénéré, pensez à y reporter vos offsets personnalisés si besoin.")

    # 2) Lecture du review file enrichi
    segments = parse_review_file(review_file_path)

    combined = AudioSegment.silent(duration=0)
    debug    = []

    for idx, seg in enumerate(segments):
        start_s = seg["start_s"]
        end_s   = seg["end_s"]
        total_ms = int((end_s - start_s) * 1000)

        # Récupération des settings
        text    = seg["final_translation"]
        rate    = seg["voice_speed"]
        pre_ms  = int(seg["pre_silence"])
        post_ms = int(seg["post_silence"])
        soff    = seg["start_offset_ms"]
        eoff    = seg["end_offset_ms"]

        # Phrase splitting & TTS
        # phrases = split_french_phrases(text)
        # weights = calculate_phrase_weights(text, phrases)
        # phrases, weights = merge_short_phrases(phrases, weights, min_chars=40)
        
        
        
                # Phrase splitting & TTS
        if idx == 9:
            # Segment 10 : split manuel pour bien isoler “Nous devons simplement…”
            phrases = [
                "Nous allons maintenant passer en revue la sécurité au niveau des données.",
                "Les intersections valides vous permettent de contrôler la saisie en définissant des combinaisons valides de membres.",
                "Nous devons simplement définir ces combinaisons et les enregistrer.",
                "Nous disposons aussi d’une sécurité au niveau cellulaire, qui gère l’accès à chaque cellule selon les dimensions et sélections de membres.",
                "Pour créer et configurer un formulaire de données depuis la page d’accueil, cliquez sur 'Données'.",
                "Sous Actions, choisissez Créer un formulaire ad hoc ou Créer un formulaire.",
                "Je clique sur Créer un formulaire, puis j’entre un nom et sélectionne le cube.",
                "Ensuite, je glisse les dimensions sur les lignes et colonnes, ou sur les pages selon le cas.",
                "J’ajoute les membres requis via l’icône du sélecteur.",
                "Enfin, je peux personnaliser l’affichage et ajouter des menus si nécessaire."
            ]
            # égaliser les poids si on veut un partage uniforme
            weights = [1/len(phrases)] * len(phrases)
            # et forcer toutes les pauses internes à zéro
            seg["inter_phrase_silences"] = [0] * (len(phrases) - 1)
        else:
            phrases = split_french_phrases(text)
            weights = calculate_phrase_weights(text, phrases)
            phrases, weights = merge_short_phrases(phrases, weights, min_chars=40)

        # Budget pour la synthèse seule
        content_ms = max(0, total_ms - pre_ms - post_ms)

        # Synthèse phrase par phrase
        phrase_audios = []
        for i, ph in enumerate(phrases):
            dur_s   = (content_ms * weights[i]) / 1000.0
            tmp_mp3 = os.path.join(tempfile.gettempdir(), f"tmp_{idx}_{i}.mp3")
            await robust_synthesize_phrase(ph, tmp_mp3, rate=rate)
            aud = AudioSegment.from_mp3(tmp_mp3)
            os.remove(tmp_mp3)
            aud = adjust_audio_duration(aud, dur_s)
            
            
            # # —————> Trim uniquement pour le segment 10 (idx==9)
            # if idx == 9:
            #     # détecter les portions non silencieuses (min_silence_len=50ms)
            #     nons = detect_nonsilent(aud, min_silence_len=50, silence_thresh=aud.dBFS - 16)
            #     if nons:
            #         start_trim, end_trim = nons[0][0], nons[-1][1]
            #         aud = aud[start_trim:end_trim]
            
            
            # === debug additionnel pour seg 10 ===
            if idx == 9:
                debug.append(
                    f"  Phrase {i+1}/{len(phrases)}: «{ph[:30]}…» "
                    f"cible={dur_s*1000:.0f}ms, réel={len(aud)}ms\n"
                )                       
            
            phrase_audios.append(aud)

        # Inter-phrase silences
        n_inter = max(0, len(phrases) - 1)
        if seg.get("inter_phrase_silences"):
            inter_applied = seg["inter_phrase_silences"][:n_inter]
            inter_applied += [0] * (n_inter - len(inter_applied))
        else:
            available = content_ms - sum(a.duration_seconds * 1000 for a in phrase_audios)
            sil_ms = (available // n_inter) if (n_inter and available>0) else 0
            inter_applied = [sil_ms] * n_inter

        # Reconstruction du segment
        seg_audio = AudioSegment.silent(duration=pre_ms)
        for i, aud in enumerate(phrase_audios):
            seg_audio += aud
            if i < len(inter_applied):
                seg_audio += AudioSegment.silent(duration=inter_applied[i])
        seg_audio += AudioSegment.silent(duration=post_ms)

        # Offset de fin
        if eoff > 0:
            seg_audio += AudioSegment.silent(duration=eoff)
        elif eoff < 0:
            seg_audio = seg_audio[:eoff]

        # Mesure du décalage pour le debug
        nons2 = detect_nonsilent(seg_audio,
                                 min_silence_len=50,
                                 silence_thresh=seg_audio.dBFS - 16)
        start_a = nons2[0][0] if nons2 else pre_ms
        end_a   = nons2[-1][1] if nons2 else (len(seg_audio)-post_ms)
        abs_s_a = int(start_s * 1000) + start_a
        abs_e_a = int(start_s * 1000) + end_a
        abs_s_v = int(start_s * 1000) + soff
        abs_e_v = int(end_s   * 1000) + eoff
        decal_start = abs_s_a - abs_s_v
        decal_end   = abs_e_a - abs_e_v


        if idx == 9:
            # log des silences insérés
            for j, sil in enumerate(inter_applied):
                debug.append(f"  Silence {j+1}/{len(inter_applied)}: {sil}ms\n")
            debug.append(f"  Segment10 total length: {len(seg_audio)}ms (should be {total_ms}ms)\n")


        # Placement sur la timeline
        start_ms = int(start_s * 1000) + soff
        if len(combined) < start_ms:
            combined += AudioSegment.silent(duration=start_ms - len(combined))
        elif len(combined) > start_ms and soff < 0:
            combined = combined[:start_ms]
        combined += seg_audio

        # Enregistrement pour debug
        debug.append(
            f"Segment {idx+1} ({start_s:.2f}-{end_s:.2f}s): "
            f"pre={pre_ms}ms, post={post_ms}ms, speed={rate}, "
            f"inter={inter_applied}, "
            f"décal_start={decal_start}ms, décal_end={decal_end}ms\n"
        )

    # Export debug & wav
    with open(debug_log_path, "w", encoding="utf-8") as df:
        df.write("Translation Debug Log\n\n")
        df.writelines(debug)
    combined.export(output_audio_path, format="wav")

    return output_audio_path


async def async_generate_translated_audio_with_sync_using_reviewLASTMAISPASOK(
    subtitle_source_path, output_audio_path,
    debug_log_path, review_file_path
):
    # 1) Génération du review file si pas existant ou si on veut l'écraser
    if not os.path.exists(review_file_path) or update_existing:
        generate_translation_review_file(
            subtitle_source_path,
            review_file_path,
            max_group_duration_secs=25.0
        )
        print("✅ Review file créé ou mis à jour.")
    else:
        print("ℹ️ Review file existant conservé (update_existing=False).")

    # 2) Lecture du review file enrichi
    segments = parse_review_file(review_file_path)

    # 3) Override post-parse pour le segment 10 (idx 9)
    if len(segments) > 9:
        segments[9]["start_offset_ms"] = segments[9].get("start_offset_ms", -6500)
        print(f"⚙️ Segment 10 : start_offset_ms fixé à {segments[9]['start_offset_ms']}ms")

    combined = AudioSegment.silent(duration=0)
    debug    = []

    # 4) Boucle de génération audio
    for idx, seg in enumerate(segments):
        start_s = seg["start_s"]
        end_s   = seg["end_s"]
        total_ms = int((end_s - start_s) * 1000)

        # Réglages utilisateur
        text    = seg["final_translation"]
        rate    = seg["voice_speed"]
        pre_ms  = int(seg["pre_silence"])
        post_ms = int(seg["post_silence"])
        soff    = seg["start_offset_ms"]
        eoff    = seg["end_offset_ms"]

        # Phrase splitting & TTS
        if idx == 9:
            phrases = [
                "Nous allons maintenant regarder la sécurité au niveau des données.",
                "Les intersections valides peuvent vous aider à contrôler la saisie des données en définissant des combinaisons valides de membres de la dimension, pour éviter les entrées de données non valides.",
                "Nous devons simplement définir les combinaisons et l'enregistrer.",
                "Nous avons également une sécurité au niveau des cellules, qui contrôle l'accès aux cellules individuelles en fonction des dimensions et des sélections de membres.",
                "Nous pouvons créer une nouvelle sécurité ou faire une importation.",
                "Voyons comment créer et configurer un formulaire de données.",
                "À partir de la page d'accueil, cliquez sur la carte Données.",
                "Sous actions, nous pouvons créer un ad hoc ou créer un formulaire.",
                "Je vais cliquer sur Créer un formulaire.",
                "Nous devons entrer le nom.",
                "Nous devrons sélectionner le cube.",
                "Nous aurons toutes les dimensions ici.",
                "Nous devons faire glisser les dimensions sur les lignes et colonnes respectives.",
                "Nous pouvons également avoir des dimensions sur les pages.",
                "Nous devrons sélectionner les membres en conséquence.",
                "Cliquer sur l'icône me donnera le sélecteur de membres et je peux ajouter le membre requis.",
                "J'ai la possibilité de personnaliser en fonction de la façon dont je veux voir les données si j'ai besoin de menus."
            ]
            weights = [1/len(phrases)] * len(phrases)
            seg["inter_phrase_silences"] = seg.get(
                "inter_phrase_silences",
                [0] * (len(phrases) - 1)
            )
        else:
            phrases = split_french_phrases(text)
            weights = calculate_phrase_weights(text, phrases)
            phrases, weights = merge_short_phrases(phrases, weights, min_chars=40)

        # Budget TTS
        content_ms = max(0, total_ms - pre_ms - post_ms)

        # Synthèse phrase par phrase
        phrase_audios = []
        for i, ph in enumerate(phrases):
            dur_s   = (content_ms * weights[i]) / 1000.0
            tmp_mp3 = os.path.join(tempfile.gettempdir(), f"tmp_{idx}_{i}.mp3")
            await robust_synthesize_phrase(ph, tmp_mp3, rate=rate)
            aud = AudioSegment.from_mp3(tmp_mp3)
            os.remove(tmp_mp3)
            aud = adjust_audio_duration(aud, dur_s)

            if idx == 9:
                debug.append(
                    f"  Phrase {i+1}/{len(phrases)}: «{ph[:30]}…» "
                    f"cible={dur_s*1000:.0f}ms, réel={len(aud)}ms\n"
                )
            phrase_audios.append(aud)

        # Silences inter-phrase
        n_inter = max(0, len(phrases) - 1)
        if seg.get("inter_phrase_silences"):
            inter_applied = seg["inter_phrase_silences"][:n_inter]
            inter_applied += [0] * (n_inter - len(inter_applied))
        else:
            available = content_ms - sum(a.duration_seconds * 1000 for a in phrase_audios)
            sil_ms = (available // n_inter) if (n_inter and available > 0) else 0
            inter_applied = [sil_ms] * n_inter

        # Reconstruction du segment
        seg_audio = AudioSegment.silent(duration=pre_ms)
        for i, aud in enumerate(phrase_audios):
            seg_audio += aud
            if i < len(inter_applied):
                seg_audio += AudioSegment.silent(duration=inter_applied[i])
        seg_audio += AudioSegment.silent(duration=post_ms)

        # Offset de fin
        if eoff > 0:
            seg_audio += AudioSegment.silent(duration=eoff)
        elif eoff < 0:
            seg_audio = seg_audio[:eoff]

        # Mesure du décalage pour le debug
        nons2 = detect_nonsilent(
            seg_audio,
            min_silence_len=50,
            silence_thresh=seg_audio.dBFS - 16
        )
        start_a = nons2[0][0] if nons2 else pre_ms
        end_a   = nons2[-1][1] if nons2 else (len(seg_audio) - post_ms)
        abs_s_a = int(start_s * 1000) + start_a
        abs_e_a = int(start_s * 1000) + end_a
        abs_s_v = int(start_s * 1000) + soff
        abs_e_v = int(end_s   * 1000) + eoff
        decal_start = abs_s_a - abs_s_v
        decal_end   = abs_e_a - abs_e_v

        if idx == 9:
            for j, sil in enumerate(inter_applied):
                debug.append(f"  Silence {j+1}/{len(inter_applied)}: {sil}ms\n")
            debug.append(f"  Segment10 total length: {len(seg_audio)}ms (should be {total_ms}ms)\n")

        # Placement sur la timeline (avec start_offset corrigé)
        start_ms = int(start_s * 1000) + soff
        gap = start_ms - len(combined)
        combined += AudioSegment.silent(duration=max(0, gap))
        combined += seg_audio
        

        # Enregistrement debug global
        debug.append(
            f"Segment {idx+1} ({start_s:.2f}-{end_s:.2f}s): "
            f"pre={pre_ms}ms, post={post_ms}ms, speed={rate}, "
            f"inter={inter_applied}, "
            f"décal_start={decal_start}ms, décal_end={decal_end}ms\n"
        )

    # 5) Export debug & wav
    with open(debug_log_path, "w", encoding="utf-8") as df:
        df.write("Translation Debug Log\n\n")
        df.writelines(debug)
    combined.export(output_audio_path, format="wav")

    return output_audio_path

# ============== Merge Audio and Video Function ==============
def merge_audio_video():
    video = VideoFileClip(input_video)
    audio = AudioFileClip(translated_audio)
    if audio.duration < video.duration:
        extra_silence = AudioSegment.silent(duration=(video.duration - audio.duration) * 1000)
        audio_path_temp = os.path.join(output_dir, "temp_full_audio.wav")
        audio_seg = AudioSegment.from_file(translated_audio, format="wav")
        full_audio = audio_seg + extra_silence
        full_audio.export(audio_path_temp, format="wav")
        audio = AudioFileClip(audio_path_temp)
    video = video.set_audio(audio)
    video.write_videofile(
        output_video,
        codec="libx264",
        audio_codec="aac",
        temp_audiofile="temp-audio.m4a",
        remove_temp=True,
        threads=4
    )

# ============== Main Asynchronous Flow ==============
async def async_main():
    print("Extracting audio...")
    audio_path = extract_audio()
    print("Transcribing audio...")
    language, segments = transcribe(audio_path)
    print("Generating English subtitles...")
    generate_subtitle_file(segments, subtitle_file_en)
    print("Generating French audio with synchronization and manual overrides...")
    await async_generate_translated_audio_with_sync_using_review(subtitle_file_en, translated_audio, debug_log_file, review_file)
    print("Merging audio and video...")
    merge_audio_video()
    print(f"Process completed! Output video: {output_video}")

if __name__ == "__main__":
    asyncio.run(async_main())




✅ ffmpeg found at: C:\ffmpeg\bin\ffmpeg.EXE
Extracting audio...
Transcribing audio...
Detected language: en
Generating English subtitles...
Generating French audio with synchronization and manual overrides...
[Debug] Attempt 1/10: Synthesizing phrase: 'Je vais jeter un œil aux confi…'
[Debug] Phrase synthesized successfully to C:\Users\061181~1\AppData\Local\Temp\tts_segment_1.mp3
[Debug] Attempt 1/10: Synthesizing phrase: 'Actuellement, je vais ouvrir u…'
[Debug] Phrase synthesized successfully to C:\Users\061181~1\AppData\Local\Temp\tts_segment_2.mp3
[Debug] Attempt 1/10: Synthesizing phrase: 'J'ai déjà un formulaire ici et…'
[Debug] Phrase synthesized successfully to C:\Users\061181~1\AppData\Local\Temp\tts_segment_3.mp3
[Debug] Attempt 1/10: Synthesizing phrase: 'Nous pouvons faire lancer ce l…'
[Debug] Phrase synthesized successfully to C:\Users\061181~1\AppData\Local\Temp\tts_segment_4.mp3
[Debug] Attempt 1/10: Synthesizing phrase: 'Vous voyez comment les données…'
[Debug] Phrase

                                                                        

MoviePy - Done.
Moviepy - Writing video 4.2.4_Configuration de la solution_Avr_10_Latest_run_20250512_222557\4.2.4_Configuration de la solution_Avr_10_Latest-french.mp4



                                                                   

Moviepy - Done !
Moviepy - video ready 4.2.4_Configuration de la solution_Avr_10_Latest_run_20250512_222557\4.2.4_Configuration de la solution_Avr_10_Latest-french.mp4
Process completed! Output video: 4.2.4_Configuration de la solution_Avr_10_Latest_run_20250512_222557\4.2.4_Configuration de la solution_Avr_10_Latest-french.mp4


In [None]:
import os
import re
import ffmpeg
import pysrt
import time
from deep_translator import GoogleTranslator
from pydub import AudioSegment
from moviepy.editor import VideoFileClip, AudioFileClip
from faster_whisper import WhisperModel
from shutil import which
import nest_asyncio
from datetime import datetime
import tempfile
import asyncio
import edge_tts
import aiohttp
import ssl
import random
from pydub.silence import detect_nonsilent
import spacy


nest_asyncio.apply()

# ----- Configuration -----
ffmpeg_path = which("ffmpeg")
if not ffmpeg_path:
    raise RuntimeError("ffmpeg not found. Please install ffmpeg first.")
print(f"✅ ffmpeg found at: {ffmpeg_path}")

input_video = "to translate/4.2.4_Configuration de la solution_Avr_10_Latest.mp4"
base_name = os.path.splitext(os.path.basename(input_video))[0]
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_dir = f"{base_name}_run_{timestamp}"
model_size = "small"
update_existing = True

# For this version we rely on cloud-based Edge TTS.
USE_EDGE_TTS = True

# Files and paths
os.makedirs(output_dir, exist_ok=True)
input_video_name = os.path.splitext(os.path.basename(input_video))[0]
extracted_audio = os.path.join(output_dir, f"{input_video_name}-extracted-audio.wav")
subtitle_file_en = os.path.join(output_dir, f"{input_video_name}-english.srt")
translated_audio = os.path.join(output_dir, f"{input_video_name}-french.wav")
output_video = os.path.join(output_dir, f"{input_video_name}-french.mp4")
review_file = os.path.join(output_dir, "translation_review.txt")
debug_log_file = os.path.join(output_dir, "translation_debug_log.txt")

# ============== Helper Functions (extract_audio, transcribe, etc.) ==============
def extract_audio():
    try:
        (ffmpeg
         .input(input_video)
         .output(extracted_audio, ac=1, ar=16000)
         .overwrite_output()
         .run(capture_stdout=True, capture_stderr=True)
        )
        return extracted_audio
    except ffmpeg.Error as e:
        print("STDOUT:", e.stdout.decode("utf8"))
        print("STDERR:", e.stderr.decode("utf8"))
        raise

def transcribe(audio_path):
    model = WhisperModel(model_size, device="cpu", compute_type="int8")
    segments, info = model.transcribe(audio_path, beam_size=5)
    language = info.language
    print(f"Detected language: {language}")
    transcript_segments = []
    for segment in segments:
        transcript_segments.append({
            "start": segment.start,
            "end": segment.end,
            "text": segment.text.strip()
        })
    return language, transcript_segments

def time_to_subrip(seconds: float) -> pysrt.SubRipTime:
    hours = int(seconds // 3600)
    seconds %= 3600
    minutes = int(seconds // 60)
    seconds %= 60
    milliseconds = int((seconds - int(seconds)) * 1000)
    return pysrt.SubRipTime(hours=hours, minutes=minutes, seconds=int(seconds), milliseconds=milliseconds)

def generate_subtitle_file(segments, output_path):
    subs = pysrt.SubRipFile()
    for i, segment in enumerate(segments, 1):
        sub = pysrt.SubRipItem(
            index=i,
            start=time_to_subrip(segment["start"]),
            end=time_to_subrip(segment["end"]),
            text=segment["text"]
        )
        subs.append(sub)
    subs.save(output_path, encoding="utf-8")
    return output_path

# ============== Translation & Review Functions ==============

def split_long_groups(groups, max_group_duration_secs):
    """
    For each group (list of SubRipItems), if its duration > max_group_duration_secs,
    split it at the *last* subtitle in that group whose text ends in punctuation
    (.,!? or comma) before the duration threshold.
    Falls back to a simple split if no such “safe” break exists.
    """
    new_groups = []
    for group in groups:
        start_s = group[0].start.ordinal / 1000
        end_s   = group[-1].end.ordinal   / 1000
        total   = end_s - start_s

        # if already shorter than threshold, keep it
        if total <= max_group_duration_secs:
            new_groups.append(group)
            continue

        # otherwise walk through, tracking safe_breaks
        temp = []
        temp_start = start_s
        last_safe_idx = None
        for idx, item in enumerate(group):
            temp.append(item)
            # mark this idx if it ends in punctuation or comma
            if re.search(r"[.,!?]$", item.text.strip()):
                last_safe_idx = idx

            current_end = item.end.ordinal / 1000
            if (current_end - temp_start) >= max_group_duration_secs:
                # if we have a safe break before or at idx, split there
                if last_safe_idx is not None:
                    # emit group up through last_safe_idx
                    safe_group = temp[: last_safe_idx+1 ]
                    new_groups.append(safe_group)
                    # restart temp from the items after safe_idx
                    temp = temp[last_safe_idx+1 :]
                    temp_start = temp[0].start.ordinal / 1000 if temp else current_end
                else:
                    # no safe break—just split at current idx
                    new_groups.append(temp)
                    temp = []
                    temp_start = current_end

                # reset safe marker
                last_safe_idx = None

        # anything left over
        if temp:
            new_groups.append(temp)

    return new_groups


def validate_audio_duration(original_segment, translated_audio):
    """Compares original video duration with generated audio"""
    video_dur = original_segment['end'] - original_segment['start']
    audio_dur = translated_audio.duration_seconds
    
    if abs(video_dur - audio_dur) > 0.5:  # 500ms tolerance
        compensation = (video_dur - audio_dur) * 1000  # ms
        if compensation > 0:
            return AudioSegment.silent(duration=compensation)
        else:
            return translated_audio[:int(compensation*1000)]  # ms to samples
    return translated_audio

def generate_phrase_audio(text, voice_speed):
    raw_audio = edge_tts.Communicate(text).audio
    processed = apply_speed_adjustment(raw_audio, voice_speed)
    
    # Detect and preserve natural phrase endings
    non_silent = detect_nonsilent(processed, min_silence_len=50, silence_thresh=-40)
    if non_silent:
        end_pad = 150  # Minimum ending padding
        new_end = max(non_silent[-1][1] + end_pad, len(processed))
        return processed[:new_end]
    return processed


def apply_speed_adjustment(raw_audio, speed_setting):
    speed_factor = 1 + (int(speed_setting.strip('%')) / 100)
    sped_up = raw_audio.speedup(
        playback_speed=speed_factor,
        chunk_size=150,
        crossfade=25
    )
    
    # Calculate duration difference
    original_dur = len(raw_audio)
    new_dur = len(sped_up)
    compensation = original_dur - new_dur
    
    if compensation > 0:
        return sped_up + AudioSegment.silent(duration=compensation)
    return sped_up



def parse_review_overrides(review_file_path):
    text   = open(review_file_path, "r", encoding="utf-8").read()
    # split on any line of 3+ hyphens
    blocks = re.split(r"(?m)^-{3,}\s*$", text)

    overrides = []
    for idx, blk in enumerate(blocks, start=1):
        blk = blk.strip()
        if not blk or blk.startswith("Translation Review File"):
            continue

        # defaults
        ft       = None
        vs       = "  0%"
        pre_ms   = 0.0
        post_ms  = 100.0
        inter_ms = []

        for line in blk.splitlines():
            if line.startswith("**Final Translation:**"):
                ft = line.split("**Final Translation:**",1)[1].strip()
            elif line.startswith("**Voice Speed:**"):
                vs = line.split("**Voice Speed:**",1)[1].strip()
            elif line.startswith("**Pre-Silence:**"):
                try: pre_ms = float(line.split("**Pre-Silence:**",1)[1])
                except: print(f"[Warn] Seg {idx}: bad Pre-Silence")
            elif line.startswith("**Post-Silence:**"):
                try: post_ms = float(line.split("**Post-Silence:**",1)[1])
                except: print(f"[Warn] Seg {idx}: bad Post-Silence")
            elif line.startswith("**Inter-Phrase-Silence:**"):
                            parts = line.split("**Inter-Phrase-Silence:**",1)[1].strip()
                            if parts:
                                try:
                                    # Force negative values to 0 and limit to 5000ms max
                                    raw = [float(x) for x in parts.split(",")]
                                    inter_ms = [ max(0, min(x, 5000)) for x in raw ]
                                except ValueError:
                                    print(f"[Warning] Segment {idx}: invalid Inter-Phrase-Silence list")
                                    inter_ms = []

        if ft is None:
            print(f"[Warn] Seg {idx}: no Final Translation—will use source text.")

        overrides.append({
            "final_translation":      ft,
            "voice_speed":            vs,
            "pre_silence":            pre_ms,
            "post_silence":           post_ms,
            "inter_phrase_silences":  inter_ms
        })

    print("Parsed review overrides:")
    for i,o in enumerate(overrides,1):
        print(f"  Seg {i}: final={'OK' if o['final_translation'] else '<none>'}, "
              f"speed={o['voice_speed']}, pre={o['pre_silence']}ms, post={o['post_silence']}ms, "
              f"inter={o['inter_phrase_silences']}")
    return overrides


def enforce_punctuation_boundaries(groups):
    """Ensure groups end with proper punctuation"""
    i = 0
    safe_punctuation = r"[.!?,;:]$"
    while i < len(groups):
        last_text = groups[i][-1].text.strip()
        if not re.search(safe_punctuation, last_text):
            if i  1 < len(groups):
                groups[i] += groups.pop(i+1)
            else:  # Add artificial pause for final group
                groups[i][-1].text += "."
        else:
            i += 1
    return groups



# ============== Audio Synchronization Functions ==============


def adjust_audio_duration(audio: AudioSegment, target_secs: float) -> AudioSegment:
    """
     Ajuste TTS clip pour qu'il tienne **exactement** dans target_secs :
     - Si l'audio est trop long, on le **tronque**.  
     - S'il est trop court, on ajoute du silence.  
    """
    target_ms = int(target_secs * 1000)
    curr_ms   = len(audio)
    if curr_ms > target_ms:
        # on coupe précisément à la durée allouée
        return audio[:target_ms]
    elif curr_ms < target_ms:
            # on complète par du silence
        return audio + AudioSegment.silent(duration=(target_ms - curr_ms))
    return audio


# ============== French Phrase Alignment Functions ==============
def split_french_phrases(text):
    phrases = re.split(r"(?<=[.!?])\s+(?=[A-Z])", text)
    return [phrase.strip() for phrase in phrases if phrase.strip()]

def calculate_phrase_weights(original_text, translated_phrases):
    fr_phrase_word_counts = [len(phrase.split()) for phrase in translated_phrases]
    total_fr_words = sum(fr_phrase_word_counts)
    if total_fr_words == 0:
        return [1 / len(translated_phrases)] * len(translated_phrases)
    return [count / total_fr_words for count in fr_phrase_word_counts]

# ============== TTS Functions: Edge TTS Only with Debug Logging ==============


def change_playback_speed(sound, speed=1.0):
    new_frame_rate = int(sound.frame_rate * speed)
    altered_sound = sound._spawn(sound.raw_data, overrides={"frame_rate": new_frame_rate})
    return altered_sound.set_frame_rate(sound.frame_rate)

# ============== Updated Async Audio Generation Function ==============


def validate_audio_timing(original_duration, translated_segment):
    total_audio_time = (
        translated_segment["pre_silence"] 
        + sum(translated_segment["inter_phrase_silences"]) 
        + translated_segment["post_silence"] 
        + (translated_segment["audio"].duration_seconds * 1000)
    )
    
    if total_audio_time > original_duration * 1000:
        raise ValueError(f"Audio overflow: {total_audio_time}ms vs {original_duration*1000}ms")
    elif total_audio_time < original_duration * 1000 * 0.95:
        print(f"Warning: Audio underflow by {original_duration*1000 - total_audio_time}ms")



def adjust_review_file_based_on_debug_log(debug_log_path: str, review_file_path: str):
    """
    Pour chaque segment i :
      - Si décal_end est négatif de D ms, on ajoute D ms à post_silence
      - Si décal_start est positif de D ms, on ajoute D ms à pre_silence
    On réécrit ensuite le review_file avec ces nouvelles valeurs.
    """
    # 1) Parse le debug log
    decalages = {}  # idx -> (d_start, d_end)
    pattern = re.compile(r"Segment (\d+).*décal_start=(-?\d+)ms, décal_end=(-?\d+)ms")
    for line in open(debug_log_path, encoding="utf-8"):
        m = pattern.search(line)
        if m:
            idx = int(m.group(1))
            d_start, d_end = int(m.group(2)), int(m.group(3))
            decalages[idx] = (d_start, d_end)

    # 2) Lit tout le review file en mémoire
    text = open(review_file_path, encoding="utf-8").read()
    blocks = re.split(r"(?m)^-{3,}\s*$", text)

    out = []
    for blk in blocks:
        if not blk.strip() or blk.startswith("Translation Review File"):
            out.append(blk)
            continue

        # trouve le segment
        header = re.search(r"Segment\s+(\d+)\s+\(", blk)
        if not header:
            out.append(blk); continue
        idx = int(header.group(1))
        d_start, d_end = decalages.get(idx, (0, 0))

        # remplace les lignes Pre-Silence / Post-Silence
        def repl_pre(m):
            old = float(m.group(1))
            new = max(0.0, old + d_start)
            return f"**Pre-Silence:** {new:.0f}"
        blk = re.sub(r"\*\*Pre-Silence:\*\*\s*([0-9.]+)", repl_pre, blk)

        def repl_post(m):
            old = float(m.group(1))
            # si d_end<0, audio est trop long => il a fallu tronquer => on ne réduit pas post
            # si d_end>0, audio trop court => on ajoute
            new = max(0.0, old + d_end)
            return f"**Post-Silence:** {new:.0f}"
        blk = re.sub(r"\*\*Post-Silence:\*\*\s*([0-9.]+)", repl_post, blk)

        out.append(blk)

    # 3) Réécriture du fichier
    with open(review_file_path, "w", encoding="utf-8") as f:
        f.write("\n---\n".join(out))
    print(f"✅ Review file ajusté selon {debug_log_path}")





def generate_review_from_srt_grouped(
    source_srt, review_file_path,
    from_lang="en", to_lang="fr"
):
    import pysrt, re
    from deep_translator import GoogleTranslator
    import spacy

    subs = pysrt.open(source_srt, encoding='utf-8')
    translator = GoogleTranslator(source=from_lang, target=to_lang)
    nlp = spacy.load("fr_core_news_sm")

    sentence_end = re.compile(r"[.!?]\s*$")

    buffer_text = []
    buffer_start = None
    buffer_end   = None
    segments = []

    for sub in subs:
        # init buffer start time
        if buffer_start is None:
            buffer_start = sub.start.ordinal / 1000
        # always extend buffer end time
        buffer_end = sub.end.ordinal / 1000
        buffer_text.append(sub.text.replace("\n", " ").strip())

        joined = " ".join(buffer_text)
        # si on termine sur un .,? ou !
        if sentence_end.search(joined):
            segments.append((buffer_start, buffer_end, joined))
            buffer_text = []
            buffer_start = None
            buffer_end   = None

    # s’il reste du texte sans ponctuation finale, on le traite aussi
    if buffer_text:
        joined = " ".join(buffer_text)
        segments.append((buffer_start, buffer_end, joined))

    # écriture du review-file
    with open(review_file_path, "w", encoding="utf-8") as f:
        f.write("Translation Review File (groupé par phrases complètes)\n")
        f.write("Chaque segment s’arrête sur ., ? ou !\n")
        f.write("----------------------------------------------------------------\n\n")

        for idx, (start_s, end_s, original_en) in enumerate(segments, 1):
            try:
                auto_tr = translator.translate(text=original_en)
            except:
                auto_tr = "[ERREUR DE TRADUCTION]"

            doc = nlp(auto_tr)
            phrases = [sent.text.strip() for sent in doc.sents if sent.text.strip()]
            inter_ms = [0]*max(0, len(phrases)-1)
            budget   = int((end_s - start_s)*1000)

            f.write(f"Segment {idx} (start: {start_s:.2f}s, end: {end_s:.2f}s)\n")
            f.write(f"**Original (EN):** {original_en}\n")
            f.write(f"**Auto Translated:** {auto_tr}\n")
            f.write(f"**Final Translation:** {auto_tr}\n")
            # f.write("**Phrases (ne modifiez pas) :**\n")
            # for ph in phrases:
            #     f.write(f"- {ph}\n")
            f.write("**Pre-Silence:** 0\n")
            f.write("**Post-Silence:** 100\n")
            f.write(f"**Inter-Phrase-Silence:** {','.join(str(x) for x in inter_ms)}\n")
            f.write("**Start-Offset:** 0\n")
            f.write("**End-Offset:** 0\n")
            f.write(f"**Budget (ms):** {budget}\n")
            f.write("----------------------------------------------------------------\n\n")

    print(f"✅ Review file généré : {review_file_path} ({len(segments)} segments)")
    input("⚠️ Relisez et corrigez le fichier puis appuyez sur Entrée…")



def parse_review_file(review_file_path):
    text = open(review_file_path, encoding="utf-8").read()
    blocks = [b.strip() for b in re.split(r"(?m)^-{3,}\s*$", text) if b.strip()]
    segments = []
    header = re.compile(r"Segment\s+\d+\s+\(start:\s*([0-9.]+)s,\s*end:\s*([0-9.]+)s\)")
    for blk in blocks:
        if blk.startswith("Translation Review File"): continue
        m = header.search(blk)
        if not m: continue
        start_s, end_s = float(m.group(1)), float(m.group(2))
        # defaults
        ft = vs = None
        pre = post = start_offset = end_offset = 0.0
        phrases = []
        silence_avant = []
        # 1) première passe : Original, Final, Phrases, Pre/Post, Offsets
        for line in blk.splitlines():
            line = line.strip()
            if line.startswith("**Final Translation:**"):
                ft = line.split("**Final Translation:**",1)[1].strip()
            elif line.startswith("**Phrases"):
                continue  # skip header
            elif line.startswith("- ") and not silence_avant:
                phrases.append(line[2:].strip())
            elif line.startswith("**Pre-Silence:**"):
                pre = float(line.split("**Pre-Silence:**",1)[1])
            elif line.startswith("**Post-Silence:**"):
                post = float(line.split("**Post-Silence:**",1)[1])
            elif line.startswith("**Start-Offset:**"):
                start_offset = float(line.split("**Start-Offset:**",1)[1])
            elif line.startswith("**End-Offset:**"):
                end_offset = float(line.split("**End-Offset:**",1)[1])
        # init silence_avant à zéro pour chaque phrase
        silence_avant = [0.0] * len(phrases)
        # 2) lecture du bloc *après* avoir capturé les phrases
        state = None
        for line in blk.splitlines():
            line = line.strip()
            if line.startswith("**Silence-Avant:**"):
                state = "silence"
                continue
            if state == "silence" and line.startswith("- "):
                text_ph, ms = line[2:].rsplit(":",1)
                text_ph = text_ph.strip()
                ms      = float(ms.strip())
                if text_ph in phrases:
                    idx = phrases.index(text_ph)
                    silence_avant[idx] = ms
            if line == "":
                state = None

        segments.append({
            "start_s": start_s,
            "end_s":   end_s,
            "final_translation": ft,
            "pre_silence":       pre,
            "post_silence":      post,
            "start_offset_ms":   start_offset,
            "end_offset_ms":     end_offset,
            "phrases":           phrases,
            "silence_avant_ms":  silence_avant
        })
    return segments

# ============== TTS Functions: Edge TTS Only with Debug Logging ==============
async def robust_synthesize_phrase(
    phrase: str,
    output_path: str,
    voice: str = "fr-FR-DeniseNeural",
    rate: str = "+0%",
    max_retries: int = 10
):
    """
    Synthesize speech using Edge TTS with robust retry logic.
    Detailed debug messages are printed for each attempt.
    """
    for attempt in range(1, max_retries+1):
        try:
            async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=30)) as session:
                communicate = edge_tts.Communicate(
                    text=phrase,
                    voice=voice,
                    rate=rate
                )
                print(f"[Debug] Attempt {attempt}/{max_retries}: Synthesizing phrase: '{phrase[:30]}…'")
                await communicate.save(output_path)
                print(f"[Debug] Phrase synthesized successfully to {output_path}")
                return
        except Exception as e:
            wait_time = 2 ** attempt + random.random()
            print(f"[Error] Attempt {attempt}/{max_retries} failed for phrase: '{phrase[:30]}…'. Exception: {e}")
            if attempt < max_retries:
                print(f"[Debug] Retrying in {wait_time:.1f}s…")
                await asyncio.sleep(wait_time)
    raise RuntimeError(f"Failed to synthesize phrase after {max_retries} attempts: {phrase[:30]}…")

async def synthesize_phrase_edge_hybrid(
    phrase: str,
    output_path: str,
    voice: str = "fr-FR-DeniseNeural",
    rate: str = "+0%"
):
    # Pour compatibilité, on redirige vers le robust_synthesize
    await robust_synthesize_phrase(phrase, output_path, voice, rate)


def merge_short_phrases(phrases, weights, min_chars=40, max_chars=None):
    new_ph, new_wt = [], []
    buf_ph, buf_wt = "", 0.0
    for ph, wt in zip(phrases, weights):
        if not buf_ph:
            buf_ph, buf_wt = ph, wt
        else:
            if len(buf_ph) < min_chars or len(ph) < min_chars:
                cand = buf_ph + " " + ph
                # si pas de max_chars défini, on fusionne sans condition
                cond = True if max_chars is None else (len(cand) <= max_chars)
                if cond:
                    buf_ph = cand
                    buf_wt += wt
                else:
                    new_ph.append(buf_ph)
                    new_wt.append(buf_wt)
                    buf_ph, buf_wt = ph, wt
            else:
                new_ph.append(buf_ph)
                new_wt.append(buf_wt)
                buf_ph, buf_wt = ph, wt
    if buf_ph:
        new_ph.append(buf_ph)
        new_wt.append(buf_wt)
    return new_ph, new_wt



def split_long_phrasesaaa(phrases, max_chars=80):
    new = []
    for p in phrases:
        if len(p) > max_chars:
            # on découpe au premier “,” ou “ et ” qu’on trouve
            parts = re.split(r",\s+| et ", p, maxsplit=1)
            new.extend([parts[0].strip(), parts[1].strip()] if len(parts)==2 else [p])
        else:
            new.append(p)
    return new

async def async_generate_translated_audio_with_sync_using_review(
    subtitle_file_en,
    output_audio_path,
    debug_log_path,
    review_file_path
):
    # ─── 1) Génération / mise à jour du review-file à partir du .srt ───
    generate_review_from_srt_grouped(
        source_srt       = subtitle_file_en,
        review_file_path = review_file_path
    )

    # ─── 2) Lecture du review-file enrichi ───
    segments = parse_review_file(review_file_path)

    # ─── 2b) Injecter automatiquement les start_offset à partir du end_offset précédent ───
    for i in range(1, len(segments)):
        prev = segments[i-1]
        segments[i]['start_offset_ms'] = - prev.get('end_offset_ms', 0)

    combined = AudioSegment.silent(duration=0)
    debug    = []

    for idx, seg in enumerate(segments):
        # ─── 3) Récupérations de base ───
        start_s   = seg["start_s"]
        end_s     = seg["end_s"]
        total_ms  = int((end_s - start_s) * 1000)
        text      = seg["final_translation"]
        rate      = seg["voice_speed"]
        pre_ms    = seg["pre_silence"]
        post_ms   = seg["post_silence"]
        soff      = seg.get("start_offset_ms", 0)
        eoff      = seg.get("end_offset_ms",   0)

        # ─── 4) Découpage en phrases FR d’après le texte final ───
        phrases = split_french_phrases(text)
        weights = calculate_phrase_weights(text, phrases)
        phrases, weights = merge_short_phrases(phrases, weights, min_chars=40, max_chars=None)

        # ─── 5) Calcul du budget audio hors silences pré/post ───
        content_ms = max(0, total_ms - pre_ms - post_ms)

        # ─── 6) Synthèse phrase par phrase ───
        phrase_audios = []
        for pi, ph in enumerate(phrases):
            dur_s  = (content_ms * weights[pi]) / 1000.0
            tmp_mp3 = os.path.join(tempfile.gettempdir(), f"tmp_{idx}_{pi}.mp3")
            await robust_synthesize_phrase(ph, tmp_mp3, voice="fr-FR-DeniseNeural", rate=rate)

            aud = AudioSegment.from_mp3(tmp_mp3)
            try:
                os.remove(tmp_mp3)
            except PermissionError:
                pass  # Windows lock, on nettoiera plus tard
            aud = adjust_audio_duration(aud, dur_s)
            phrase_audios.append(aud)

        # ─── 7) Anti-dépassement si TTS trop long ───
        sum_tts = sum(a.duration_seconds * 1000 for a in phrase_audios)
        if sum_tts > content_ms and sum_tts > 0:
            factor = content_ms / sum_tts
            phrase_audios = [ change_playback_speed(a, factor) for a in phrase_audios ]
            sum_tts = sum(a.duration_seconds * 1000 for a in phrase_audios)

        # ─── 8) Silences internes ───
        n_inter = max(0, len(phrases) - 1)
        manual_inters = seg.get("silences_internal", [])
        if manual_inters and len(manual_inters) == n_inter:
            inter_applied = manual_inters
        else:
            available = content_ms - sum(a.duration_seconds * 1000 for a in phrase_audios)
            if n_inter > 0 and available > 0:
                sil_ms = available // n_inter
                inter_applied = [sil_ms] * n_inter
            else:
                inter_applied = [0] * n_inter

        # ─── 9) Assemblage du segment ───
        seq = []
        for i_aud, aud in enumerate(phrase_audios):
            seq.append(aud)
            if i_aud < len(inter_applied):
                seq.append(AudioSegment.silent(duration=inter_applied[i_aud]))

        seg_audio = AudioSegment.silent(duration=pre_ms)
        for clip in seq:
            seg_audio += clip
        seg_audio += AudioSegment.silent(duration=post_ms)

        # ─── 10) Strip des silences initiaux indésirables ───
        nons = detect_nonsilent(seg_audio, min_silence_len=1, silence_thresh=seg_audio.dBFS - 16)
        if nons:
            seg_audio = seg_audio[nons[0][0]:]
        seg_audio = AudioSegment.silent(duration=pre_ms) + seg_audio

        # ─── 11) Ajuster à la durée totale_ms ───
        if len(seg_audio) < total_ms:
            seg_audio += AudioSegment.silent(duration=(total_ms - len(seg_audio)))
        seg_audio = seg_audio[:total_ms]

        # ─── 12) Debug timings ───
        nons2       = detect_nonsilent(seg_audio, min_silence_len=1, silence_thresh=seg_audio.dBFS - 16)
        start_a     = nons2[0][0] if nons2 else pre_ms
        end_a       = nons2[-1][1] if nons2 else (total_ms - post_ms)
        decal_start = int(start_s*1000 + start_a) - int(start_s*1000)
        decal_end   = int(start_s*1000 + end_a)   - int(end_s*1000)

        # ─── 13) Warp global si nécessaire ───
        gen_dur = seg_audio.duration_seconds
        diff    = (end_s - start_s) - gen_dur
        if abs(diff) > 0.20:
            seg_audio = change_playback_speed(seg_audio, (end_s - start_s) / gen_dur)

        # ─── 14) Application des offsets puis mix sur timeline ───
        #  a) end-offset (allonger ou tronquer seg_audio)
        if eoff > 0:
            seg_audio = seg_audio + AudioSegment.silent(duration=eoff)
        elif eoff < 0:
            seg_audio = seg_audio[:eoff]

        #  b) calcul du point de collage
        start_ms = int(start_s * 1000) + soff
        if len(combined) < start_ms:
            combined += AudioSegment.silent(duration=(start_ms - len(combined)))
        elif len(combined) > start_ms:
            combined = combined[:start_ms]

        combined += seg_audio

        # ─── 15) Log debug ───
        debug.append(
            f"Segment {idx+1} ({start_s:.2f}-{end_s:.2f}s): "
            f"pre={pre_ms}ms, post={post_ms}ms, speed={rate}, "
            f"silences_internal={inter_applied}, "
            f"décal_start={decal_start}ms, décal_end={decal_end}ms\n"
        )

    # ─── 16) Export du debug et de l’onde ───
    with open(debug_log_path, "w", encoding="utf-8") as df:
        df.write("Translation Debug Log\n\n")
        df.writelines(debug)
    combined.export(output_audio_path, format="wav")

    return output_audio_path


# ============== Merge Audio and Video Function ==============
def merge_audio_video():
    video = VideoFileClip(input_video)
    audio = AudioFileClip(translated_audio)
    if audio.duration < video.duration:
        extra_silence = AudioSegment.silent(duration=(video.duration - audio.duration) * 1000)
        audio_path_temp = os.path.join(output_dir, "temp_full_audio.wav")
        audio_seg = AudioSegment.from_file(translated_audio, format="wav")
        full_audio = audio_seg + extra_silence
        full_audio.export(audio_path_temp, format="wav")
        audio = AudioFileClip(audio_path_temp)
    video = video.set_audio(audio)
    video.write_videofile(
        output_video,
        codec="libx264",
        audio_codec="aac",
        temp_audiofile="temp-audio.m4a",
        remove_temp=True,
        threads=4
    )

# ============== Main Asynchronous Flow ==============
async def async_main():
    print("Extracting audio...")
    audio_path = extract_audio()
    print("Transcribing audio...")
    language, segments = transcribe(audio_path)
    print("Generating English subtitles...")
    generate_subtitle_file(segments, subtitle_file_en)
    print("Generating French audio with synchronization and manual overrides...")
    await async_generate_translated_audio_with_sync_using_review(subtitle_file_en, translated_audio, debug_log_file, review_file)
    print("Merging audio and video...")
    merge_audio_video()
    print(f"Process completed! Output video: {output_video}")

if __name__ == "__main__":
    asyncio.run(async_main())




✅ ffmpeg found at: C:\ffmpeg\bin\ffmpeg.EXE
Extracting audio...
Transcribing audio...
Detected language: en
Generating English subtitles...
Generating French audio with synchronization and manual overrides...
✅ Review file généré : 4.2.4_Configuration de la solution_Avr_10_Latest_run_20250502_104831\translation_review.txt (26 segments)
✅ Parsed 26 segments depuis le review file.
[Debug] Attempt 1/10: Synthesizing phrase: 'Nous allons voir les configura…'
[Debug] Phrase synthesized successfully to C:\Users\061181~1\AppData\Local\Temp\tmp_0_0.mp3
[Debug] Attempt 1/10: Synthesizing phrase: 'Nous verrons comment créer une…'
[Debug] Phrase synthesized successfully to C:\Users\061181~1\AppData\Local\Temp\tmp_0_1.mp3
[Debug] Attempt 1/10: Synthesizing phrase: 'Nous verrons comment la sécuri…'
[Debug] Phrase synthesized successfully to C:\Users\061181~1\AppData\Local\Temp\tmp_0_2.mp3
[Debug] Attempt 1/10: Synthesizing phrase: 'La sécurité dans EPM comprendr…'
[Debug] Phrase synthesized success

                                                                        

MoviePy - Done.
Moviepy - Writing video 4.2.4_Configuration de la solution_Avr_10_Latest_run_20250502_104831\4.2.4_Configuration de la solution_Avr_10_Latest-french.mp4



                                                                  

Moviepy - Done !
Moviepy - video ready 4.2.4_Configuration de la solution_Avr_10_Latest_run_20250502_104831\4.2.4_Configuration de la solution_Avr_10_Latest-french.mp4
Process completed! Output video: 4.2.4_Configuration de la solution_Avr_10_Latest_run_20250502_104831\4.2.4_Configuration de la solution_Avr_10_Latest-french.mp4


424 - 07MAY - LAST VERSION

✅ ffmpeg found at: C:\ffmpeg\bin\ffmpeg.EXE
Extracting audio...
Transcribing audio...
Detected language: en
Generating English subtitles...
Generating French audio with synchronization and manual overrides...
[Debug] Attempt 1/10: Synthesizing phrase: 'Je vais jeter un œil aux confi…'
[Debug] Phrase synthesized successfully to C:\Users\061181~1\AppData\Local\Temp\tts_segment_1.mp3
[Debug] Attempt 1/10: Synthesizing phrase: 'À partir de la page d'accueil,…'
[Debug] Phrase synthesized successfully to C:\Users\061181~1\AppData\Local\Temp\tts_segment_2.mp3
[Debug] Attempt 1/10: Synthesizing phrase: 'Pour obtenir un rôle commercia…'
[Debug] Phrase synthesized successfully to C:\Users\061181~1\AppData\Local\Temp\tts_segment_3.mp3
[Debug] Attempt 1/10: Synthesizing phrase: 'Une fois que je l'ai enregistr…'
[Debug] Phrase synthesized successfully to C:\Users\061181~1\AppData\Local\Temp\tts_segment_4.mp3
[Debug] Attempt 1/10: Synthesizing phrase: 'Actuellement, je vais ouvrir u…'
[Error] Attemp

                                                                        

MoviePy - Done.
Moviepy - Writing video 4.2.4_Configuration de la solution_Avr_10_Latest_run_20250512_131959\4.2.4_Configuration de la solution_Avr_10_Latest-french.mp4



                                                                  

Moviepy - Done !
Moviepy - video ready 4.2.4_Configuration de la solution_Avr_10_Latest_run_20250512_131959\4.2.4_Configuration de la solution_Avr_10_Latest-french.mp4
Process completed! Output video: 4.2.4_Configuration de la solution_Avr_10_Latest_run_20250512_131959\4.2.4_Configuration de la solution_Avr_10_Latest-french.mp4


423 - 07may2025 -LAST VERSION

In [3]:
import os
import re
import ffmpeg
import pysrt
import time
from deep_translator import GoogleTranslator
from pydub import AudioSegment
from moviepy.editor import VideoFileClip, AudioFileClip
from faster_whisper import WhisperModel
from shutil import which
import nest_asyncio
from datetime import datetime
import tempfile
import asyncio
import edge_tts
import aiohttp
import ssl
import random
from pydub.silence import detect_nonsilent

nest_asyncio.apply()

# ----- Configuration -----
ffmpeg_path = which("ffmpeg")
if not ffmpeg_path:
    raise RuntimeError("ffmpeg not found. Please install ffmpeg first.")
print(f"✅ ffmpeg found at: {ffmpeg_path}")

input_video = "to translate/4.2.3_La création de rapports.mp4"
base_name = os.path.splitext(os.path.basename(input_video))[0]
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_dir = f"{base_name}_run_{timestamp}"
model_size = "small"
update_existing = True

# For this version we rely on cloud-based Edge TTS.
USE_EDGE_TTS = True

# Files and paths
os.makedirs(output_dir, exist_ok=True)
input_video_name = os.path.splitext(os.path.basename(input_video))[0]
extracted_audio = os.path.join(output_dir, f"{input_video_name}-extracted-audio.wav")
subtitle_file_en = os.path.join(output_dir, f"{input_video_name}-english.srt")
translated_audio = os.path.join(output_dir, f"{input_video_name}-french.wav")
output_video = os.path.join(output_dir, f"{input_video_name}-french.mp4")
review_file = os.path.join(output_dir, "translation_review.txt")
debug_log_file = os.path.join(output_dir, "translation_debug_log.txt")

# ============== Helper Functions (extract_audio, transcribe, etc.) ==============
def extract_audio():
    try:
        (ffmpeg
         .input(input_video)
         .output(extracted_audio, ac=1, ar=16000)
         .overwrite_output()
         .run(capture_stdout=True, capture_stderr=True)
        )
        return extracted_audio
    except ffmpeg.Error as e:
        print("STDOUT:", e.stdout.decode("utf8"))
        print("STDERR:", e.stderr.decode("utf8"))
        raise

def transcribe(audio_path):
    model = WhisperModel(model_size, device="cpu", compute_type="int8")
    segments, info = model.transcribe(audio_path, beam_size=5)
    language = info.language
    print(f"Detected language: {language}")
    transcript_segments = []
    for segment in segments:
        transcript_segments.append({
            "start": segment.start,
            "end": segment.end,
            "text": segment.text.strip()
        })
    return language, transcript_segments

def time_to_subrip(seconds: float) -> pysrt.SubRipTime:
    hours = int(seconds // 3600)
    seconds %= 3600
    minutes = int(seconds // 60)
    seconds %= 60
    milliseconds = int((seconds - int(seconds)) * 1000)
    return pysrt.SubRipTime(hours=hours, minutes=minutes, seconds=int(seconds), milliseconds=milliseconds)

def generate_subtitle_file(segments, output_path):
    subs = pysrt.SubRipFile()
    for i, segment in enumerate(segments, 1):
        sub = pysrt.SubRipItem(
            index=i,
            start=time_to_subrip(segment["start"]),
            end=time_to_subrip(segment["end"]),
            text=segment["text"]
        )
        subs.append(sub)
    subs.save(output_path, encoding="utf-8")
    return output_path

# ============== Translation & Review Functions ==============

def split_long_groups(groups, max_group_duration_secs):
    """
    For each group (list of SubRipItems), if its duration > max_group_duration_secs,
    split it at the *last* subtitle in that group whose text ends in punctuation
    (.,!? or comma) before the duration threshold.
    Falls back to a simple split if no such “safe” break exists.
    """
    new_groups = []
    for group in groups:
        start_s = group[0].start.ordinal / 1000
        end_s   = group[-1].end.ordinal   / 1000
        total   = end_s - start_s

        # if already shorter than threshold, keep it
        if total <= max_group_duration_secs:
            new_groups.append(group)
            continue

        # otherwise walk through, tracking safe_breaks
        temp = []
        temp_start = start_s
        last_safe_idx = None
        for idx, item in enumerate(group):
            temp.append(item)
            # mark this idx if it ends in punctuation or comma
            if re.search(r"[.,!?]$", item.text.strip()):
                last_safe_idx = idx

            current_end = item.end.ordinal / 1000
            if (current_end - temp_start) >= max_group_duration_secs:
                # if we have a safe break before or at idx, split there
                if last_safe_idx is not None:
                    # emit group up through last_safe_idx
                    safe_group = temp[: last_safe_idx+1 ]
                    new_groups.append(safe_group)
                    # restart temp from the items after safe_idx
                    temp = temp[last_safe_idx+1 :]
                    temp_start = temp[0].start.ordinal / 1000 if temp else current_end
                else:
                    # no safe break—just split at current idx
                    new_groups.append(temp)
                    temp = []
                    temp_start = current_end

                # reset safe marker
                last_safe_idx = None

        # anything left over
        if temp:
            new_groups.append(temp)

    return new_groups


def validate_audio_duration(original_segment, translated_audio):
    """Compares original video duration with generated audio"""
    video_dur = original_segment['end'] - original_segment['start']
    audio_dur = translated_audio.duration_seconds
    
    if abs(video_dur - audio_dur) > 0.5:  # 500ms tolerance
        compensation = (video_dur - audio_dur) * 1000  # ms
        if compensation > 0:
            return AudioSegment.silent(duration=compensation)
        else:
            return translated_audio[:int(compensation*1000)]  # ms to samples
    return translated_audio

def generate_phrase_audio(text, voice_speed):
    raw_audio = edge_tts.Communicate(text).audio
    processed = apply_speed_adjustment(raw_audio, voice_speed)
    
    # Detect and preserve natural phrase endings
    non_silent = detect_nonsilent(processed, min_silence_len=50, silence_thresh=-40)
    if non_silent:
        end_pad = 150  # Minimum ending padding
        new_end = max(non_silent[-1][1] + end_pad, len(processed))
        return processed[:new_end]
    return processed


def apply_speed_adjustment(raw_audio, speed_setting):
    speed_factor = 1 + (int(speed_setting.strip('%')) / 100)
    sped_up = raw_audio.speedup(
        playback_speed=speed_factor,
        chunk_size=150,
        crossfade=25
    )
    
    # Calculate duration difference
    original_dur = len(raw_audio)
    new_dur = len(sped_up)
    compensation = original_dur - new_dur
    
    if compensation > 0:
        return sped_up + AudioSegment.silent(duration=compensation)
    return sped_up


def parse_review_overrides(review_file_path):
    text   = open(review_file_path, "r", encoding="utf-8").read()
    # split on any line of 3+ hyphens
    blocks = re.split(r"(?m)^-{3,}\s*$", text)

    overrides = []
    for idx, blk in enumerate(blocks, start=1):
        blk = blk.strip()
        if not blk or blk.startswith("Translation Review File"):
            continue

        # defaults
        ft       = None
        vs       = "+0%"
        pre_ms   = 0.0
        post_ms  = 100.0
        inter_ms = []

        for line in blk.splitlines():
            if line.startswith("**Final Translation:**"):
                ft = line.split("**Final Translation:**",1)[1].strip()
            elif line.startswith("**Voice Speed:**"):
                vs = line.split("**Voice Speed:**",1)[1].strip()
            elif line.startswith("**Pre-Silence:**"):
                try: pre_ms = float(line.split("**Pre-Silence:**",1)[1])
                except: print(f"[Warn] Seg {idx}: bad Pre-Silence")
            elif line.startswith("**Post-Silence:**"):
                try: post_ms = float(line.split("**Post-Silence:**",1)[1])
                except: print(f"[Warn] Seg {idx}: bad Post-Silence")
            elif line.startswith("**Inter-Phrase-Silence:**"):
                            parts = line.split("**Inter-Phrase-Silence:**",1)[1].strip()
                            if parts:
                                try:
                                    # Force negative values to 0 and limit to 5000ms max
                                    raw = [float(x) for x in parts.split(",")]
                                    inter_ms = [ max(0, min(x, 5000)) for x in raw ]
                                except ValueError:
                                    print(f"[Warning] Segment {idx}: invalid Inter-Phrase-Silence list")
                                    inter_ms = []

        if ft is None:
            print(f"[Warn] Seg {idx}: no Final Translation—will use source text.")

        overrides.append({
            "final_translation":      ft,
            "voice_speed":            vs,
            "pre_silence":            pre_ms,
            "post_silence":           post_ms,
            "inter_phrase_silences":  inter_ms
        })

    print("Parsed review overrides:")
    for i,o in enumerate(overrides,1):
        print(f"  Seg {i}: final={'OK' if o['final_translation'] else '<none>'}, "
              f"speed={o['voice_speed']}, pre={o['pre_silence']}ms, post={o['post_silence']}ms, "
              f"inter={o['inter_phrase_silences']}")
    return overrides


def enforce_punctuation_boundaries(groups):
    """Ensure groups end with proper punctuation"""
    i = 0
    safe_punctuation = r"[.!?,;:]$"
    while i < len(groups):
        last_text = groups[i][-1].text.strip()
        if not re.search(safe_punctuation, last_text):
            if i+1 < len(groups):
                groups[i] += groups.pop(i+1)
            else:  # Add artificial pause for final group
                groups[i][-1].text += "."
        else:
            i += 1
    return groups



# ============== Audio Synchronization Functions ==============


def adjust_audio_duration(audio: AudioSegment, target_secs: float) -> AudioSegment:
    """
     Ajuste TTS clip pour qu'il tienne **exactement** dans target_secs :
     - Si l'audio est trop long, on le **tronque**.  
     - S'il est trop court, on ajoute du silence.  
    """
    target_ms = int(target_secs * 1000)
    curr_ms   = len(audio)
    if curr_ms > target_ms:
        # on coupe précisément à la durée allouée
        return audio[:target_ms]
    elif curr_ms < target_ms:
            # on complète par du silence
        return audio + AudioSegment.silent(duration=(target_ms - curr_ms))
    return audio


# ============== French Phrase Alignment Functions ==============
def split_french_phrases(text):
    phrases = re.split(r"(?<=[.!?])\s+(?=[A-Z])", text)
    return [phrase.strip() for phrase in phrases if phrase.strip()]

def calculate_phrase_weights(original_text, translated_phrases):
    fr_phrase_word_counts = [len(phrase.split()) for phrase in translated_phrases]
    total_fr_words = sum(fr_phrase_word_counts)
    if total_fr_words == 0:
        return [1 / len(translated_phrases)] * len(translated_phrases)
    return [count / total_fr_words for count in fr_phrase_word_counts]

# ============== TTS Functions: Edge TTS Only with Debug Logging ==============


def change_playback_speed(sound, speed=1.0):
    new_frame_rate = int(sound.frame_rate * speed)
    altered_sound = sound._spawn(sound.raw_data, overrides={"frame_rate": new_frame_rate})
    return altered_sound.set_frame_rate(sound.frame_rate)

# ============== Updated Async Audio Generation Function ==============


def validate_audio_timing(original_duration, translated_segment):
    total_audio_time = (
        translated_segment["pre_silence"] 
        + sum(translated_segment["inter_phrase_silences"]) 
        + translated_segment["post_silence"] 
        + (translated_segment["audio"].duration_seconds * 1000)
    )
    
    if total_audio_time > original_duration * 1000:
        raise ValueError(f"Audio overflow: {total_audio_time}ms vs {original_duration*1000}ms")
    elif total_audio_time < original_duration * 1000 * 0.95:
        print(f"Warning: Audio underflow by {original_duration*1000 - total_audio_time}ms")



def adjust_review_file_based_on_debug_log(debug_log_path: str, review_file_path: str):
    """
    Pour chaque segment i :
      - Si décal_end est négatif de D ms, on ajoute D ms à post_silence
      - Si décal_start est positif de D ms, on ajoute D ms à pre_silence
    On réécrit ensuite le review_file avec ces nouvelles valeurs.
    """
    # 1) Parse le debug log
    decalages = {}  # idx -> (d_start, d_end)
    pattern = re.compile(r"Segment (\d+).*décal_start=(-?\d+)ms, décal_end=(-?\d+)ms")
    for line in open(debug_log_path, encoding="utf-8"):
        m = pattern.search(line)
        if m:
            idx = int(m.group(1))
            d_start, d_end = int(m.group(2)), int(m.group(3))
            decalages[idx] = (d_start, d_end)

    # 2) Lit tout le review file en mémoire
    text = open(review_file_path, encoding="utf-8").read()
    blocks = re.split(r"(?m)^-{3,}\s*$", text)

    out = []
    for blk in blocks:
        if not blk.strip() or blk.startswith("Translation Review File"):
            out.append(blk)
            continue

        # trouve le segment
        header = re.search(r"Segment\s+(\d+)\s+\(", blk)
        if not header:
            out.append(blk); continue
        idx = int(header.group(1))
        d_start, d_end = decalages.get(idx, (0, 0))

        # remplace les lignes Pre-Silence / Post-Silence
        def repl_pre(m):
            old = float(m.group(1))
            new = max(0.0, old + d_start)
            return f"**Pre-Silence:** {new:.0f}"
        blk = re.sub(r"\*\*Pre-Silence:\*\*\s*([0-9.]+)", repl_pre, blk)

        def repl_post(m):
            old = float(m.group(1))
            # si d_end<0, audio est trop long => il a fallu tronquer => on ne réduit pas post
            # si d_end>0, audio trop court => on ajoute
            new = max(0.0, old + d_end)
            return f"**Post-Silence:** {new:.0f}"
        blk = re.sub(r"\*\*Post-Silence:\*\*\s*([0-9.]+)", repl_post, blk)

        out.append(blk)

    # 3) Réécriture du fichier
    with open(review_file_path, "w", encoding="utf-8") as f:
        f.write("\n---\n".join(out))
    print(f"✅ Review file ajusté selon {debug_log_path}")


def generate_translation_review_file(
    source_path, review_file_path,
    from_lang="en", to_lang="fr",
    max_group_duration_secs: float = 25.0
):
    """
    1) On regroupe et on split/merge les sous-titres exactement
       comme le fera l'audio.
    2) On écrit un review file où l'on affiche :
       - phrase par phrase (la liste exacte via "- ")
       - pre / post silence
       - voice speed
       - start/end offset
       - inter-phrase silences (N–1 valeurs pour N phrases)
    L'utilisateur peut ensuite :
      * ajuster Final Translation, Voice Speed, Pre/Post-Silence,
        Start-Offset, End-Offset
      * modifier le nombre de phrases (le parser adaptera N–1 silences).
    """

    translator = GoogleTranslator(source=from_lang, target=to_lang)
    subs = pysrt.open(source_path)

    # 1) Regrouper par phrase (détection ponctuation en fin de sous-titre)
    sentence_end = re.compile(r"[.!?]\s*$")
    groups, cur = [], []
    for sub in subs:
        cur.append(sub)
        if sentence_end.search(sub.text):
            groups.append(cur); cur = []
    if cur:
        groups.append(cur)

    # 2) Éclatement des groupes trop longs
    def split_long(gs, max_s):
        out = []
        for g in gs:
            start, end = g[0].start.ordinal/1000, g[-1].end.ordinal/1000
            if end - start <= max_s:
                out.append(g)
            else:
                mid = len(g)//2
                out.extend([g[:mid], g[mid:]])
        return out
    groups = split_long(groups, max_group_duration_secs)

    # 3) Forcer ponctuation de fin de groupe
    i = 0
    safe_punct = re.compile(r"[.!?,;:]$")
    while i < len(groups):
        if not safe_punct.search(groups[i][-1].text.strip()):
            if i+1 < len(groups):
                groups[i] += groups.pop(i+1)
                continue
            else:
                groups[i][-1].text += "."
        i += 1

    # 4) Écriture du fichier de review
    with open(review_file_path, "w", encoding="utf-8") as f:
        f.write("Translation Review File\n")
        f.write("Le découpage en phrases ci-dessous est **celui utilisé** en TTS.\n")
        f.write("Ajustez si besoin **Final Translation**, **Voice Speed**, **Pre/Post-Silence**, "
                "**Start-Offset:**, **End-Offset:**, **Inter-Phrase-Silence:**\n")
        f.write("mais **ne touchez pas** la liste des phrases (lignes qui commencent par '- ').\n")
        f.write("----------------------------------------------------------------\n\n")

        for idx, group in enumerate(groups, 1):
            # Calcul des temps
            start_s = group[0].start.ordinal / 1000
            end_s   = group[-1].end.ordinal   / 1000

            # Texte original + auto-traduit
            original = " ".join(s.text for s in group)
            auto_tr  = translator.translate(text=original)

            # Découpage initial en phrases (on ne réécrit pas ces lignes, mais on calcule N)
            phrases = re.split(r"(?<=[.!?])\s+(?=[A-ZÀÂÉÈÊËÎÏÔŒÙÛÜ])", auto_tr)
            phrases = [p.strip() for p in phrases if p.strip()]

            # Préparer la liste par défaut des silences internes = N–1 × 0 ms
            n = len(phrases)
            inter_silences = ",".join("0" for _ in range(max(0, n-1)))

            # Valeurs par défaut
            pre_ms, post_ms = 0, 0
            start_offset, end_offset = 0, 0
            voice_speed = "+0%"

            # Écriture du segment
            f.write(f"Segment {idx} (start: {start_s:.2f}s, end: {end_s:.2f}s)\n")
            f.write(f"**Original:** {original}\n")
            f.write(f"**Auto Translated:** {auto_tr}\n")
            f.write(f"**Final Translation:** {auto_tr}\n")
            f.write(f"**Voice Speed:** {voice_speed}\n")
            f.write(f"**Pre-Silence:** {pre_ms}\n")
            f.write(f"**Post-Silence:** {post_ms}\n")
            f.write(f"**Start-Offset:** {start_offset}\n")
            f.write(f"**End-Offset:** {end_offset}\n")
            f.write(f"**Inter-Phrase-Silence:** {inter_silences}\n")

            # Liste des phrases pour que l'utilisateur puisse la modifier
            for ph in phrases:
                f.write(f"- {ph}\n")

            f.write("\n----------------------------------------------------------------\n\n")

    print(f"✅ Review file créé : {review_file_path} ({len(groups)} segments)")
    input("Tapez 'Y' pour continuer…")



def parse_review_fileOLDA(review_file_path):
    """
    Lit le review file écrit ci-dessus et
    renvoie une liste de dicts avec :
      - start_s, end_s, final_translation, voice_speed
      - pre_silence, post_silence, phrases (list)
    """
    text = open(review_file_path, encoding="utf-8").read()
    blocks = [b.strip() for b in re.split(r"(?m)^-{3,}\s*$", text) if b.strip()]
    segments = []
    header = re.compile(r"Segment\s+\d+\s+\(start:\s*([0-9.]+)s,\s*end:\s*([0-9.]+)s\)")
    for blk in blocks:
        m = header.search(blk)
        if not m or blk.startswith("Translation Review File"): continue
        start_s, end_s = float(m.group(1)), float(m.group(2))

        ft, vs, pre, post = None, "+0%", 0.0, 0.0
        orig = None
        start_offset = 0 
        phrases = []
        for line in blk.splitlines():
            line = line.strip()
            if line.startswith("**Final Translation:**"):
                ft = line.split("**Final Translation:**",1)[1].strip()
            elif line.startswith("**Voice Speed:**"):
                vs = line.split("**Voice Speed:**",1)[1].strip()
            elif line.startswith("**Pre-Silence:**"):
                pre = float(line.split("**Pre-Silence:**",1)[1])
            elif line.startswith("**Post-Silence:**"):
                post = float(line.split("**Post-Silence:**",1)[1])
            elif line.startswith("**Start-Offset:**"):
                 # offset en millisecondes à ajouter au start
                start_offset = int(line.split("**Start-Offset:**",1)[1])
            elif line.startswith("**End-Offset:**"):
                end_offset = int(line.split("**End-Offset:**",1)[1])                
            elif line.startswith("- "):
                phrases.append(line[2:].strip())
            elif line.startswith("**Original:**"):
                orig = line.split("**Original:**",1)[1].strip()

        segments.append({
            "start_s":           start_s,
            "end_s":             end_s,
            "original":          orig,
            "final_translation": ft or orig,
            "voice_speed":       vs,
            "pre_silence":       pre,
            "post_silence":      post,
            "start_offset_ms":   start_offset,
            "end_offset_ms":     end_offset,
            "phrases":           phrases
        })

    print(f"✅ Parsed {len(segments)} segments depuis le review file.")
    return segments

# ============== TTS Functions: Edge TTS Only with Debug Logging ==============
async def robust_synthesize_phrase(
    phrase: str,
    output_path: str,
    voice: str = "fr-FR-DeniseNeural",
    rate: str = "+0%",
    max_retries: int = 10
):
    """
    Synthesize speech using Edge TTS with robust retry logic.
    Detailed debug messages are printed for each attempt.
    """
    for attempt in range(1, max_retries+1):
        try:
            async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=30)) as session:
                communicate = edge_tts.Communicate(
                    text=phrase,
                    voice=voice,
                    rate=rate
                )
                print(f"[Debug] Attempt {attempt}/{max_retries}: Synthesizing phrase: '{phrase[:30]}…'")
                await communicate.save(output_path)
                print(f"[Debug] Phrase synthesized successfully to {output_path}")
                return
        except Exception as e:
            wait_time = 2 ** attempt + random.random()
            print(f"[Error] Attempt {attempt}/{max_retries} failed for phrase: '{phrase[:30]}…'. Exception: {e}")
            if attempt < max_retries:
                print(f"[Debug] Retrying in {wait_time:.1f}s…")
                await asyncio.sleep(wait_time)
    raise RuntimeError(f"Failed to synthesize phrase after {max_retries} attempts: {phrase[:30]}…")

async def synthesize_phrase_edge_hybrid(
    phrase: str,
    output_path: str,
    voice: str = "fr-FR-DeniseNeural",
    rate: str = "+0%"
):
    # Pour compatibilité, on redirige vers le robust_synthesize
    await robust_synthesize_phrase(phrase, output_path, voice, rate)


def merge_short_phrases(phrases, weights, min_chars=40, max_chars=None):
    new_ph, new_wt = [], []
    buf_ph, buf_wt = "", 0.0
    for ph, wt in zip(phrases, weights):
        if not buf_ph:
            buf_ph, buf_wt = ph, wt
        else:
            if len(buf_ph) < min_chars or len(ph) < min_chars:
                cand = buf_ph + " " + ph
                # si pas de max_chars défini, on fusionne sans condition
                cond = True if max_chars is None else (len(cand) <= max_chars)
                if cond:
                    buf_ph = cand
                    buf_wt += wt
                else:
                    new_ph.append(buf_ph)
                    new_wt.append(buf_wt)
                    buf_ph, buf_wt = ph, wt
            else:
                new_ph.append(buf_ph)
                new_wt.append(buf_wt)
                buf_ph, buf_wt = ph, wt
    if buf_ph:
        new_ph.append(buf_ph)
        new_wt.append(buf_wt)
    return new_ph, new_wt



def split_long_phrasesaaa(phrases, max_chars=80):
    new = []
    for p in phrases:
        if len(p) > max_chars:
            # on découpe au premier “,” ou “ et ” qu’on trouve
            parts = re.split(r",\s+| et ", p, maxsplit=1)
            new.extend([parts[0].strip(), parts[1].strip()] if len(parts)==2 else [p])
        else:
            new.append(p)
    return new

def parse_review_file(review_file_path):
    """
    Lit le review file et renvoie une liste de dicts avec :
      - start_s, end_s, original, final_translation, voice_speed
      - pre_silence, post_silence, start_offset_ms, end_offset_ms
      - phrases (list de phrases) et inter_phrase_silences (liste de silences internes)
    """
    text = open(review_file_path, encoding="utf-8").read()
    blocks = [b.strip() for b in re.split(r"(?m)^-{3,}\s*$", text) if b.strip()]
    segments = []
    header = re.compile(r"Segment\s+\d+\s+\(start:\s*([0-9.]+)s,\s*end:\s*([0-9.]+)s\)")

    for blk in blocks:
        m = header.search(blk)
        if not m or blk.startswith("Translation Review File"): 
            continue
        start_s, end_s = float(m.group(1)), float(m.group(2))

        # valeurs par défaut
        ft, vs = None, "+0%"
        pre, post = 0.0, 0.0
        soffs, eoffs = 0, 0
        phrases = []
        inter = []

        for line in blk.splitlines():
            line = line.strip()
            if line.startswith("**Final Translation:**"):
                ft = line.split("**Final Translation:**",1)[1].strip()
            elif line.startswith("**Voice Speed:**"):
                vs = line.split("**Voice Speed:**",1)[1].strip()
            elif line.startswith("**Pre-Silence:**"):
                pre = float(line.split("**Pre-Silence:**",1)[1])
            elif line.startswith("**Post-Silence:**"):
                post = float(line.split("**Post-Silence:**",1)[1])
            elif line.startswith("**Start-Offset:**"):
                soffs = int(line.split("**Start-Offset:**",1)[1])
            elif line.startswith("**End-Offset:**"):
                eoffs = int(line.split("**End-Offset:**",1)[1])
            elif line.startswith("**Inter-Phrase-Silence:**"):
                parts = line.split("**Inter-Phrase-Silence:**",1)[1].strip()
                if parts:
                    inter = [max(0, int(x)) for x in parts.split(",")]
            elif line.startswith("- "):
                phrases.append(line[2:].strip())

        segments.append({
            "start_s": start_s,
            "end_s": end_s,
            "final_translation": ft or "",
            "voice_speed": vs,
            "pre_silence": pre,
            "post_silence": post,
            "start_offset_ms": soffs,
            "end_offset_ms": eoffs,
            "phrases": phrases,
            "inter_phrase_silences": inter
        })

    print(f"✅ Parsed {len(segments)} segments depuis le review file.")
    return segments


async def async_generate_translated_audio_with_sync_using_review(
    subtitle_source_path, output_audio_path,
    debug_log_path, review_file_path
):
    # 1) Création du review file uniquement s'il n'existe pas encore
    if not os.path.exists(review_file_path):
        generate_translation_review_file(
            subtitle_source_path,
            review_file_path,
            max_group_duration_secs=25.0
        )
    else:
        print("✅ Review file déjà présent, on conserve vos offsets personnalisés.")


    # 2) Lecture du review file enrichi
    segments = parse_review_file(review_file_path)

    combined = AudioSegment.silent(duration=0)
    debug    = []

    for idx, seg in enumerate(segments):
        start_s = seg["start_s"]
        end_s   = seg["end_s"]
        total_ms = int((end_s - start_s) * 1000)

        # Récupération des settings
        text    = seg["final_translation"]
        rate    = seg["voice_speed"]
        pre_ms  = seg["pre_silence"]
        post_ms = seg["post_silence"]
        soff    = seg.get("start_offset_ms", 0)
        eoff    = seg.get("end_offset_ms",   0)

        # Phrase splitting & TTS
        phrases = split_french_phrases(text)
        weights = calculate_phrase_weights(text, phrases)
        phrases, weights = merge_short_phrases(phrases, weights, min_chars=40)

        # Budget pour TTS seule
        content_ms = max(0, total_ms - pre_ms - post_ms)

        # Synthèse phrase par phrase
        phrase_audios = []
        for i, ph in enumerate(phrases):
            dur_s  = (content_ms * weights[i]) / 1000.0
            tmp_mp3 = os.path.join(tempfile.gettempdir(), f"tmp_{idx}_{i}.mp3")
            await robust_synthesize_phrase(ph, tmp_mp3, rate=rate)
            aud = AudioSegment.from_mp3(tmp_mp3)
            os.remove(tmp_mp3)
            aud = adjust_audio_duration(aud, dur_s)
            phrase_audios.append(aud)

        # Ajustement interne par override ou répartition égale
        n_inter = max(0, len(phrases) - 1)
        if seg.get("inter_phrase_silences"):
            inter_applied = seg["inter_phrase_silences"]
            # adapter la longueur
            if len(inter_applied) < n_inter:
                inter_applied += [0] * (n_inter - len(inter_applied))
            elif len(inter_applied) > n_inter:
                inter_applied = inter_applied[:n_inter]
        else:
            available = content_ms - sum(a.duration_seconds * 1000 for a in phrase_audios)
            if n_inter > 0 and available > 0:
                sil_ms = available // n_inter
                inter_applied = [sil_ms] * n_inter
            else:
                inter_applied = [0] * n_inter

        # Reconstruction du segment audio
        seg_audio = AudioSegment.silent(duration=pre_ms)
        for i, aud in enumerate(phrase_audios):
            seg_audio += aud
            if i < len(inter_applied):
                seg_audio += AudioSegment.silent(duration=inter_applied[i])
        seg_audio += AudioSegment.silent(duration=post_ms)

        # Application offset de fin
        if eoff > 0:
            seg_audio += AudioSegment.silent(duration=eoff)
        elif eoff < 0:
            seg_audio = seg_audio[:eoff]

        # Debug timing (prise en compte de soff)
        nons2 = detect_nonsilent(seg_audio, min_silence_len=1,
                                 silence_thresh=seg_audio.dBFS - 16)
        start_a = nons2[0][0] if nons2 else pre_ms
        end_a   = nons2[-1][1] if nons2 else (total_ms - post_ms)
        abs_s_a = int(start_s * 1000) + start_a
        abs_e_a = int(start_s * 1000) + end_a
        abs_s_v = int(start_s * 1000) + soff
        #abs_e_v = int(end_s   * 1000)
        abs_e_v = int(end_s   * 1000) + eoff
        decal_start = abs_s_a - abs_s_v
        decal_end   = abs_e_a - abs_e_v

        # Mise sur timeline avec offset de start
        start_ms = int(start_s * 1000) + soff
        if len(combined) < start_ms:
            combined += AudioSegment.silent(duration=(start_ms - len(combined)))
        elif len(combined) > start_ms and soff < 0:
            combined = combined[:start_ms]
        combined += seg_audio

        # Enregistrement debug
        # debug.append(
        #     f"Segment {idx+1} ({start_s:.2f}-{end_s:.2f}s): pre={pre_ms}ms, post={post_ms}ms, "
        #     f"speed={rate}, inter={inter_applied}, "
        #     f"décal_start={decal_start}ms, décal_end={decal_end}ms\n"
        # )

        debug.append(
                   f"Segment {idx+1} ({start_s:.2f}-{end_s:.2f}s): "
                   f"pre={pre_ms}ms, post={post_ms}ms, speed={rate}, "
                   f"inter={inter_applied}, "
                   f"phrases={phrases}, "
                   f"décal_start={decal_start}ms, décal_end={decal_end}ms\n"
                )



    # Export debug & wav
    with open(debug_log_path, "w", encoding="utf-8") as df:
        df.write("Translation Debug Log\n\n")
        df.writelines(debug)
    combined.export(output_audio_path, format="wav")

    return output_audio_path


# ============== Merge Audio and Video Function ==============
def merge_audio_video():
    video = VideoFileClip(input_video)
    audio = AudioFileClip(translated_audio)
    if audio.duration < video.duration:
        extra_silence = AudioSegment.silent(duration=(video.duration - audio.duration) * 1000)
        audio_path_temp = os.path.join(output_dir, "temp_full_audio.wav")
        audio_seg = AudioSegment.from_file(translated_audio, format="wav")
        full_audio = audio_seg + extra_silence
        full_audio.export(audio_path_temp, format="wav")
        audio = AudioFileClip(audio_path_temp)
    video = video.set_audio(audio)
    video.write_videofile(
        output_video,
        codec="libx264",
        audio_codec="aac",
        temp_audiofile="temp-audio.m4a",
        remove_temp=True,
        threads=4
    )

# ============== Main Asynchronous Flow ==============
async def async_main():
    print("Extracting audio...")
    audio_path = extract_audio()
    print("Transcribing audio...")
    language, segments = transcribe(audio_path)
    print("Generating English subtitles...")
    generate_subtitle_file(segments, subtitle_file_en)
    print("Generating French audio with synchronization and manual overrides...")
    await async_generate_translated_audio_with_sync_using_review(subtitle_file_en, translated_audio, debug_log_file, review_file)
    print("Merging audio and video...")
    merge_audio_video()
    print(f"Process completed! Output video: {output_video}")

if __name__ == "__main__":
    asyncio.run(async_main())




✅ ffmpeg found at: C:\ffmpeg\bin\ffmpeg.EXE
Extracting audio...
Transcribing audio...
Detected language: en
Generating English subtitles...
Generating French audio with synchronization and manual overrides...
✅ Review file créé : 4.2.3_La création de rapports_run_20250508_115717\translation_review.txt (27 segments)
✅ Parsed 55 segments depuis le review file.
[Debug] Attempt 1/10: Synthesizing phrase: 'Dans cette démo, nous explorer…'
[Debug] Phrase synthesized successfully to C:\Users\061181~1\AppData\Local\Temp\tmp_0_0.mp3
[Debug] Attempt 1/10: Synthesizing phrase: 'Comment créer un résumé des dé…'
[Error] Attempt 1/10 failed for phrase: 'Comment créer un résumé des dé…'. Exception: Cannot connect to host speech.platform.bing.com:443 ssl:<ssl.SSLContext object at 0x000001D603A71C70> [Une connexion existante a dû être fermée par l’hôte distant]
[Debug] Retrying in 2.9s…
[Debug] Attempt 2/10: Synthesizing phrase: 'Comment créer un résumé des dé…'
[Debug] Phrase synthesized successfully 

                                                                       

MoviePy - Done.
Moviepy - Writing video 4.2.3_La création de rapports_run_20250508_115717\4.2.3_La création de rapports-french.mp4



                                                                  

Moviepy - Done !
Moviepy - video ready 4.2.3_La création de rapports_run_20250508_115717\4.2.3_La création de rapports-french.mp4
Process completed! Output video: 4.2.3_La création de rapports_run_20250508_115717\4.2.3_La création de rapports-french.mp4


4214 - 05may2025 - LAST VERSION

In [2]:
import os
import re
import ffmpeg
import pysrt
import time
from deep_translator import GoogleTranslator
from pydub import AudioSegment
from moviepy.editor import VideoFileClip, AudioFileClip
from faster_whisper import WhisperModel
from shutil import which
import nest_asyncio
from datetime import datetime
import tempfile
import asyncio
import edge_tts
import aiohttp
import ssl
import random
from pydub.silence import detect_nonsilent

nest_asyncio.apply()

# ----- Configuration -----
ffmpeg_path = which("ffmpeg")
if not ffmpeg_path:
    raise RuntimeError("ffmpeg not found. Please install ffmpeg first.")
print(f"✅ ffmpeg found at: {ffmpeg_path}")

input_video = "to translate/4.2.1.4_Réalisation des suivis financiers.mp4"
base_name = os.path.splitext(os.path.basename(input_video))[0]
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_dir = f"{base_name}_run_{timestamp}"
model_size = "small"
update_existing = True

# For this version we rely on cloud-based Edge TTS.
USE_EDGE_TTS = True

# Files and paths
os.makedirs(output_dir, exist_ok=True)
input_video_name = os.path.splitext(os.path.basename(input_video))[0]
extracted_audio = os.path.join(output_dir, f"{input_video_name}-extracted-audio.wav")
subtitle_file_en = os.path.join(output_dir, f"{input_video_name}-english.srt")
translated_audio = os.path.join(output_dir, f"{input_video_name}-french.wav")
output_video = os.path.join(output_dir, f"{input_video_name}-french.mp4")
review_file = os.path.join(output_dir, "translation_review.txt")
debug_log_file = os.path.join(output_dir, "translation_debug_log.txt")

# ============== Helper Functions (extract_audio, transcribe, etc.) ==============
def extract_audio():
    try:
        (ffmpeg
         .input(input_video)
         .output(extracted_audio, ac=1, ar=16000)
         .overwrite_output()
         .run(capture_stdout=True, capture_stderr=True)
        )
        return extracted_audio
    except ffmpeg.Error as e:
        print("STDOUT:", e.stdout.decode("utf8"))
        print("STDERR:", e.stderr.decode("utf8"))
        raise

def transcribe(audio_path):
    model = WhisperModel(model_size, device="cpu", compute_type="int8")
    segments, info = model.transcribe(audio_path, beam_size=5)
    language = info.language
    print(f"Detected language: {language}")
    transcript_segments = []
    for segment in segments:
        transcript_segments.append({
            "start": segment.start,
            "end": segment.end,
            "text": segment.text.strip()
        })
    return language, transcript_segments

def time_to_subrip(seconds: float) -> pysrt.SubRipTime:
    hours = int(seconds // 3600)
    seconds %= 3600
    minutes = int(seconds // 60)
    seconds %= 60
    milliseconds = int((seconds - int(seconds)) * 1000)
    return pysrt.SubRipTime(hours=hours, minutes=minutes, seconds=int(seconds), milliseconds=milliseconds)

def generate_subtitle_file(segments, output_path):
    subs = pysrt.SubRipFile()
    for i, segment in enumerate(segments, 1):
        sub = pysrt.SubRipItem(
            index=i,
            start=time_to_subrip(segment["start"]),
            end=time_to_subrip(segment["end"]),
            text=segment["text"]
        )
        subs.append(sub)
    subs.save(output_path, encoding="utf-8")
    return output_path

# ============== Translation & Review Functions ==============

def split_long_groups(groups, max_group_duration_secs):
    """
    For each group (list of SubRipItems), if its duration > max_group_duration_secs,
    split it at the *last* subtitle in that group whose text ends in punctuation
    (.,!? or comma) before the duration threshold.
    Falls back to a simple split if no such “safe” break exists.
    """
    new_groups = []
    for group in groups:
        start_s = group[0].start.ordinal / 1000
        end_s   = group[-1].end.ordinal   / 1000
        total   = end_s - start_s

        # if already shorter than threshold, keep it
        if total <= max_group_duration_secs:
            new_groups.append(group)
            continue

        # otherwise walk through, tracking safe_breaks
        temp = []
        temp_start = start_s
        last_safe_idx = None
        for idx, item in enumerate(group):
            temp.append(item)
            # mark this idx if it ends in punctuation or comma
            if re.search(r"[.,!?]$", item.text.strip()):
                last_safe_idx = idx

            current_end = item.end.ordinal / 1000
            if (current_end - temp_start) >= max_group_duration_secs:
                # if we have a safe break before or at idx, split there
                if last_safe_idx is not None:
                    # emit group up through last_safe_idx
                    safe_group = temp[: last_safe_idx+1 ]
                    new_groups.append(safe_group)
                    # restart temp from the items after safe_idx
                    temp = temp[last_safe_idx+1 :]
                    temp_start = temp[0].start.ordinal / 1000 if temp else current_end
                else:
                    # no safe break—just split at current idx
                    new_groups.append(temp)
                    temp = []
                    temp_start = current_end

                # reset safe marker
                last_safe_idx = None

        # anything left over
        if temp:
            new_groups.append(temp)

    return new_groups


def validate_audio_duration(original_segment, translated_audio):
    """Compares original video duration with generated audio"""
    video_dur = original_segment['end'] - original_segment['start']
    audio_dur = translated_audio.duration_seconds
    
    if abs(video_dur - audio_dur) > 0.5:  # 500ms tolerance
        compensation = (video_dur - audio_dur) * 1000  # ms
        if compensation > 0:
            return AudioSegment.silent(duration=compensation)
        else:
            return translated_audio[:int(compensation*1000)]  # ms to samples
    return translated_audio

def generate_phrase_audio(text, voice_speed):
    raw_audio = edge_tts.Communicate(text).audio
    processed = apply_speed_adjustment(raw_audio, voice_speed)
    
    # Detect and preserve natural phrase endings
    non_silent = detect_nonsilent(processed, min_silence_len=50, silence_thresh=-40)
    if non_silent:
        end_pad = 150  # Minimum ending padding
        new_end = max(non_silent[-1][1] + end_pad, len(processed))
        return processed[:new_end]
    return processed


def apply_speed_adjustment(raw_audio, speed_setting):
    speed_factor = 1 + (int(speed_setting.strip('%')) / 100)
    sped_up = raw_audio.speedup(
        playback_speed=speed_factor,
        chunk_size=150,
        crossfade=25
    )
    
    # Calculate duration difference
    original_dur = len(raw_audio)
    new_dur = len(sped_up)
    compensation = original_dur - new_dur
    
    if compensation > 0:
        return sped_up + AudioSegment.silent(duration=compensation)
    return sped_up


def parse_review_overrides(review_file_path):
    text   = open(review_file_path, "r", encoding="utf-8").read()
    # split on any line of 3+ hyphens
    blocks = re.split(r"(?m)^-{3,}\s*$", text)

    overrides = []
    for idx, blk in enumerate(blocks, start=1):
        blk = blk.strip()
        if not blk or blk.startswith("Translation Review File"):
            continue

        # defaults
        ft       = None
        vs       = "+0%"
        pre_ms   = 0.0
        post_ms  = 100.0
        inter_ms = []

        for line in blk.splitlines():
            if line.startswith("**Final Translation:**"):
                ft = line.split("**Final Translation:**",1)[1].strip()
            elif line.startswith("**Voice Speed:**"):
                vs = line.split("**Voice Speed:**",1)[1].strip()
            elif line.startswith("**Pre-Silence:**"):
                try: pre_ms = float(line.split("**Pre-Silence:**",1)[1])
                except: print(f"[Warn] Seg {idx}: bad Pre-Silence")
            elif line.startswith("**Post-Silence:**"):
                try: post_ms = float(line.split("**Post-Silence:**",1)[1])
                except: print(f"[Warn] Seg {idx}: bad Post-Silence")
            elif line.startswith("**Inter-Phrase-Silence:**"):
                            parts = line.split("**Inter-Phrase-Silence:**",1)[1].strip()
                            if parts:
                                try:
                                    # Force negative values to 0 and limit to 5000ms max
                                    raw = [float(x) for x in parts.split(",")]
                                    inter_ms = [ max(0, min(x, 5000)) for x in raw ]
                                except ValueError:
                                    print(f"[Warning] Segment {idx}: invalid Inter-Phrase-Silence list")
                                    inter_ms = []

        if ft is None:
            print(f"[Warn] Seg {idx}: no Final Translation—will use source text.")

        overrides.append({
            "final_translation":      ft,
            "voice_speed":            vs,
            "pre_silence":            pre_ms,
            "post_silence":           post_ms,
            "inter_phrase_silences":  inter_ms
        })

    print("Parsed review overrides:")
    for i,o in enumerate(overrides,1):
        print(f"  Seg {i}: final={'OK' if o['final_translation'] else '<none>'}, "
              f"speed={o['voice_speed']}, pre={o['pre_silence']}ms, post={o['post_silence']}ms, "
              f"inter={o['inter_phrase_silences']}")
    return overrides


def enforce_punctuation_boundaries(groups):
    """Ensure groups end with proper punctuation"""
    i = 0
    safe_punctuation = r"[.!?,;:]$"
    while i < len(groups):
        last_text = groups[i][-1].text.strip()
        if not re.search(safe_punctuation, last_text):
            if i+1 < len(groups):
                groups[i] += groups.pop(i+1)
            else:  # Add artificial pause for final group
                groups[i][-1].text += "."
        else:
            i += 1
    return groups



# ============== Audio Synchronization Functions ==============


def adjust_audio_duration(audio: AudioSegment, target_secs: float) -> AudioSegment:
    """
     Ajuste TTS clip pour qu'il tienne **exactement** dans target_secs :
     - Si l'audio est trop long, on le **tronque**.  
     - S'il est trop court, on ajoute du silence.  
    """
    target_ms = int(target_secs * 1000)
    curr_ms   = len(audio)
    if curr_ms > target_ms:
        # on coupe précisément à la durée allouée
        return audio[:target_ms]
    elif curr_ms < target_ms:
            # on complète par du silence
        return audio + AudioSegment.silent(duration=(target_ms - curr_ms))
    return audio


# ============== French Phrase Alignment Functions ==============
def split_french_phrases(text):
    phrases = re.split(r"(?<=[.!?])\s+(?=[A-Z])", text)
    return [phrase.strip() for phrase in phrases if phrase.strip()]

def calculate_phrase_weights(original_text, translated_phrases):
    fr_phrase_word_counts = [len(phrase.split()) for phrase in translated_phrases]
    total_fr_words = sum(fr_phrase_word_counts)
    if total_fr_words == 0:
        return [1 / len(translated_phrases)] * len(translated_phrases)
    return [count / total_fr_words for count in fr_phrase_word_counts]

# ============== TTS Functions: Edge TTS Only with Debug Logging ==============


def change_playback_speed(sound, speed=1.0):
    new_frame_rate = int(sound.frame_rate * speed)
    altered_sound = sound._spawn(sound.raw_data, overrides={"frame_rate": new_frame_rate})
    return altered_sound.set_frame_rate(sound.frame_rate)

# ============== Updated Async Audio Generation Function ==============


def validate_audio_timing(original_duration, translated_segment):
    total_audio_time = (
        translated_segment["pre_silence"] 
        + sum(translated_segment["inter_phrase_silences"]) 
        + translated_segment["post_silence"] 
        + (translated_segment["audio"].duration_seconds * 1000)
    )
    
    if total_audio_time > original_duration * 1000:
        raise ValueError(f"Audio overflow: {total_audio_time}ms vs {original_duration*1000}ms")
    elif total_audio_time < original_duration * 1000 * 0.95:
        print(f"Warning: Audio underflow by {original_duration*1000 - total_audio_time}ms")



def adjust_review_file_based_on_debug_log(debug_log_path: str, review_file_path: str):
    """
    Pour chaque segment i :
      - Si décal_end est négatif de D ms, on ajoute D ms à post_silence
      - Si décal_start est positif de D ms, on ajoute D ms à pre_silence
    On réécrit ensuite le review_file avec ces nouvelles valeurs.
    """
    # 1) Parse le debug log
    decalages = {}  # idx -> (d_start, d_end)
    pattern = re.compile(r"Segment (\d+).*décal_start=(-?\d+)ms, décal_end=(-?\d+)ms")
    for line in open(debug_log_path, encoding="utf-8"):
        m = pattern.search(line)
        if m:
            idx = int(m.group(1))
            d_start, d_end = int(m.group(2)), int(m.group(3))
            decalages[idx] = (d_start, d_end)

    # 2) Lit tout le review file en mémoire
    text = open(review_file_path, encoding="utf-8").read()
    blocks = re.split(r"(?m)^-{3,}\s*$", text)

    out = []
    for blk in blocks:
        if not blk.strip() or blk.startswith("Translation Review File"):
            out.append(blk)
            continue

        # trouve le segment
        header = re.search(r"Segment\s+(\d+)\s+\(", blk)
        if not header:
            out.append(blk); continue
        idx = int(header.group(1))
        d_start, d_end = decalages.get(idx, (0, 0))

        # remplace les lignes Pre-Silence / Post-Silence
        def repl_pre(m):
            old = float(m.group(1))
            new = max(0.0, old + d_start)
            return f"**Pre-Silence:** {new:.0f}"
        blk = re.sub(r"\*\*Pre-Silence:\*\*\s*([0-9.]+)", repl_pre, blk)

        def repl_post(m):
            old = float(m.group(1))
            # si d_end<0, audio est trop long => il a fallu tronquer => on ne réduit pas post
            # si d_end>0, audio trop court => on ajoute
            new = max(0.0, old + d_end)
            return f"**Post-Silence:** {new:.0f}"
        blk = re.sub(r"\*\*Post-Silence:\*\*\s*([0-9.]+)", repl_post, blk)

        out.append(blk)

    # 3) Réécriture du fichier
    with open(review_file_path, "w", encoding="utf-8") as f:
        f.write("\n---\n".join(out))
    print(f"✅ Review file ajusté selon {debug_log_path}")


def generate_translation_review_file(
    source_path, review_file_path,
    from_lang="en", to_lang="fr",
    max_group_duration_secs: float = 25.0
):
    """
    1) On regroupe et on split/merge les sous-titres exactement
       comme le fera l'audio.
    2) On écrit un review file où l'on affiche :
       - phrase par phrase (la liste exacte via "- ")
       - pre / post silence
       - voice speed
       - start/end offset
       - inter-phrase silences (N–1 valeurs pour N phrases)
    L'utilisateur peut ensuite :
      * ajuster Final Translation, Voice Speed, Pre/Post-Silence,
        Start-Offset, End-Offset
      * modifier le nombre de phrases (le parser adaptera N–1 silences).
    """

    translator = GoogleTranslator(source=from_lang, target=to_lang)
    subs = pysrt.open(source_path)

    # 1) Regrouper par phrase (détection ponctuation en fin de sous-titre)
    sentence_end = re.compile(r"[.!?]\s*$")
    groups, cur = [], []
    for sub in subs:
        cur.append(sub)
        if sentence_end.search(sub.text):
            groups.append(cur); cur = []
    if cur:
        groups.append(cur)

    # 2) Éclatement des groupes trop longs
    def split_long(gs, max_s):
        out = []
        for g in gs:
            start, end = g[0].start.ordinal/1000, g[-1].end.ordinal/1000
            if end - start <= max_s:
                out.append(g)
            else:
                mid = len(g)//2
                out.extend([g[:mid], g[mid:]])
        return out
    groups = split_long(groups, max_group_duration_secs)

    # 3) Forcer ponctuation de fin de groupe
    i = 0
    safe_punct = re.compile(r"[.!?,;:]$")
    while i < len(groups):
        if not safe_punct.search(groups[i][-1].text.strip()):
            if i+1 < len(groups):
                groups[i] += groups.pop(i+1)
                continue
            else:
                groups[i][-1].text += "."
        i += 1

    # 4) Écriture du fichier de review
    with open(review_file_path, "w", encoding="utf-8") as f:
        f.write("Translation Review File\n")
        f.write("Le découpage en phrases ci-dessous est **celui utilisé** en TTS.\n")
        f.write("Ajustez si besoin **Final Translation**, **Voice Speed**, **Pre/Post-Silence**, "
                "**Start-Offset:**, **End-Offset:**, **Inter-Phrase-Silence:**\n")
        f.write("mais **ne touchez pas** la liste des phrases (lignes qui commencent par '- ').\n")
        f.write("----------------------------------------------------------------\n\n")

        for idx, group in enumerate(groups, 1):
            # Calcul des temps
            start_s = group[0].start.ordinal / 1000
            end_s   = group[-1].end.ordinal   / 1000

            # Texte original + auto-traduit
            original = " ".join(s.text for s in group)
            auto_tr  = translator.translate(text=original)

            # Découpage initial en phrases (on ne réécrit pas ces lignes, mais on calcule N)
            phrases = re.split(r"(?<=[.!?])\s+(?=[A-ZÀÂÉÈÊËÎÏÔŒÙÛÜ])", auto_tr)
            phrases = [p.strip() for p in phrases if p.strip()]

            # Préparer la liste par défaut des silences internes = N–1 × 0 ms
            n = len(phrases)
            inter_silences = ",".join("0" for _ in range(max(0, n-1)))

            # Valeurs par défaut
            pre_ms, post_ms = 0, 0
            start_offset, end_offset = 0, 0
            voice_speed = "+0%"

            # Écriture du segment
            f.write(f"Segment {idx} (start: {start_s:.2f}s, end: {end_s:.2f}s)\n")
            f.write(f"**Original:** {original}\n")
            f.write(f"**Auto Translated:** {auto_tr}\n")
            f.write(f"**Final Translation:** {auto_tr}\n")
            f.write(f"**Voice Speed:** {voice_speed}\n")
            f.write(f"**Pre-Silence:** {pre_ms}\n")
            f.write(f"**Post-Silence:** {post_ms}\n")
            f.write(f"**Start-Offset:** {start_offset}\n")
            f.write(f"**End-Offset:** {end_offset}\n")
            f.write(f"**Inter-Phrase-Silence:** {inter_silences}\n")

            # Liste des phrases pour que l'utilisateur puisse la modifier
            for ph in phrases:
                f.write(f"- {ph}\n")

            f.write("\n----------------------------------------------------------------\n\n")

    print(f"✅ Review file créé : {review_file_path} ({len(groups)} segments)")
    input("Tapez 'Y' pour continuer…")



def parse_review_fileOLDA(review_file_path):
    """
    Lit le review file écrit ci-dessus et
    renvoie une liste de dicts avec :
      - start_s, end_s, final_translation, voice_speed
      - pre_silence, post_silence, phrases (list)
    """
    text = open(review_file_path, encoding="utf-8").read()
    blocks = [b.strip() for b in re.split(r"(?m)^-{3,}\s*$", text) if b.strip()]
    segments = []
    header = re.compile(r"Segment\s+\d+\s+\(start:\s*([0-9.]+)s,\s*end:\s*([0-9.]+)s\)")
    for blk in blocks:
        m = header.search(blk)
        if not m or blk.startswith("Translation Review File"): continue
        start_s, end_s = float(m.group(1)), float(m.group(2))

        ft, vs, pre, post = None, "+0%", 0.0, 0.0
        orig = None
        start_offset = 0 
        phrases = []
        for line in blk.splitlines():
            line = line.strip()
            if line.startswith("**Final Translation:**"):
                ft = line.split("**Final Translation:**",1)[1].strip()
            elif line.startswith("**Voice Speed:**"):
                vs = line.split("**Voice Speed:**",1)[1].strip()
            elif line.startswith("**Pre-Silence:**"):
                pre = float(line.split("**Pre-Silence:**",1)[1])
            elif line.startswith("**Post-Silence:**"):
                post = float(line.split("**Post-Silence:**",1)[1])
            elif line.startswith("**Start-Offset:**"):
                 # offset en millisecondes à ajouter au start
                start_offset = int(line.split("**Start-Offset:**",1)[1])
            elif line.startswith("**End-Offset:**"):
                end_offset = int(line.split("**End-Offset:**",1)[1])                
            elif line.startswith("- "):
                phrases.append(line[2:].strip())
            elif line.startswith("**Original:**"):
                orig = line.split("**Original:**",1)[1].strip()

        segments.append({
            "start_s":           start_s,
            "end_s":             end_s,
            "original":          orig,
            "final_translation": ft or orig,
            "voice_speed":       vs,
            "pre_silence":       pre,
            "post_silence":      post,
            "start_offset_ms":   start_offset,
            "end_offset_ms":     end_offset,
            "phrases":           phrases
        })

    print(f"✅ Parsed {len(segments)} segments depuis le review file.")
    return segments

# ============== TTS Functions: Edge TTS Only with Debug Logging ==============
async def robust_synthesize_phrase(
    phrase: str,
    output_path: str,
    voice: str = "fr-FR-DeniseNeural",
    rate: str = "+0%",
    max_retries: int = 10
):
    """
    Synthesize speech using Edge TTS with robust retry logic.
    Detailed debug messages are printed for each attempt.
    """
    for attempt in range(1, max_retries+1):
        try:
            async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=30)) as session:
                communicate = edge_tts.Communicate(
                    text=phrase,
                    voice=voice,
                    rate=rate
                )
                print(f"[Debug] Attempt {attempt}/{max_retries}: Synthesizing phrase: '{phrase[:30]}…'")
                await communicate.save(output_path)
                print(f"[Debug] Phrase synthesized successfully to {output_path}")
                return
        except Exception as e:
            wait_time = 2 ** attempt + random.random()
            print(f"[Error] Attempt {attempt}/{max_retries} failed for phrase: '{phrase[:30]}…'. Exception: {e}")
            if attempt < max_retries:
                print(f"[Debug] Retrying in {wait_time:.1f}s…")
                await asyncio.sleep(wait_time)
    raise RuntimeError(f"Failed to synthesize phrase after {max_retries} attempts: {phrase[:30]}…")

async def synthesize_phrase_edge_hybrid(
    phrase: str,
    output_path: str,
    voice: str = "fr-FR-DeniseNeural",
    rate: str = "+0%"
):
    # Pour compatibilité, on redirige vers le robust_synthesize
    await robust_synthesize_phrase(phrase, output_path, voice, rate)


def merge_short_phrases(phrases, weights, min_chars=40, max_chars=None):
    new_ph, new_wt = [], []
    buf_ph, buf_wt = "", 0.0
    for ph, wt in zip(phrases, weights):
        if not buf_ph:
            buf_ph, buf_wt = ph, wt
        else:
            if len(buf_ph) < min_chars or len(ph) < min_chars:
                cand = buf_ph + " " + ph
                # si pas de max_chars défini, on fusionne sans condition
                cond = True if max_chars is None else (len(cand) <= max_chars)
                if cond:
                    buf_ph = cand
                    buf_wt += wt
                else:
                    new_ph.append(buf_ph)
                    new_wt.append(buf_wt)
                    buf_ph, buf_wt = ph, wt
            else:
                new_ph.append(buf_ph)
                new_wt.append(buf_wt)
                buf_ph, buf_wt = ph, wt
    if buf_ph:
        new_ph.append(buf_ph)
        new_wt.append(buf_wt)
    return new_ph, new_wt



def split_long_phrasesaaa(phrases, max_chars=80):
    new = []
    for p in phrases:
        if len(p) > max_chars:
            # on découpe au premier “,” ou “ et ” qu’on trouve
            parts = re.split(r",\s+| et ", p, maxsplit=1)
            new.extend([parts[0].strip(), parts[1].strip()] if len(parts)==2 else [p])
        else:
            new.append(p)
    return new

def parse_review_file(review_file_path):
    """
    Lit le review file et renvoie une liste de dicts avec :
      - start_s, end_s, original, final_translation, voice_speed
      - pre_silence, post_silence, start_offset_ms, end_offset_ms
      - phrases (list de phrases) et inter_phrase_silences (liste de silences internes)
    """
    text = open(review_file_path, encoding="utf-8").read()
    blocks = [b.strip() for b in re.split(r"(?m)^-{3,}\s*$", text) if b.strip()]
    segments = []
    header = re.compile(r"Segment\s+\d+\s+\(start:\s*([0-9.]+)s,\s*end:\s*([0-9.]+)s\)")

    for blk in blocks:
        m = header.search(blk)
        if not m or blk.startswith("Translation Review File"): 
            continue
        start_s, end_s = float(m.group(1)), float(m.group(2))

        # valeurs par défaut
        ft, vs = None, "+0%"
        pre, post = 0.0, 0.0
        soffs, eoffs = 0, 0
        phrases = []
        inter = []

        for line in blk.splitlines():
            line = line.strip()
            if line.startswith("**Final Translation:**"):
                ft = line.split("**Final Translation:**",1)[1].strip()
            elif line.startswith("**Voice Speed:**"):
                vs = line.split("**Voice Speed:**",1)[1].strip()
            elif line.startswith("**Pre-Silence:**"):
                pre = float(line.split("**Pre-Silence:**",1)[1])
            elif line.startswith("**Post-Silence:**"):
                post = float(line.split("**Post-Silence:**",1)[1])
            elif line.startswith("**Start-Offset:**"):
                soffs = int(line.split("**Start-Offset:**",1)[1])
            elif line.startswith("**End-Offset:**"):
                eoffs = int(line.split("**End-Offset:**",1)[1])
            elif line.startswith("**Inter-Phrase-Silence:**"):
                parts = line.split("**Inter-Phrase-Silence:**",1)[1].strip()
                if parts:
                    inter = [max(0, int(x)) for x in parts.split(",")]
            elif line.startswith("- "):
                phrases.append(line[2:].strip())

        segments.append({
            "start_s": start_s,
            "end_s": end_s,
            "final_translation": ft or "",
            "voice_speed": vs,
            "pre_silence": pre,
            "post_silence": post,
            "start_offset_ms": soffs,
            "end_offset_ms": eoffs,
            "phrases": phrases,
            "inter_phrase_silences": inter
        })

    print(f"✅ Parsed {len(segments)} segments depuis le review file.")
    return segments


async def async_generate_translated_audio_with_sync_using_review(
    subtitle_source_path, output_audio_path,
    debug_log_path, review_file_path
):
    # 1) Création du review file uniquement s'il n'existe pas encore
    if not os.path.exists(review_file_path):
        generate_translation_review_file(
            subtitle_source_path,
            review_file_path,
            max_group_duration_secs=25.0
        )
    else:
        print("✅ Review file déjà présent, on conserve vos offsets personnalisés.")


    # 2) Lecture du review file enrichi
    segments = parse_review_file(review_file_path)

    combined = AudioSegment.silent(duration=0)
    debug    = []

    for idx, seg in enumerate(segments):
        start_s = seg["start_s"]
        end_s   = seg["end_s"]
        total_ms = int((end_s - start_s) * 1000)

        # Récupération des settings
        text    = seg["final_translation"]
        rate    = seg["voice_speed"]
        pre_ms  = seg["pre_silence"]
        post_ms = seg["post_silence"]
        soff    = seg.get("start_offset_ms", 0)
        eoff    = seg.get("end_offset_ms",   0)

        # Phrase splitting & TTS
        phrases = split_french_phrases(text)
        weights = calculate_phrase_weights(text, phrases)
        phrases, weights = merge_short_phrases(phrases, weights, min_chars=40)

        # Budget pour TTS seule
        content_ms = max(0, total_ms - pre_ms - post_ms)

        # Synthèse phrase par phrase
        phrase_audios = []
        for i, ph in enumerate(phrases):
            dur_s  = (content_ms * weights[i]) / 1000.0
            tmp_mp3 = os.path.join(tempfile.gettempdir(), f"tmp_{idx}_{i}.mp3")
            await robust_synthesize_phrase(ph, tmp_mp3, rate=rate)
            aud = AudioSegment.from_mp3(tmp_mp3)
            os.remove(tmp_mp3)
            aud = adjust_audio_duration(aud, dur_s)
            phrase_audios.append(aud)

        # Ajustement interne par override ou répartition égale
        n_inter = max(0, len(phrases) - 1)
        if seg.get("inter_phrase_silences"):
            inter_applied = seg["inter_phrase_silences"]
            # adapter la longueur
            if len(inter_applied) < n_inter:
                inter_applied += [0] * (n_inter - len(inter_applied))
            elif len(inter_applied) > n_inter:
                inter_applied = inter_applied[:n_inter]
        else:
            available = content_ms - sum(a.duration_seconds * 1000 for a in phrase_audios)
            if n_inter > 0 and available > 0:
                sil_ms = available // n_inter
                inter_applied = [sil_ms] * n_inter
            else:
                inter_applied = [0] * n_inter

        # Reconstruction du segment audio
        seg_audio = AudioSegment.silent(duration=pre_ms)
        for i, aud in enumerate(phrase_audios):
            seg_audio += aud
            if i < len(inter_applied):
                seg_audio += AudioSegment.silent(duration=inter_applied[i])
        seg_audio += AudioSegment.silent(duration=post_ms)

        # Application offset de fin
        if eoff > 0:
            seg_audio += AudioSegment.silent(duration=eoff)
        elif eoff < 0:
            seg_audio = seg_audio[:eoff]

        # Debug timing (prise en compte de soff)
        nons2 = detect_nonsilent(seg_audio, min_silence_len=1,
                                 silence_thresh=seg_audio.dBFS - 16)
        start_a = nons2[0][0] if nons2 else pre_ms
        end_a   = nons2[-1][1] if nons2 else (total_ms - post_ms)
        abs_s_a = int(start_s * 1000) + start_a
        abs_e_a = int(start_s * 1000) + end_a
        abs_s_v = int(start_s * 1000) + soff
        #abs_e_v = int(end_s   * 1000)
        abs_e_v = int(end_s   * 1000) + eoff
        decal_start = abs_s_a - abs_s_v
        decal_end   = abs_e_a - abs_e_v

        # Mise sur timeline avec offset de start
        start_ms = int(start_s * 1000) + soff
        if len(combined) < start_ms:
            combined += AudioSegment.silent(duration=(start_ms - len(combined)))
        elif len(combined) > start_ms and soff < 0:
            combined = combined[:start_ms]
        combined += seg_audio

        # Enregistrement debug
        # debug.append(
        #     f"Segment {idx+1} ({start_s:.2f}-{end_s:.2f}s): pre={pre_ms}ms, post={post_ms}ms, "
        #     f"speed={rate}, inter={inter_applied}, "
        #     f"décal_start={decal_start}ms, décal_end={decal_end}ms\n"
        # )

        debug.append(
                   f"Segment {idx+1} ({start_s:.2f}-{end_s:.2f}s): "
                   f"pre={pre_ms}ms, post={post_ms}ms, speed={rate}, "
                   f"inter={inter_applied}, "
                   f"phrases={phrases}, "
                   f"décal_start={decal_start}ms, décal_end={decal_end}ms\n"
                )



    # Export debug & wav
    with open(debug_log_path, "w", encoding="utf-8") as df:
        df.write("Translation Debug Log\n\n")
        df.writelines(debug)
    combined.export(output_audio_path, format="wav")

    return output_audio_path


# ============== Merge Audio and Video Function ==============
def merge_audio_video():
    video = VideoFileClip(input_video)
    audio = AudioFileClip(translated_audio)
    if audio.duration < video.duration:
        extra_silence = AudioSegment.silent(duration=(video.duration - audio.duration) * 1000)
        audio_path_temp = os.path.join(output_dir, "temp_full_audio.wav")
        audio_seg = AudioSegment.from_file(translated_audio, format="wav")
        full_audio = audio_seg + extra_silence
        full_audio.export(audio_path_temp, format="wav")
        audio = AudioFileClip(audio_path_temp)
    video = video.set_audio(audio)
    video.write_videofile(
        output_video,
        codec="libx264",
        audio_codec="aac",
        temp_audiofile="temp-audio.m4a",
        remove_temp=True,
        threads=4
    )

# ============== Main Asynchronous Flow ==============
async def async_main():
    print("Extracting audio...")
    audio_path = extract_audio()
    print("Transcribing audio...")
    language, segments = transcribe(audio_path)
    print("Generating English subtitles...")
    generate_subtitle_file(segments, subtitle_file_en)
    print("Generating French audio with synchronization and manual overrides...")
    await async_generate_translated_audio_with_sync_using_review(subtitle_file_en, translated_audio, debug_log_file, review_file)
    print("Merging audio and video...")
    merge_audio_video()
    print(f"Process completed! Output video: {output_video}")

if __name__ == "__main__":
    asyncio.run(async_main())




✅ ffmpeg found at: C:\ffmpeg\bin\ffmpeg.EXE
Extracting audio...
Transcribing audio...
Detected language: en
Generating English subtitles...
Generating French audio with synchronization and manual overrides...
✅ Review file créé : 4.2.1.4_Réalisation des suivis financiers_run_20250507_102204\translation_review.txt (27 segments)
✅ Parsed 26 segments depuis le review file.
[Debug] Attempt 1/10: Synthesizing phrase: 'Dans cette démo, nous allons s…'
[Debug] Phrase synthesized successfully to C:\Users\061181~1\AppData\Local\Temp\tmp_0_0.mp3
[Debug] Attempt 1/10: Synthesizing phrase: 'Nous expliquerons les écarts e…'
[Debug] Phrase synthesized successfully to C:\Users\061181~1\AppData\Local\Temp\tmp_1_0.mp3
[Debug] Attempt 1/10: Synthesizing phrase: 'Nous consoliderons les prévisi…'
[Error] Attempt 1/10 failed for phrase: 'Nous consoliderons les prévisi…'. Exception: Cannot connect to host speech.platform.bing.com:443 ssl:<ssl.SSLContext object at 0x0000027F8D885130> [Une connexion existante

                                                                      

MoviePy - Done.
Moviepy - Writing video 4.2.1.4_Réalisation des suivis financiers_run_20250507_102204\4.2.1.4_Réalisation des suivis financiers-french.mp4



                                                                   

Moviepy - Done !
Moviepy - video ready 4.2.1.4_Réalisation des suivis financiers_run_20250507_102204\4.2.1.4_Réalisation des suivis financiers-french.mp4
Process completed! Output video: 4.2.1.4_Réalisation des suivis financiers_run_20250507_102204\4.2.1.4_Réalisation des suivis financiers-french.mp4


425 - 07may2025 - LAST VERSION

In [None]:
import os
import re
import ffmpeg
import pysrt
import time
from deep_translator import GoogleTranslator
from pydub import AudioSegment
from moviepy.editor import VideoFileClip, AudioFileClip
from faster_whisper import WhisperModel
from shutil import which
import nest_asyncio
from datetime import datetime
import tempfile
import asyncio
import edge_tts
import aiohttp
import ssl
import random
from pydub.silence import detect_nonsilent

nest_asyncio.apply()

# ----- Configuration -----
ffmpeg_path = which("ffmpeg")
if not ffmpeg_path:
    raise RuntimeError("ffmpeg not found. Please install ffmpeg first.")
print(f"✅ ffmpeg found at: {ffmpeg_path}")

input_video = "to translate/4.2.5_Intégration des données source.mp4"
base_name = os.path.splitext(os.path.basename(input_video))[0]
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_dir = f"{base_name}_run_{timestamp}"
model_size = "small"
update_existing = True

# For this version we rely on cloud-based Edge TTS.
USE_EDGE_TTS = True

# Files and paths
os.makedirs(output_dir, exist_ok=True)
input_video_name = os.path.splitext(os.path.basename(input_video))[0]
extracted_audio = os.path.join(output_dir, f"{input_video_name}-extracted-audio.wav")
subtitle_file_en = os.path.join(output_dir, f"{input_video_name}-english.srt")
translated_audio = os.path.join(output_dir, f"{input_video_name}-french.wav")
output_video = os.path.join(output_dir, f"{input_video_name}-french.mp4")
review_file = os.path.join(output_dir, "translation_review.txt")
debug_log_file = os.path.join(output_dir, "translation_debug_log.txt")

# ============== Helper Functions (extract_audio, transcribe, etc.) ==============
def extract_audio():
    try:
        (ffmpeg
         .input(input_video)
         .output(extracted_audio, ac=1, ar=16000)
         .overwrite_output()
         .run(capture_stdout=True, capture_stderr=True)
        )
        return extracted_audio
    except ffmpeg.Error as e:
        print("STDOUT:", e.stdout.decode("utf8"))
        print("STDERR:", e.stderr.decode("utf8"))
        raise

def transcribe(audio_path):
    model = WhisperModel(model_size, device="cpu", compute_type="int8")
    segments, info = model.transcribe(audio_path, beam_size=5)
    language = info.language
    print(f"Detected language: {language}")
    transcript_segments = []
    for segment in segments:
        transcript_segments.append({
            "start": segment.start,
            "end": segment.end,
            "text": segment.text.strip()
        })
    return language, transcript_segments

def time_to_subrip(seconds: float) -> pysrt.SubRipTime:
    hours = int(seconds // 3600)
    seconds %= 3600
    minutes = int(seconds // 60)
    seconds %= 60
    milliseconds = int((seconds - int(seconds)) * 1000)
    return pysrt.SubRipTime(hours=hours, minutes=minutes, seconds=int(seconds), milliseconds=milliseconds)

def generate_subtitle_file(segments, output_path):
    subs = pysrt.SubRipFile()
    for i, segment in enumerate(segments, 1):
        sub = pysrt.SubRipItem(
            index=i,
            start=time_to_subrip(segment["start"]),
            end=time_to_subrip(segment["end"]),
            text=segment["text"]
        )
        subs.append(sub)
    subs.save(output_path, encoding="utf-8")
    return output_path

# ============== Translation & Review Functions ==============

def split_long_groups(groups, max_group_duration_secs):
    """
    For each group (list of SubRipItems), if its duration > max_group_duration_secs,
    split it at the *last* subtitle in that group whose text ends in punctuation
    (.,!? or comma) before the duration threshold.
    Falls back to a simple split if no such “safe” break exists.
    """
    new_groups = []
    for group in groups:
        start_s = group[0].start.ordinal / 1000
        end_s   = group[-1].end.ordinal   / 1000
        total   = end_s - start_s

        # if already shorter than threshold, keep it
        if total <= max_group_duration_secs:
            new_groups.append(group)
            continue

        # otherwise walk through, tracking safe_breaks
        temp = []
        temp_start = start_s
        last_safe_idx = None
        for idx, item in enumerate(group):
            temp.append(item)
            # mark this idx if it ends in punctuation or comma
            if re.search(r"[.,!?]$", item.text.strip()):
                last_safe_idx = idx

            current_end = item.end.ordinal / 1000
            if (current_end - temp_start) >= max_group_duration_secs:
                # if we have a safe break before or at idx, split there
                if last_safe_idx is not None:
                    # emit group up through last_safe_idx
                    safe_group = temp[: last_safe_idx+1 ]
                    new_groups.append(safe_group)
                    # restart temp from the items after safe_idx
                    temp = temp[last_safe_idx+1 :]
                    temp_start = temp[0].start.ordinal / 1000 if temp else current_end
                else:
                    # no safe break—just split at current idx
                    new_groups.append(temp)
                    temp = []
                    temp_start = current_end

                # reset safe marker
                last_safe_idx = None

        # anything left over
        if temp:
            new_groups.append(temp)

    return new_groups


def validate_audio_duration(original_segment, translated_audio):
    """Compares original video duration with generated audio"""
    video_dur = original_segment['end'] - original_segment['start']
    audio_dur = translated_audio.duration_seconds
    
    if abs(video_dur - audio_dur) > 0.5:  # 500ms tolerance
        compensation = (video_dur - audio_dur) * 1000  # ms
        if compensation > 0:
            return AudioSegment.silent(duration=compensation)
        else:
            return translated_audio[:int(compensation*1000)]  # ms to samples
    return translated_audio

def generate_phrase_audio(text, voice_speed):
    raw_audio = edge_tts.Communicate(text).audio
    processed = apply_speed_adjustment(raw_audio, voice_speed)
    
    # Detect and preserve natural phrase endings
    non_silent = detect_nonsilent(processed, min_silence_len=50, silence_thresh=-40)
    if non_silent:
        end_pad = 150  # Minimum ending padding
        new_end = max(non_silent[-1][1] + end_pad, len(processed))
        return processed[:new_end]
    return processed


def apply_speed_adjustment(raw_audio, speed_setting):
    speed_factor = 1 + (int(speed_setting.strip('%')) / 100)
    sped_up = raw_audio.speedup(
        playback_speed=speed_factor,
        chunk_size=150,
        crossfade=25
    )
    
    # Calculate duration difference
    original_dur = len(raw_audio)
    new_dur = len(sped_up)
    compensation = original_dur - new_dur
    
    if compensation > 0:
        return sped_up + AudioSegment.silent(duration=compensation)
    return sped_up


def parse_review_overrides(review_file_path):
    text   = open(review_file_path, "r", encoding="utf-8").read()
    # split on any line of 3+ hyphens
    blocks = re.split(r"(?m)^-{3,}\s*$", text)

    overrides = []
    for idx, blk in enumerate(blocks, start=1):
        blk = blk.strip()
        if not blk or blk.startswith("Translation Review File"):
            continue

        # defaults
        ft       = None
        vs       = "+0%"
        pre_ms   = 0.0
        post_ms  = 100.0
        inter_ms = []

        for line in blk.splitlines():
            if line.startswith("**Final Translation:**"):
                ft = line.split("**Final Translation:**",1)[1].strip()
            elif line.startswith("**Voice Speed:**"):
                vs = line.split("**Voice Speed:**",1)[1].strip()
            elif line.startswith("**Pre-Silence:**"):
                try: pre_ms = float(line.split("**Pre-Silence:**",1)[1])
                except: print(f"[Warn] Seg {idx}: bad Pre-Silence")
            elif line.startswith("**Post-Silence:**"):
                try: post_ms = float(line.split("**Post-Silence:**",1)[1])
                except: print(f"[Warn] Seg {idx}: bad Post-Silence")
            elif line.startswith("**Inter-Phrase-Silence:**"):
                            parts = line.split("**Inter-Phrase-Silence:**",1)[1].strip()
                            if parts:
                                try:
                                    # Force negative values to 0 and limit to 5000ms max
                                    raw = [float(x) for x in parts.split(",")]
                                    inter_ms = [ max(0, min(x, 5000)) for x in raw ]
                                except ValueError:
                                    print(f"[Warning] Segment {idx}: invalid Inter-Phrase-Silence list")
                                    inter_ms = []

        if ft is None:
            print(f"[Warn] Seg {idx}: no Final Translation—will use source text.")

        overrides.append({
            "final_translation":      ft,
            "voice_speed":            vs,
            "pre_silence":            pre_ms,
            "post_silence":           post_ms,
            "inter_phrase_silences":  inter_ms
        })

    print("Parsed review overrides:")
    for i,o in enumerate(overrides,1):
        print(f"  Seg {i}: final={'OK' if o['final_translation'] else '<none>'}, "
              f"speed={o['voice_speed']}, pre={o['pre_silence']}ms, post={o['post_silence']}ms, "
              f"inter={o['inter_phrase_silences']}")
    return overrides


def enforce_punctuation_boundaries(groups):
    """Ensure groups end with proper punctuation"""
    i = 0
    safe_punctuation = r"[.!?,;:]$"
    while i < len(groups):
        last_text = groups[i][-1].text.strip()
        if not re.search(safe_punctuation, last_text):
            if i+1 < len(groups):
                groups[i] += groups.pop(i+1)
            else:  # Add artificial pause for final group
                groups[i][-1].text += "."
        else:
            i += 1
    return groups



# ============== Audio Synchronization Functions ==============


def adjust_audio_duration(audio: AudioSegment, target_secs: float) -> AudioSegment:
    """
     Ajuste TTS clip pour qu'il tienne **exactement** dans target_secs :
     - Si l'audio est trop long, on le **tronque**.  
     - S'il est trop court, on ajoute du silence.  
    """
    target_ms = int(target_secs * 1000)
    curr_ms   = len(audio)
    if curr_ms > target_ms:
        # on coupe précisément à la durée allouée
        return audio[:target_ms]
    elif curr_ms < target_ms:
            # on complète par du silence
        return audio + AudioSegment.silent(duration=(target_ms - curr_ms))
    return audio


# ============== French Phrase Alignment Functions ==============
def split_french_phrases(text):
    phrases = re.split(r"(?<=[.!?])\s+(?=[A-Z])", text)
    return [phrase.strip() for phrase in phrases if phrase.strip()]

def calculate_phrase_weights(original_text, translated_phrases):
    fr_phrase_word_counts = [len(phrase.split()) for phrase in translated_phrases]
    total_fr_words = sum(fr_phrase_word_counts)
    if total_fr_words == 0:
        return [1 / len(translated_phrases)] * len(translated_phrases)
    return [count / total_fr_words for count in fr_phrase_word_counts]

# ============== TTS Functions: Edge TTS Only with Debug Logging ==============


def change_playback_speed(sound, speed=1.0):
    new_frame_rate = int(sound.frame_rate * speed)
    altered_sound = sound._spawn(sound.raw_data, overrides={"frame_rate": new_frame_rate})
    return altered_sound.set_frame_rate(sound.frame_rate)

# ============== Updated Async Audio Generation Function ==============


def validate_audio_timing(original_duration, translated_segment):
    total_audio_time = (
        translated_segment["pre_silence"] 
        + sum(translated_segment["inter_phrase_silences"]) 
        + translated_segment["post_silence"] 
        + (translated_segment["audio"].duration_seconds * 1000)
    )
    
    if total_audio_time > original_duration * 1000:
        raise ValueError(f"Audio overflow: {total_audio_time}ms vs {original_duration*1000}ms")
    elif total_audio_time < original_duration * 1000 * 0.95:
        print(f"Warning: Audio underflow by {original_duration*1000 - total_audio_time}ms")



def adjust_review_file_based_on_debug_log(debug_log_path: str, review_file_path: str):
    """
    Pour chaque segment i :
      - Si décal_end est négatif de D ms, on ajoute D ms à post_silence
      - Si décal_start est positif de D ms, on ajoute D ms à pre_silence
    On réécrit ensuite le review_file avec ces nouvelles valeurs.
    """
    # 1) Parse le debug log
    decalages = {}  # idx -> (d_start, d_end)
    pattern = re.compile(r"Segment (\d+).*décal_start=(-?\d+)ms, décal_end=(-?\d+)ms")
    for line in open(debug_log_path, encoding="utf-8"):
        m = pattern.search(line)
        if m:
            idx = int(m.group(1))
            d_start, d_end = int(m.group(2)), int(m.group(3))
            decalages[idx] = (d_start, d_end)

    # 2) Lit tout le review file en mémoire
    text = open(review_file_path, encoding="utf-8").read()
    blocks = re.split(r"(?m)^-{3,}\s*$", text)

    out = []
    for blk in blocks:
        if not blk.strip() or blk.startswith("Translation Review File"):
            out.append(blk)
            continue

        # trouve le segment
        header = re.search(r"Segment\s+(\d+)\s+\(", blk)
        if not header:
            out.append(blk); continue
        idx = int(header.group(1))
        d_start, d_end = decalages.get(idx, (0, 0))

        # remplace les lignes Pre-Silence / Post-Silence
        def repl_pre(m):
            old = float(m.group(1))
            new = max(0.0, old + d_start)
            return f"**Pre-Silence:** {new:.0f}"
        blk = re.sub(r"\*\*Pre-Silence:\*\*\s*([0-9.]+)", repl_pre, blk)

        def repl_post(m):
            old = float(m.group(1))
            # si d_end<0, audio est trop long => il a fallu tronquer => on ne réduit pas post
            # si d_end>0, audio trop court => on ajoute
            new = max(0.0, old + d_end)
            return f"**Post-Silence:** {new:.0f}"
        blk = re.sub(r"\*\*Post-Silence:\*\*\s*([0-9.]+)", repl_post, blk)

        out.append(blk)

    # 3) Réécriture du fichier
    with open(review_file_path, "w", encoding="utf-8") as f:
        f.write("\n---\n".join(out))
    print(f"✅ Review file ajusté selon {debug_log_path}")


def generate_translation_review_file(
    source_path, review_file_path,
    from_lang="en", to_lang="fr",
    max_group_duration_secs: float = 25.0
):
    """
    1) On regroupe et on split/merge les sous-titres exactement
       comme le fera l'audio.
    2) On écrit un review file où l'on affiche :
       - phrase par phrase (la liste exacte via "- ")
       - pre / post silence
       - voice speed
       - start/end offset
       - inter-phrase silences (N–1 valeurs pour N phrases)
    L'utilisateur peut ensuite :
      * ajuster Final Translation, Voice Speed, Pre/Post-Silence,
        Start-Offset, End-Offset
      * modifier le nombre de phrases (le parser adaptera N–1 silences).
    """

    translator = GoogleTranslator(source=from_lang, target=to_lang)
    subs = pysrt.open(source_path)

    # 1) Regrouper par phrase (détection ponctuation en fin de sous-titre)
    sentence_end = re.compile(r"[.!?]\s*$")
    groups, cur = [], []
    for sub in subs:
        cur.append(sub)
        if sentence_end.search(sub.text):
            groups.append(cur); cur = []
    if cur:
        groups.append(cur)

    # 2) Éclatement des groupes trop longs
    def split_long(gs, max_s):
        out = []
        for g in gs:
            start, end = g[0].start.ordinal/1000, g[-1].end.ordinal/1000
            if end - start <= max_s:
                out.append(g)
            else:
                mid = len(g)//2
                out.extend([g[:mid], g[mid:]])
        return out
    groups = split_long(groups, max_group_duration_secs)

    # 3) Forcer ponctuation de fin de groupe
    i = 0
    safe_punct = re.compile(r"[.!?,;:]$")
    while i < len(groups):
        if not safe_punct.search(groups[i][-1].text.strip()):
            if i+1 < len(groups):
                groups[i] += groups.pop(i+1)
                continue
            else:
                groups[i][-1].text += "."
        i += 1

    # 4) Écriture du fichier de review
    with open(review_file_path, "w", encoding="utf-8") as f:
        f.write("Translation Review File\n")
        f.write("Le découpage en phrases ci-dessous est **celui utilisé** en TTS.\n")
        f.write("Ajustez si besoin **Final Translation**, **Voice Speed**, **Pre/Post-Silence**, "
                "**Start-Offset:**, **End-Offset:**, **Inter-Phrase-Silence:**\n")
        f.write("mais **ne touchez pas** la liste des phrases (lignes qui commencent par '- ').\n")
        f.write("----------------------------------------------------------------\n\n")

        for idx, group in enumerate(groups, 1):
            # Calcul des temps
            start_s = group[0].start.ordinal / 1000
            end_s   = group[-1].end.ordinal   / 1000

            # Texte original + auto-traduit
            original = " ".join(s.text for s in group)
            auto_tr  = translator.translate(text=original)

            # Découpage initial en phrases (on ne réécrit pas ces lignes, mais on calcule N)
            phrases = re.split(r"(?<=[.!?])\s+(?=[A-ZÀÂÉÈÊËÎÏÔŒÙÛÜ])", auto_tr)
            phrases = [p.strip() for p in phrases if p.strip()]

            # Préparer la liste par défaut des silences internes = N–1 × 0 ms
            n = len(phrases)
            inter_silences = ",".join("0" for _ in range(max(0, n-1)))

            # Valeurs par défaut
            pre_ms, post_ms = 0, 100
            start_offset, end_offset = 0, 0
            voice_speed = "+0%"

            # Écriture du segment
            f.write(f"Segment {idx} (start: {start_s:.2f}s, end: {end_s:.2f}s)\n")
            f.write(f"**Original:** {original}\n")
            f.write(f"**Auto Translated:** {auto_tr}\n")
            f.write(f"**Final Translation:** {auto_tr}\n")
            f.write(f"**Voice Speed:** {voice_speed}\n")
            f.write(f"**Pre-Silence:** {pre_ms}\n")
            f.write(f"**Post-Silence:** {post_ms}\n")
            f.write(f"**Start-Offset:** {start_offset}\n")
            f.write(f"**End-Offset:** {end_offset}\n")
            f.write(f"**Inter-Phrase-Silence:** {inter_silences}\n")

            # Liste des phrases pour que l'utilisateur puisse la modifier
            for ph in phrases:
                f.write(f"- {ph}\n")

            f.write("\n----------------------------------------------------------------\n\n")

    print(f"✅ Review file créé : {review_file_path} ({len(groups)} segments)")
    input("Tapez 'Y' pour continuer…")



def parse_review_fileOLDA(review_file_path):
    """
    Lit le review file écrit ci-dessus et
    renvoie une liste de dicts avec :
      - start_s, end_s, final_translation, voice_speed
      - pre_silence, post_silence, phrases (list)
    """
    text = open(review_file_path, encoding="utf-8").read()
    blocks = [b.strip() for b in re.split(r"(?m)^-{3,}\s*$", text) if b.strip()]
    segments = []
    header = re.compile(r"Segment\s+\d+\s+\(start:\s*([0-9.]+)s,\s*end:\s*([0-9.]+)s\)")
    for blk in blocks:
        m = header.search(blk)
        if not m or blk.startswith("Translation Review File"): continue
        start_s, end_s = float(m.group(1)), float(m.group(2))

        ft, vs, pre, post = None, "+0%", 0.0, 0.0
        orig = None
        start_offset = 0 
        phrases = []
        for line in blk.splitlines():
            line = line.strip()
            if line.startswith("**Final Translation:**"):
                ft = line.split("**Final Translation:**",1)[1].strip()
            elif line.startswith("**Voice Speed:**"):
                vs = line.split("**Voice Speed:**",1)[1].strip()
            elif line.startswith("**Pre-Silence:**"):
                pre = float(line.split("**Pre-Silence:**",1)[1])
            elif line.startswith("**Post-Silence:**"):
                post = float(line.split("**Post-Silence:**",1)[1])
            elif line.startswith("**Start-Offset:**"):
                 # offset en millisecondes à ajouter au start
                start_offset = int(line.split("**Start-Offset:**",1)[1])
            elif line.startswith("**End-Offset:**"):
                end_offset = int(line.split("**End-Offset:**",1)[1])                
            elif line.startswith("- "):
                phrases.append(line[2:].strip())
            elif line.startswith("**Original:**"):
                orig = line.split("**Original:**",1)[1].strip()

        segments.append({
            "start_s":           start_s,
            "end_s":             end_s,
            "original":          orig,
            "final_translation": ft or orig,
            "voice_speed":       vs,
            "pre_silence":       pre,
            "post_silence":      post,
            "start_offset_ms":   start_offset,
            "end_offset_ms":     end_offset,
            "phrases":           phrases
        })

    print(f"✅ Parsed {len(segments)} segments depuis le review file.")
    return segments

# ============== TTS Functions: Edge TTS Only with Debug Logging ==============
async def robust_synthesize_phrase(
    phrase: str,
    output_path: str,
    voice: str = "fr-FR-DeniseNeural",
    rate: str = "+0%",
    max_retries: int = 10
):
    """
    Synthesize speech using Edge TTS with robust retry logic.
    Detailed debug messages are printed for each attempt.
    """
    for attempt in range(1, max_retries+1):
        try:
            async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=30)) as session:
                communicate = edge_tts.Communicate(
                    text=phrase,
                    voice=voice,
                    rate=rate
                )
                print(f"[Debug] Attempt {attempt}/{max_retries}: Synthesizing phrase: '{phrase[:30]}…'")
                await communicate.save(output_path)
                print(f"[Debug] Phrase synthesized successfully to {output_path}")
                return
        except Exception as e:
            wait_time = 2 ** attempt + random.random()
            print(f"[Error] Attempt {attempt}/{max_retries} failed for phrase: '{phrase[:30]}…'. Exception: {e}")
            if attempt < max_retries:
                print(f"[Debug] Retrying in {wait_time:.1f}s…")
                await asyncio.sleep(wait_time)
    raise RuntimeError(f"Failed to synthesize phrase after {max_retries} attempts: {phrase[:30]}…")

async def synthesize_phrase_edge_hybrid(
    phrase: str,
    output_path: str,
    voice: str = "fr-FR-DeniseNeural",
    rate: str = "+0%"
):
    # Pour compatibilité, on redirige vers le robust_synthesize
    await robust_synthesize_phrase(phrase, output_path, voice, rate)


def merge_short_phrases(phrases, weights, min_chars=40, max_chars=None):
    new_ph, new_wt = [], []
    buf_ph, buf_wt = "", 0.0
    for ph, wt in zip(phrases, weights):
        if not buf_ph:
            buf_ph, buf_wt = ph, wt
        else:
            if len(buf_ph) < min_chars or len(ph) < min_chars:
                cand = buf_ph + " " + ph
                # si pas de max_chars défini, on fusionne sans condition
                cond = True if max_chars is None else (len(cand) <= max_chars)
                if cond:
                    buf_ph = cand
                    buf_wt += wt
                else:
                    new_ph.append(buf_ph)
                    new_wt.append(buf_wt)
                    buf_ph, buf_wt = ph, wt
            else:
                new_ph.append(buf_ph)
                new_wt.append(buf_wt)
                buf_ph, buf_wt = ph, wt
    if buf_ph:
        new_ph.append(buf_ph)
        new_wt.append(buf_wt)
    return new_ph, new_wt



def split_long_phrasesaaa(phrases, max_chars=80):
    new = []
    for p in phrases:
        if len(p) > max_chars:
            # on découpe au premier “,” ou “ et ” qu’on trouve
            parts = re.split(r",\s+| et ", p, maxsplit=1)
            new.extend([parts[0].strip(), parts[1].strip()] if len(parts)==2 else [p])
        else:
            new.append(p)
    return new

def parse_review_file(review_file_path):
    """
    Lit le review file et renvoie une liste de dicts avec :
      - start_s, end_s, original, final_translation, voice_speed
      - pre_silence, post_silence, start_offset_ms, end_offset_ms
      - phrases (list de phrases) et inter_phrase_silences (liste de silences internes)
    """
    text = open(review_file_path, encoding="utf-8").read()
    blocks = [b.strip() for b in re.split(r"(?m)^-{3,}\s*$", text) if b.strip()]
    segments = []
    header = re.compile(r"Segment\s+\d+\s+\(start:\s*([0-9.]+)s,\s*end:\s*([0-9.]+)s\)")

    for blk in blocks:
        m = header.search(blk)
        if not m or blk.startswith("Translation Review File"): 
            continue
        start_s, end_s = float(m.group(1)), float(m.group(2))

        # valeurs par défaut
        ft, vs = None, "+0%"
        pre, post = 0.0, 0.0
        soffs, eoffs = 0, 0
        phrases = []
        inter = []

        for line in blk.splitlines():
            line = line.strip()
            if line.startswith("**Final Translation:**"):
                ft = line.split("**Final Translation:**",1)[1].strip()
            elif line.startswith("**Voice Speed:**"):
                vs = line.split("**Voice Speed:**",1)[1].strip()
            elif line.startswith("**Pre-Silence:**"):
                pre = float(line.split("**Pre-Silence:**",1)[1])
            elif line.startswith("**Post-Silence:**"):
                post = float(line.split("**Post-Silence:**",1)[1])
            elif line.startswith("**Start-Offset:**"):
                soffs = int(line.split("**Start-Offset:**",1)[1])
            elif line.startswith("**End-Offset:**"):
                eoffs = int(line.split("**End-Offset:**",1)[1])
            elif line.startswith("**Inter-Phrase-Silence:**"):
                parts = line.split("**Inter-Phrase-Silence:**",1)[1].strip()
                if parts:
                    inter = [max(0, int(x)) for x in parts.split(",")]
            elif line.startswith("- "):
                phrases.append(line[2:].strip())

        segments.append({
            "start_s": start_s,
            "end_s": end_s,
            "final_translation": ft or "",
            "voice_speed": vs,
            "pre_silence": pre,
            "post_silence": post,
            "start_offset_ms": soffs,
            "end_offset_ms": eoffs,
            "phrases": phrases,
            "inter_phrase_silences": inter
        })

    print(f"✅ Parsed {len(segments)} segments depuis le review file.")
    return segments


async def async_generate_translated_audio_with_sync_using_review(
    subtitle_source_path, output_audio_path,
    debug_log_path, review_file_path
):
    # 1) Génération / mise à jour du review file
    generate_translation_review_file(
        subtitle_source_path,
        review_file_path,
        max_group_duration_secs=25.0
    )

    # 2) Lecture du review file enrichi
    segments = parse_review_file(review_file_path)

    combined = AudioSegment.silent(duration=0)
    debug    = []

    for idx, seg in enumerate(segments):
        start_s = seg["start_s"]
        end_s   = seg["end_s"]
        total_ms = int((end_s - start_s) * 1000)

        # Récupération des settings
        text    = seg["final_translation"]
        rate    = seg["voice_speed"]
        pre_ms  = seg["pre_silence"]
        post_ms = seg["post_silence"]
        soff    = seg.get("start_offset_ms", 0)
        eoff    = seg.get("end_offset_ms",   0)

        # Phrase splitting & TTS
        phrases = split_french_phrases(text)
        weights = calculate_phrase_weights(text, phrases)
        phrases, weights = merge_short_phrases(phrases, weights, min_chars=40)

        # Budget pour TTS seule
        content_ms = max(0, total_ms - pre_ms - post_ms)

        # Synthèse phrase par phrase
        phrase_audios = []
        for i, ph in enumerate(phrases):
            dur_s  = (content_ms * weights[i]) / 1000.0
            tmp_mp3 = os.path.join(tempfile.gettempdir(), f"tmp_{idx}_{i}.mp3")
            await robust_synthesize_phrase(ph, tmp_mp3, rate=rate)
            aud = AudioSegment.from_mp3(tmp_mp3)
            os.remove(tmp_mp3)
            aud = adjust_audio_duration(aud, dur_s)
            phrase_audios.append(aud)

        # Ajustement interne par override ou répartition égale
        n_inter = max(0, len(phrases) - 1)
        if seg.get("inter_phrase_silences"):
            inter_applied = seg["inter_phrase_silences"]
            # adapter la longueur
            if len(inter_applied) < n_inter:
                inter_applied += [0] * (n_inter - len(inter_applied))
            elif len(inter_applied) > n_inter:
                inter_applied = inter_applied[:n_inter]
        else:
            available = content_ms - sum(a.duration_seconds * 1000 for a in phrase_audios)
            if n_inter > 0 and available > 0:
                sil_ms = available // n_inter
                inter_applied = [sil_ms] * n_inter
            else:
                inter_applied = [0] * n_inter

        # Reconstruction du segment audio
        seg_audio = AudioSegment.silent(duration=pre_ms)
        for i, aud in enumerate(phrase_audios):
            seg_audio += aud
            if i < len(inter_applied):
                seg_audio += AudioSegment.silent(duration=inter_applied[i])
        seg_audio += AudioSegment.silent(duration=post_ms)

        # Application offset de fin
        if eoff > 0:
            seg_audio += AudioSegment.silent(duration=eoff)
        elif eoff < 0:
            seg_audio = seg_audio[:eoff]

        # Debug timing (prise en compte de soff)
        nons2 = detect_nonsilent(seg_audio, min_silence_len=1,
                                 silence_thresh=seg_audio.dBFS - 16)
        start_a = nons2[0][0] if nons2 else pre_ms
        end_a   = nons2[-1][1] if nons2 else (total_ms - post_ms)
        abs_s_a = int(start_s * 1000) + start_a
        abs_e_a = int(start_s * 1000) + end_a
        abs_s_v = int(start_s * 1000) + soff
        abs_e_v = int(end_s   * 1000)
        decal_start = abs_s_a - abs_s_v
        decal_end   = abs_e_a - abs_e_v

        # Mise sur timeline avec offset de start
        start_ms = int(start_s * 1000) + soff
        if len(combined) < start_ms:
            combined += AudioSegment.silent(duration=(start_ms - len(combined)))
        elif len(combined) > start_ms and soff < 0:
            combined = combined[:start_ms]
        combined += seg_audio

        # Enregistrement debug
        # debug.append(
        #     f"Segment {idx+1} ({start_s:.2f}-{end_s:.2f}s): pre={pre_ms}ms, post={post_ms}ms, "
        #     f"speed={rate}, inter={inter_applied}, "
        #     f"décal_start={decal_start}ms, décal_end={decal_end}ms\n"
        # )

        debug.append(
                   f"Segment {idx+1} ({start_s:.2f}-{end_s:.2f}s): "
                   f"pre={pre_ms}ms, post={post_ms}ms, speed={rate}, "
                   f"inter={inter_applied}, "
                   f"phrases={phrases}, "
                   f"décal_start={decal_start}ms, décal_end={decal_end}ms\n"
                )



    # Export debug & wav
    with open(debug_log_path, "w", encoding="utf-8") as df:
        df.write("Translation Debug Log\n\n")
        df.writelines(debug)
    combined.export(output_audio_path, format="wav")

    return output_audio_path


# ============== Merge Audio and Video Function ==============
def merge_audio_video():
    video = VideoFileClip(input_video)
    audio = AudioFileClip(translated_audio)
    if audio.duration < video.duration:
        extra_silence = AudioSegment.silent(duration=(video.duration - audio.duration) * 1000)
        audio_path_temp = os.path.join(output_dir, "temp_full_audio.wav")
        audio_seg = AudioSegment.from_file(translated_audio, format="wav")
        full_audio = audio_seg + extra_silence
        full_audio.export(audio_path_temp, format="wav")
        audio = AudioFileClip(audio_path_temp)
    video = video.set_audio(audio)
    # on ferme le clip audio pour libérer le fichier avant la suppression
    temp_file = "temp-audio.m4a"
    video.write_videofile(
        output_video,
        codec="libx264",
        audio_codec="aac",
        temp_audiofile=temp_file,
        remove_temp=True,
        threads=4
    )
    audio.close()
    video.close()
    # …et on le supprime nous-mêmes une fois les fichiers fermés
    try:
        os.remove(temp_file)
    except OSError as e:
        print(f"Error deleting temporary file {temp_file}: {e}")
        
# ============== Main Asynchronous Flow ==============
async def async_main():
    print("Extracting audio...")
    audio_path = extract_audio()
    print("Transcribing audio...")
    language, segments = transcribe(audio_path)
    print("Generating English subtitles...")
    generate_subtitle_file(segments, subtitle_file_en)
    print("Generating French audio with synchronization and manual overrides...")
    await async_generate_translated_audio_with_sync_using_review(subtitle_file_en, translated_audio, debug_log_file, review_file)
    print("Merging audio and video...")
    merge_audio_video()
    print(f"Process completed! Output video: {output_video}")

if __name__ == "__main__":
    asyncio.run(async_main())




✅ ffmpeg found at: C:\ffmpeg\bin\ffmpeg.EXE
Extracting audio...
Transcribing audio...
Detected language: en
Generating English subtitles...
Generating French audio with synchronization and manual overrides...
✅ Review file créé : 4.2.5_Intégration des données source_run_20250508_122756\translation_review.txt (12 segments)
✅ Parsed 12 segments depuis le review file.
[Debug] Attempt 1/10: Synthesizing phrase: 'Intégration des données source…'
[Debug] Phrase synthesized successfully to C:\Users\061181~1\AppData\Local\Temp\tmp_0_0.mp3
[Debug] Attempt 1/10: Synthesizing phrase: 'Comment configurer les mappage…'
[Debug] Phrase synthesized successfully to C:\Users\061181~1\AppData\Local\Temp\tmp_0_1.mp3
[Debug] Attempt 1/10: Synthesizing phrase: 'Comment configurer les validat…'
[Debug] Phrase synthesized successfully to C:\Users\061181~1\AppData\Local\Temp\tmp_1_0.mp3
[Debug] Attempt 1/10: Synthesizing phrase: 'Et comment intégrer les donnée…'
[Error] Attempt 1/10 failed for phrase: 'Et comm

                                                                        

MoviePy - Done.
Moviepy - Writing video 4.2.5_Intégration des données source_run_20250508_122756\4.2.5_Intégration des données source-french.mp4



                                                                  

Moviepy - Done !
Moviepy - video ready 4.2.5_Intégration des données source_run_20250508_122756\4.2.5_Intégration des données source-french.mp4
Error deleting temporary file temp-audio.m4a: [WinError 2] Le fichier spécifié est introuvable: 'temp-audio.m4a'
Process completed! Output video: 4.2.5_Intégration des données source_run_20250508_122756\4.2.5_Intégration des données source-french.mp4


VERSION 04MAY : DEBUG 2 FOIS LANCÉ

In [1]:
import os
import re
import ffmpeg
import pysrt
import time
from deep_translator import GoogleTranslator
from pydub import AudioSegment
from moviepy.editor import VideoFileClip, AudioFileClip
from faster_whisper import WhisperModel
from shutil import which
import nest_asyncio
from datetime import datetime
import tempfile
import asyncio
import edge_tts
import aiohttp
import ssl
import random
from pydub.silence import detect_nonsilent

nest_asyncio.apply()

# ----- Configuration -----
ffmpeg_path = which("ffmpeg")
if not ffmpeg_path:
    raise RuntimeError("ffmpeg not found. Please install ffmpeg first.")
print(f"✅ ffmpeg found at: {ffmpeg_path}")

input_video = "to translate/4.2.4_Configuration de la solution_Avr_10_Latest.mp4"
base_name = os.path.splitext(os.path.basename(input_video))[0]
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_dir = f"{base_name}_run_{timestamp}"
model_size = "small"
update_existing = True

# For this version we rely on cloud-based Edge TTS.
USE_EDGE_TTS = True

# Files and paths
os.makedirs(output_dir, exist_ok=True)
input_video_name = os.path.splitext(os.path.basename(input_video))[0]
extracted_audio = os.path.join(output_dir, f"{input_video_name}-extracted-audio.wav")
subtitle_file_en = os.path.join(output_dir, f"{input_video_name}-english.srt")
translated_audio = os.path.join(output_dir, f"{input_video_name}-french.wav")
output_video = os.path.join(output_dir, f"{input_video_name}-french.mp4")
review_file = os.path.join(output_dir, "translation_review.txt")
debug_log_file = os.path.join(output_dir, "translation_debug_log.txt")

# ============== Helper Functions (extract_audio, transcribe, etc.) ==============
def extract_audio():
    try:
        (ffmpeg
         .input(input_video)
         .output(extracted_audio, ac=1, ar=16000)
         .overwrite_output()
         .run(capture_stdout=True, capture_stderr=True)
        )
        return extracted_audio
    except ffmpeg.Error as e:
        print("STDOUT:", e.stdout.decode("utf8"))
        print("STDERR:", e.stderr.decode("utf8"))
        raise

def transcribe(audio_path):
    model = WhisperModel(model_size, device="cpu", compute_type="int8")
    segments, info = model.transcribe(audio_path, beam_size=5)
    language = info.language
    print(f"Detected language: {language}")
    transcript_segments = []
    for segment in segments:
        transcript_segments.append({
            "start": segment.start,
            "end": segment.end,
            "text": segment.text.strip()
        })
    return language, transcript_segments

def time_to_subrip(seconds: float) -> pysrt.SubRipTime:
    hours = int(seconds // 3600)
    seconds %= 3600
    minutes = int(seconds // 60)
    seconds %= 60
    milliseconds = int((seconds - int(seconds)) * 1000)
    return pysrt.SubRipTime(hours=hours, minutes=minutes, seconds=int(seconds), milliseconds=milliseconds)

def generate_subtitle_file(segments, output_path):
    subs = pysrt.SubRipFile()
    for i, segment in enumerate(segments, 1):
        sub = pysrt.SubRipItem(
            index=i,
            start=time_to_subrip(segment["start"]),
            end=time_to_subrip(segment["end"]),
            text=segment["text"]
        )
        subs.append(sub)
    subs.save(output_path, encoding="utf-8")
    return output_path

# ============== Translation & Review Functions ==============

def split_long_groups(groups, max_group_duration_secs):
    """
    For each group (list of SubRipItems), if its duration > max_group_duration_secs,
    split it at the *last* subtitle in that group whose text ends in punctuation
    (.,!? or comma) before the duration threshold.
    Falls back to a simple split if no such “safe” break exists.
    """
    new_groups = []
    for group in groups:
        start_s = group[0].start.ordinal / 1000
        end_s   = group[-1].end.ordinal   / 1000
        total   = end_s - start_s

        # if already shorter than threshold, keep it
        if total <= max_group_duration_secs:
            new_groups.append(group)
            continue

        # otherwise walk through, tracking safe_breaks
        temp = []
        temp_start = start_s
        last_safe_idx = None
        for idx, item in enumerate(group):
            temp.append(item)
            # mark this idx if it ends in punctuation or comma
            if re.search(r"[.,!?]$", item.text.strip()):
                last_safe_idx = idx

            current_end = item.end.ordinal / 1000
            if (current_end - temp_start) >= max_group_duration_secs:
                # if we have a safe break before or at idx, split there
                if last_safe_idx is not None:
                    # emit group up through last_safe_idx
                    safe_group = temp[: last_safe_idx+1 ]
                    new_groups.append(safe_group)
                    # restart temp from the items after safe_idx
                    temp = temp[last_safe_idx+1 :]
                    temp_start = temp[0].start.ordinal / 1000 if temp else current_end
                else:
                    # no safe break—just split at current idx
                    new_groups.append(temp)
                    temp = []
                    temp_start = current_end

                # reset safe marker
                last_safe_idx = None

        # anything left over
        if temp:
            new_groups.append(temp)

    return new_groups


def validate_audio_duration(original_segment, translated_audio):
    """Compares original video duration with generated audio"""
    video_dur = original_segment['end'] - original_segment['start']
    audio_dur = translated_audio.duration_seconds
    
    if abs(video_dur - audio_dur) > 0.5:  # 500ms tolerance
        compensation = (video_dur - audio_dur) * 1000  # ms
        if compensation > 0:
            return AudioSegment.silent(duration=compensation)
        else:
            return translated_audio[:int(compensation*1000)]  # ms to samples
    return translated_audio

def generate_phrase_audio(text, voice_speed):
    raw_audio = edge_tts.Communicate(text).audio
    processed = apply_speed_adjustment(raw_audio, voice_speed)
    
    # Detect and preserve natural phrase endings
    non_silent = detect_nonsilent(processed, min_silence_len=50, silence_thresh=-40)
    if non_silent:
        end_pad = 150  # Minimum ending padding
        new_end = max(non_silent[-1][1] + end_pad, len(processed))
        return processed[:new_end]
    return processed


def apply_speed_adjustment(raw_audio, speed_setting):
    speed_factor = 1 + (int(speed_setting.strip('%')) / 100)
    sped_up = raw_audio.speedup(
        playback_speed=speed_factor,
        chunk_size=150,
        crossfade=25
    )
    
    # Calculate duration difference
    original_dur = len(raw_audio)
    new_dur = len(sped_up)
    compensation = original_dur - new_dur
    
    if compensation > 0:
        return sped_up + AudioSegment.silent(duration=compensation)
    return sped_up


def parse_review_overrides(review_file_path):
    text   = open(review_file_path, "r", encoding="utf-8").read()
    # split on any line of 3+ hyphens
    blocks = re.split(r"(?m)^-{3,}\s*$", text)

    overrides = []
    for idx, blk in enumerate(blocks, start=1):
        blk = blk.strip()
        if not blk or blk.startswith("Translation Review File"):
            continue

        # defaults
        ft       = None
        vs       = "+0%"
        pre_ms   = 0.0
        post_ms  = 100.0
        inter_ms = []

        for line in blk.splitlines():
            if line.startswith("**Final Translation:**"):
                ft = line.split("**Final Translation:**",1)[1].strip()
            elif line.startswith("**Voice Speed:**"):
                vs = line.split("**Voice Speed:**",1)[1].strip()
            elif line.startswith("**Pre-Silence:**"):
                try: pre_ms = float(line.split("**Pre-Silence:**",1)[1])
                except: print(f"[Warn] Seg {idx}: bad Pre-Silence")
            elif line.startswith("**Post-Silence:**"):
                try: post_ms = float(line.split("**Post-Silence:**",1)[1])
                except: print(f"[Warn] Seg {idx}: bad Post-Silence")
            elif line.startswith("**Inter-Phrase-Silence:**"):
                            parts = line.split("**Inter-Phrase-Silence:**",1)[1].strip()
                            if parts:
                                try:
                                    # Force negative values to 0 and limit to 5000ms max
                                    raw = [float(x) for x in parts.split(",")]
                                    inter_ms = [ max(0, min(x, 5000)) for x in raw ]
                                except ValueError:
                                    print(f"[Warning] Segment {idx}: invalid Inter-Phrase-Silence list")
                                    inter_ms = []

        if ft is None:
            print(f"[Warn] Seg {idx}: no Final Translation—will use source text.")

        overrides.append({
            "final_translation":      ft,
            "voice_speed":            vs,
            "pre_silence":            pre_ms,
            "post_silence":           post_ms,
            "inter_phrase_silences":  inter_ms
        })

    print("Parsed review overrides:")
    for i,o in enumerate(overrides,1):
        print(f"  Seg {i}: final={'OK' if o['final_translation'] else '<none>'}, "
              f"speed={o['voice_speed']}, pre={o['pre_silence']}ms, post={o['post_silence']}ms, "
              f"inter={o['inter_phrase_silences']}")
    return overrides





def enforce_punctuation_boundaries(groups):
    """Ensure groups end with proper punctuation"""
    i = 0
    safe_punctuation = r"[.!?,;:]$"
    while i < len(groups):
        last_text = groups[i][-1].text.strip()
        if not re.search(safe_punctuation, last_text):
            if i+1 < len(groups):
                groups[i] += groups.pop(i+1)
            else:  # Add artificial pause for final group
                groups[i][-1].text += "."
        else:
            i += 1
    return groups



# ============== Audio Synchronization Functions ==============


def adjust_audio_duration(audio: AudioSegment, target_secs: float) -> AudioSegment:
    """
     Ajuste TTS clip pour qu'il tienne **exactement** dans target_secs :
     - Si l'audio est trop long, on le **tronque**.  
     - S'il est trop court, on ajoute du silence.  
    """
    target_ms = int(target_secs * 1000)
    curr_ms   = len(audio)
    if curr_ms > target_ms:
        # on coupe précisément à la durée allouée
        return audio[:target_ms]
    elif curr_ms < target_ms:
            # on complète par du silence
        return audio + AudioSegment.silent(duration=(target_ms - curr_ms))
    return audio


# ============== French Phrase Alignment Functions ==============
def split_french_phrases(text):
    phrases = re.split(r"(?<=[.!?])\s+(?=[A-Z])", text)
    return [phrase.strip() for phrase in phrases if phrase.strip()]

def calculate_phrase_weights(original_text, translated_phrases):
    fr_phrase_word_counts = [len(phrase.split()) for phrase in translated_phrases]
    total_fr_words = sum(fr_phrase_word_counts)
    if total_fr_words == 0:
        return [1 / len(translated_phrases)] * len(translated_phrases)
    return [count / total_fr_words for count in fr_phrase_word_counts]

# ============== TTS Functions: Edge TTS Only with Debug Logging ==============


def change_playback_speed(sound, speed=1.0):
    new_frame_rate = int(sound.frame_rate * speed)
    altered_sound = sound._spawn(sound.raw_data, overrides={"frame_rate": new_frame_rate})
    return altered_sound.set_frame_rate(sound.frame_rate)

# ============== Updated Async Audio Generation Function ==============


def validate_audio_timing(original_duration, translated_segment):
    total_audio_time = (
        translated_segment["pre_silence"] 
        + sum(translated_segment["inter_phrase_silences"]) 
        + translated_segment["post_silence"] 
        + (translated_segment["audio"].duration_seconds * 1000)
    )
    
    if total_audio_time > original_duration * 1000:
        raise ValueError(f"Audio overflow: {total_audio_time}ms vs {original_duration*1000}ms")
    elif total_audio_time < original_duration * 1000 * 0.95:
        print(f"Warning: Audio underflow by {original_duration*1000 - total_audio_time}ms")

import re

def adjust_review_file_based_on_debug_log(debug_log_path: str, review_file_path: str):
    """
    Pour chaque segment i :
      - Si déc​al_start > 0 → on ajoute d_start à Pre-Silence
      - Si déc​al_start < 0 → on ajoute d_start (négatif) à Start-Offset
      - Si déc​al_end   > 0 → on ajoute d_end   à Post-Silence
      - Si déc​al_end   < 0 → on écrit End-Offset = d_end (pour tronquer l'excès)
    On réécrit ensuite le review_file en préservant tout le reste.
    """

    # 1) Parse le debug log
    decalages = {}  # idx -> (d_start, d_end)
    pattern = re.compile(r"Segment\s+(\d+).*décal_start=(-?\d+)ms,\s*décal_end=(-?\d+)ms")
    with open(debug_log_path, encoding="utf-8") as df:
        for line in df:
            m = pattern.search(line)
            if m:
                idx   = int(m.group(1))
                d_start = int(m.group(2))
                d_end   = int(m.group(3))
                decalages[idx] = (d_start, d_end)

    # 2) Lit le review file existant
    text   = open(review_file_path, encoding="utf-8").read()
    blocks = re.split(r"(?m)^-{3,}\s*$", text)
    out    = []

    # 3) Pour chaque bloc non-titre, on ajuste silences et offsets
    seg_header = re.compile(r"Segment\s+(\d+)\s+\(start:")
    for blk in blocks:
        # Ne touche pas au header global
        if blk.strip().startswith("Translation Review File"):
            out.append(blk)
            continue

        m_hdr = seg_header.search(blk)
        if not m_hdr:
            out.append(blk)
            continue

        idx = int(m_hdr.group(1))
        d_start, d_end = decalages.get(idx, (0, 0))

        # Ajuste Pre-Silence ou Start-Offset
        def repl_pre(m):
            old = float(m.group(1))
            if d_start >= 0:
                new = old + d_start
                return f"**Pre-Silence:** {new:.0f}"
            else:
                # on conserve Pre-Silence à 0, on gère via Start-Offset
                return m.group(0)
        blk = re.sub(r"\*\*Pre-Silence:\*\*\s*([0-9.]+)", repl_pre, blk)

        def repl_start_offset(m):
            old = int(m.group(1))
            if d_start < 0:
                new = old + d_start
                return f"**Start-Offset:** {new}"
            else:
                return m.group(0)
        blk = re.sub(r"\*\*Start-Offset:\*\*\s*(-?\d+)", repl_start_offset, blk)

        # Ajuste Post-Silence ou End-Offset
        def repl_post(m):
            old = float(m.group(1))
            if d_end >= 0:
                new = old + d_end
                return f"**Post-Silence:** {new:.0f}"
            else:
                # on conserve Post-Silence à 0, on gère via End-Offset
                return m.group(0)
        blk = re.sub(r"\*\*Post-Silence:\*\*\s*([0-9.]+)", repl_post, blk)

        def repl_end_offset(m):
            old = int(m.group(1))
            if d_end < 0:
                # écrase ou ajoute une ligne End-Offset
                return f"**End-Offset:** {d_end}"
            else:
                return m.group(0)
        blk = re.sub(r"\*\*End-Offset:\*\*\s*(-?\d+)", repl_end_offset, blk)

        out.append(blk)

    # 4) Réécriture du fichier corrigé
    with open(review_file_path, "w", encoding="utf-8") as f:
        f.write("\n---\n".join(out))

    print(f"✅ Review file ajusté selon {debug_log_path}")


def adjust_review_file_based_on_debug_logXX(debug_log_path: str, review_file_path: str):
    """
    Pour chaque segment i :
      - Si décal_end est négatif de D ms, on ajoute D ms à post_silence
      - Si décal_start est positif de D ms, on ajoute D ms à pre_silence
    On réécrit ensuite le review_file avec ces nouvelles valeurs.
    """
    # 1) Parse le debug log
    decalages = {}  # idx -> (d_start, d_end)
    pattern = re.compile(r"Segment (\d+).*décal_start=(-?\d+)ms, décal_end=(-?\d+)ms")
    for line in open(debug_log_path, encoding="utf-8"):
        m = pattern.search(line)
        if m:
            idx = int(m.group(1))
            d_start, d_end = int(m.group(2)), int(m.group(3))
            decalages[idx] = (d_start, d_end)

    # 2) Lit tout le review file en mémoire
    text = open(review_file_path, encoding="utf-8").read()
    blocks = re.split(r"(?m)^-{3,}\s*$", text)

    out = []
    for blk in blocks:
        if not blk.strip() or blk.startswith("Translation Review File"):
            out.append(blk)
            continue

        # trouve le segment
        header = re.search(r"Segment\s+(\d+)\s+\(", blk)
        if not header:
            out.append(blk); continue
        idx = int(header.group(1))
        d_start, d_end = decalages.get(idx, (0, 0))

        # remplace les lignes Pre-Silence / Post-Silence
        def repl_pre(m):
            old = float(m.group(1))
            new = max(0.0, old + d_start)
            return f"**Pre-Silence:** {new:.0f}"
        blk = re.sub(r"\*\*Pre-Silence:\*\*\s*([0-9.]+)", repl_pre, blk)

        def repl_post(m):
            old = float(m.group(1))
            # si d_end<0, audio est trop long => il a fallu tronquer => on ne réduit pas post
            # si d_end>0, audio trop court => on ajoute
            new = max(0.0, old + d_end)
            return f"**Post-Silence:** {new:.0f}"
        blk = re.sub(r"\*\*Post-Silence:\*\*\s*([0-9.]+)", repl_post, blk)

        out.append(blk)

    # 3) Réécriture du fichier
    with open(review_file_path, "w", encoding="utf-8") as f:
        f.write("\n---\n".join(out))
    print(f"✅ Review file ajusté selon {debug_log_path}")


def generate_translation_review_file(
    source_path, review_file_path,
    from_lang="en", to_lang="fr",
    max_group_duration_secs: float = 25.0
):
    """
    1) On regroupe et on split/merge les sous-titres exactement
       comme le fera l'audio.
    2) On écrit un review file où l'on affiche :
       - phrase par phrase (la liste exacte via "- ")
       - pre / post silence
       - voice speed
       - start/end offset
       - inter-phrase silences (N–1 valeurs pour N phrases)
    L'utilisateur peut ensuite :
      * ajuster Final Translation, Voice Speed, Pre/Post-Silence,
        Start-Offset, End-Offset
      * modifier le nombre de phrases (le parser adaptera N–1 silences).
    """

    translator = GoogleTranslator(source=from_lang, target=to_lang)
    subs = pysrt.open(source_path)

    # 1) Regrouper par phrase (détection ponctuation en fin de sous-titre)
    sentence_end = re.compile(r"[.!?]\s*$")
    groups, cur = [], []
    for sub in subs:
        cur.append(sub)
        if sentence_end.search(sub.text):
            groups.append(cur); cur = []
    if cur:
        groups.append(cur)

    # 2) Éclatement des groupes trop longs
    def split_long(gs, max_s):
        out = []
        for g in gs:
            start, end = g[0].start.ordinal/1000, g[-1].end.ordinal/1000
            if end - start <= max_s:
                out.append(g)
            else:
                mid = len(g)//2
                out.extend([g[:mid], g[mid:]])
        return out
    groups = split_long(groups, max_group_duration_secs)

    # 3) Forcer ponctuation de fin de groupe
    i = 0
    safe_punct = re.compile(r"[.!?,;:]$")
    while i < len(groups):
        if not safe_punct.search(groups[i][-1].text.strip()):
            if i+1 < len(groups):
                groups[i] += groups.pop(i+1)
                continue
            else:
                groups[i][-1].text += "."
        i += 1

    # 4) Écriture du fichier de review
    with open(review_file_path, "w", encoding="utf-8") as f:
        f.write("Translation Review File\n")
        f.write("Le découpage en phrases ci-dessous est **celui utilisé** en TTS.\n")
        f.write("Ajustez si besoin **Final Translation**, **Voice Speed**, **Pre/Post-Silence**, "
                "**Start-Offset:**, **End-Offset:**, **Inter-Phrase-Silence:**\n")
        f.write("mais **ne touchez pas** la liste des phrases (lignes qui commencent par '- ').\n")
        f.write("----------------------------------------------------------------\n\n")

        for idx, group in enumerate(groups, 1):
            # Calcul des temps
            start_s = group[0].start.ordinal / 1000
            end_s   = group[-1].end.ordinal   / 1000

            # Texte original + auto-traduit
            original = " ".join(s.text for s in group)
            auto_tr  = translator.translate(text=original)

            # Découpage initial en phrases (on ne réécrit pas ces lignes, mais on calcule N)
            phrases = re.split(r"(?<=[.!?])\s+(?=[A-ZÀÂÉÈÊËÎÏÔŒÙÛÜ])", auto_tr)
            phrases = [p.strip() for p in phrases if p.strip()]

            # Préparer la liste par défaut des silences internes = N–1 × 0 ms
            n = len(phrases)
            inter_silences = ",".join("0" for _ in range(max(0, n-1)))

            # Valeurs par défaut
            pre_ms, post_ms = 0, 100
            start_offset, end_offset = 0, 0
            voice_speed = "+0%"

            # Écriture du segment
            f.write(f"Segment {idx} (start: {start_s:.2f}s, end: {end_s:.2f}s)\n")
            f.write(f"**Original:** {original}\n")
            f.write(f"**Auto Translated:** {auto_tr}\n")
            f.write(f"**Final Translation:** {auto_tr}\n")
            f.write(f"**Voice Speed:** {voice_speed}\n")
            f.write(f"**Pre-Silence:** {pre_ms}\n")
            f.write(f"**Post-Silence:** {post_ms}\n")
            f.write(f"**Start-Offset:** {start_offset}\n")
            f.write(f"**End-Offset:** {end_offset}\n")
            f.write(f"**Inter-Phrase-Silence:** {inter_silences}\n")

            # Liste des phrases pour que l'utilisateur puisse la modifier
            for ph in phrases:
                f.write(f"- {ph}\n")

            f.write("\n----------------------------------------------------------------\n\n")

    print(f"✅ Review file créé : {review_file_path} ({len(groups)} segments)")
    input("Tapez 'Y' pour continuer…")



def parse_review_fileOLDA(review_file_path):
    """
    Lit le review file écrit ci-dessus et
    renvoie une liste de dicts avec :
      - start_s, end_s, final_translation, voice_speed
      - pre_silence, post_silence, phrases (list)
    """
    text = open(review_file_path, encoding="utf-8").read()
    blocks = [b.strip() for b in re.split(r"(?m)^-{3,}\s*$", text) if b.strip()]
    segments = []
    header = re.compile(r"Segment\s+\d+\s+\(start:\s*([0-9.]+)s,\s*end:\s*([0-9.]+)s\)")
    for blk in blocks:
        m = header.search(blk)
        if not m or blk.startswith("Translation Review File"): continue
        start_s, end_s = float(m.group(1)), float(m.group(2))

        ft, vs, pre, post = None, "+0%", 0.0, 0.0
        orig = None
        start_offset = 0 
        phrases = []
        for line in blk.splitlines():
            line = line.strip()
            if line.startswith("**Final Translation:**"):
                ft = line.split("**Final Translation:**",1)[1].strip()
            elif line.startswith("**Voice Speed:**"):
                vs = line.split("**Voice Speed:**",1)[1].strip()
            elif line.startswith("**Pre-Silence:**"):
                pre = float(line.split("**Pre-Silence:**",1)[1])
            elif line.startswith("**Post-Silence:**"):
                post = float(line.split("**Post-Silence:**",1)[1])
            elif line.startswith("**Start-Offset:**"):
                 # offset en millisecondes à ajouter au start
                start_offset = int(line.split("**Start-Offset:**",1)[1])
            elif line.startswith("**End-Offset:**"):
                end_offset = int(line.split("**End-Offset:**",1)[1])                
            elif line.startswith("- "):
                phrases.append(line[2:].strip())
            elif line.startswith("**Original:**"):
                orig = line.split("**Original:**",1)[1].strip()

        segments.append({
            "start_s":           start_s,
            "end_s":             end_s,
            "original":          orig,
            "final_translation": ft or orig,
            "voice_speed":       vs,
            "pre_silence":       pre,
            "post_silence":      post,
            "start_offset_ms":   start_offset,
            "end_offset_ms":     end_offset,
            "phrases":           phrases
        })

    print(f"✅ Parsed {len(segments)} segments depuis le review file.")
    return segments

# ============== TTS Functions: Edge TTS Only with Debug Logging ==============
async def robust_synthesize_phrase(
    phrase: str,
    output_path: str,
    voice: str = "fr-FR-DeniseNeural",
    rate: str = "+0%",
    max_retries: int = 10
):
    """
    Synthesize speech using Edge TTS with robust retry logic.
    Detailed debug messages are printed for each attempt.
    """
    for attempt in range(1, max_retries+1):
        try:
            async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=30)) as session:
                communicate = edge_tts.Communicate(
                    text=phrase,
                    voice=voice,
                    rate=rate
                )
                print(f"[Debug] Attempt {attempt}/{max_retries}: Synthesizing phrase: '{phrase[:30]}…'")
                await communicate.save(output_path)
                print(f"[Debug] Phrase synthesized successfully to {output_path}")
                return
        except Exception as e:
            wait_time = 2 ** attempt + random.random()
            print(f"[Error] Attempt {attempt}/{max_retries} failed for phrase: '{phrase[:30]}…'. Exception: {e}")
            if attempt < max_retries:
                print(f"[Debug] Retrying in {wait_time:.1f}s…")
                await asyncio.sleep(wait_time)
    raise RuntimeError(f"Failed to synthesize phrase after {max_retries} attempts: {phrase[:30]}…")

async def synthesize_phrase_edge_hybrid(
    phrase: str,
    output_path: str,
    voice: str = "fr-FR-DeniseNeural",
    rate: str = "+0%"
):
    # Pour compatibilité, on redirige vers le robust_synthesize
    await robust_synthesize_phrase(phrase, output_path, voice, rate)


def merge_short_phrases(phrases, weights, min_chars=40, max_chars=None):
    new_ph, new_wt = [], []
    buf_ph, buf_wt = "", 0.0
    for ph, wt in zip(phrases, weights):
        if not buf_ph:
            buf_ph, buf_wt = ph, wt
        else:
            if len(buf_ph) < min_chars or len(ph) < min_chars:
                cand = buf_ph + " " + ph
                # si pas de max_chars défini, on fusionne sans condition
                cond = True if max_chars is None else (len(cand) <= max_chars)
                if cond:
                    buf_ph = cand
                    buf_wt += wt
                else:
                    new_ph.append(buf_ph)
                    new_wt.append(buf_wt)
                    buf_ph, buf_wt = ph, wt
            else:
                new_ph.append(buf_ph)
                new_wt.append(buf_wt)
                buf_ph, buf_wt = ph, wt
    if buf_ph:
        new_ph.append(buf_ph)
        new_wt.append(buf_wt)
    return new_ph, new_wt



def split_long_phrasesaaa(phrases, max_chars=80):
    new = []
    for p in phrases:
        if len(p) > max_chars:
            # on découpe au premier “,” ou “ et ” qu’on trouve
            parts = re.split(r",\s+| et ", p, maxsplit=1)
            new.extend([parts[0].strip(), parts[1].strip()] if len(parts)==2 else [p])
        else:
            new.append(p)
    return new

def parse_review_file(review_file_path):
    """
    Lit le review file et renvoie une liste de dicts avec :
      - start_s, end_s, original, final_translation, voice_speed
      - pre_silence, post_silence, start_offset_ms, end_offset_ms
      - phrases (list de phrases) et inter_phrase_silences (liste de silences internes)
    """
    text = open(review_file_path, encoding="utf-8").read()
    blocks = [b.strip() for b in re.split(r"(?m)^-{3,}\s*$", text) if b.strip()]
    segments = []
    header = re.compile(r"Segment\s+\d+\s+\(start:\s*([0-9.]+)s,\s*end:\s*([0-9.]+)s\)")

    for blk in blocks:
        m = header.search(blk)
        if not m or blk.startswith("Translation Review File"): 
            continue
        start_s, end_s = float(m.group(1)), float(m.group(2))

        # valeurs par défaut
        ft, vs = None, "+0%"
        pre, post = 0.0, 0.0
        soffs, eoffs = 0, 0
        phrases = []
        inter = []

        for line in blk.splitlines():
            line = line.strip()
            if line.startswith("**Final Translation:**"):
                ft = line.split("**Final Translation:**",1)[1].strip()
            elif line.startswith("**Voice Speed:**"):
                vs = line.split("**Voice Speed:**",1)[1].strip()
            elif line.startswith("**Pre-Silence:**"):
                pre = float(line.split("**Pre-Silence:**",1)[1])
            elif line.startswith("**Post-Silence:**"):
                post = float(line.split("**Post-Silence:**",1)[1])
            elif line.startswith("**Start-Offset:**"):
                soffs = int(line.split("**Start-Offset:**",1)[1])
            elif line.startswith("**End-Offset:**"):
                eoffs = int(line.split("**End-Offset:**",1)[1])
            elif line.startswith("**Inter-Phrase-Silence:**"):
                parts = line.split("**Inter-Phrase-Silence:**",1)[1].strip()
                if parts:
                    inter = [max(0, int(x)) for x in parts.split(",")]
            elif line.startswith("- "):
                phrases.append(line[2:].strip())

        segments.append({
            "start_s": start_s,
            "end_s": end_s,
            "final_translation": ft or "",
            "voice_speed": vs,
            "pre_silence": pre,
            "post_silence": post,
            "start_offset_ms": soffs,
            "end_offset_ms": eoffs,
            "phrases": phrases,
            "inter_phrase_silences": inter
        })

    print(f"✅ Parsed {len(segments)} segments depuis le review file.")
    return segments


async def async_generate_translated_audio_with_sync_using_review(
    subtitle_source_path, output_audio_path,
    debug_log_path, review_file_path
):
    # 1) Génération / mise à jour du review file
    generate_translation_review_file(
        subtitle_source_path,
        review_file_path,
        max_group_duration_secs=25.0
    )

    # 2) Lecture du review file enrichi
    segments = parse_review_file(review_file_path)

    combined = AudioSegment.silent(duration=0)
    debug    = []

    for idx, seg in enumerate(segments):
        start_s = seg["start_s"]
        end_s   = seg["end_s"]
        total_ms = int((end_s - start_s) * 1000)

        # Récupération des settings
        text    = seg["final_translation"]
        rate    = seg["voice_speed"]
        pre_ms  = seg["pre_silence"]
        post_ms = seg["post_silence"]
        soff    = seg.get("start_offset_ms", 0)
        eoff    = seg.get("end_offset_ms",   0)

        # Phrase splitting & TTS
        phrases = split_french_phrases(text)
        weights = calculate_phrase_weights(text, phrases)
        phrases, weights = merge_short_phrases(phrases, weights, min_chars=40)

        # Budget pour TTS seule
        content_ms = max(0, total_ms - pre_ms - post_ms)

        # Synthèse phrase par phrase
        phrase_audios = []
        for i, ph in enumerate(phrases):
            dur_s  = (content_ms * weights[i]) / 1000.0
            tmp_mp3 = os.path.join(tempfile.gettempdir(), f"tmp_{idx}_{i}.mp3")
            await robust_synthesize_phrase(ph, tmp_mp3, rate=rate)
            aud = AudioSegment.from_mp3(tmp_mp3)
            os.remove(tmp_mp3)
            aud = adjust_audio_duration(aud, dur_s)
            phrase_audios.append(aud)

        # Ajustement interne par override ou répartition égale
        n_inter = max(0, len(phrases) - 1)
        if seg.get("inter_phrase_silences"):
            inter_applied = seg["inter_phrase_silences"]
            # adapter la longueur
            if len(inter_applied) < n_inter:
                inter_applied += [0] * (n_inter - len(inter_applied))
            elif len(inter_applied) > n_inter:
                inter_applied = inter_applied[:n_inter]
        else:
            available = content_ms - sum(a.duration_seconds * 1000 for a in phrase_audios)
            if n_inter > 0 and available > 0:
                sil_ms = available // n_inter
                inter_applied = [sil_ms] * n_inter
            else:
                inter_applied = [0] * n_inter

        # Reconstruction du segment audio
        seg_audio = AudioSegment.silent(duration=pre_ms)
        for i, aud in enumerate(phrase_audios):
            seg_audio += aud
            if i < len(inter_applied):
                seg_audio += AudioSegment.silent(duration=inter_applied[i])
        seg_audio += AudioSegment.silent(duration=post_ms)

        # Application offset de fin
        if eoff > 0:
            seg_audio += AudioSegment.silent(duration=eoff)
        elif eoff < 0:
            seg_audio = seg_audio[:eoff]

        # Debug timing (prise en compte de soff)
        nons2 = detect_nonsilent(seg_audio, min_silence_len=1,
                                 silence_thresh=seg_audio.dBFS - 16)
        start_a = nons2[0][0] if nons2 else pre_ms
        end_a   = nons2[-1][1] if nons2 else (total_ms - post_ms)
        abs_s_a = int(start_s * 1000) + start_a
        abs_e_a = int(start_s * 1000) + end_a
        abs_s_v = int(start_s * 1000) + soff
        abs_e_v = int(end_s   * 1000)
        decal_start = abs_s_a - abs_s_v
        decal_end   = abs_e_a - abs_e_v

        # Mise sur timeline avec offset de start
        start_ms = int(start_s * 1000) + soff
        if len(combined) < start_ms:
            combined += AudioSegment.silent(duration=(start_ms - len(combined)))
        elif len(combined) > start_ms and soff < 0:
            combined = combined[:start_ms]
        combined += seg_audio

        # Enregistrement debug
        # debug.append(
        #     f"Segment {idx+1} ({start_s:.2f}-{end_s:.2f}s): pre={pre_ms}ms, post={post_ms}ms, "
        #     f"speed={rate}, inter={inter_applied}, "
        #     f"décal_start={decal_start}ms, décal_end={decal_end}ms\n"
        # )

        debug.append(
                   f"Segment {idx+1} ({start_s:.2f}-{end_s:.2f}s): "
                   f"pre={pre_ms}ms, post={post_ms}ms, speed={rate}, "
                   f"inter={inter_applied}, "
                   f"phrases={phrases}, "
                   f"décal_start={decal_start}ms, décal_end={decal_end}ms\n"
                )



    # Export debug & wav
    with open(debug_log_path, "w", encoding="utf-8") as df:
        df.write("Translation Debug Log\n\n")
        df.writelines(debug)
    combined.export(output_audio_path, format="wav")

    return output_audio_path


# ============== Merge Audio and Video Function ==============
def merge_audio_video():
    video = VideoFileClip(input_video)
    audio = AudioFileClip(translated_audio)
    if audio.duration < video.duration:
        extra_silence = AudioSegment.silent(duration=(video.duration - audio.duration) * 1000)
        audio_path_temp = os.path.join(output_dir, "temp_full_audio.wav")
        audio_seg = AudioSegment.from_file(translated_audio, format="wav")
        full_audio = audio_seg + extra_silence
        full_audio.export(audio_path_temp, format="wav")
        audio = AudioFileClip(audio_path_temp)
    video = video.set_audio(audio)
    video.write_videofile(
        output_video,
        codec="libx264",
        audio_codec="aac",
        temp_audiofile="temp-audio.m4a",
        remove_temp=True,
        threads=4
    )

# ============== Main Asynchronous Flow ==============
async def async_main():
    print("Extracting audio...")
    audio_path = extract_audio()
    print("Transcribing audio...")
    language, segments = transcribe(audio_path)
    print("Generating English subtitles...")
    generate_subtitle_file(segments, subtitle_file_en)
        # ─── 1ʳᵉ passe : génère audio  debug ───
    print("Generating French audio (pass 1)  debug log...")
    await async_generate_translated_audio_with_sync_using_review(
        subtitle_file_en,
        translated_audio,
        debug_log_file,
        review_file
    )

    # ─── Ajustement automatique du review file sur la base du debug ───
    print("Adjusting review file based on debug log...")
    adjust_review_file_based_on_debug_log(debug_log_file, review_file)

    # ─── 2ᵉ passe : régénération audio avec le review file corrigé ───
    print("Generating French audio (pass 2) with adjusted settings...")
    await async_generate_translated_audio_with_sync_using_review(
        subtitle_file_en,
        translated_audio,
        debug_log_file,    # écrasera l'ancien debug_log avec de nouvelles mesures
        review_file
    )
    print("Merging audio and video...")
    merge_audio_video()
    print(f"Process completed! Output video: {output_video}")

if __name__ == "__main__":
    asyncio.run(async_main())




✅ ffmpeg found at: C:\ffmpeg\bin\ffmpeg.EXE
Extracting audio...
Transcribing audio...
Detected language: en
Generating English subtitles...
Generating French audio (pass 1)  debug log...
✅ Review file créé : 4.2.4_Configuration de la solution_Avr_10_Latest_run_20250504_095813\translation_review.txt (27 segments)
✅ Parsed 27 segments depuis le review file.
[Debug] Attempt 1/10: Synthesizing phrase: 'Nous allons voir les configura…'
[Debug] Phrase synthesized successfully to C:\Users\061181~1\AppData\Local\Temp\tmp_0_0.mp3
[Debug] Attempt 1/10: Synthesizing phrase: 'Nous verrons comment créer une…'
[Debug] Phrase synthesized successfully to C:\Users\061181~1\AppData\Local\Temp\tmp_0_1.mp3
[Debug] Attempt 1/10: Synthesizing phrase: 'Nous verrons comment la sécuri…'
[Debug] Phrase synthesized successfully to C:\Users\061181~1\AppData\Local\Temp\tmp_0_2.mp3
[Debug] Attempt 1/10: Synthesizing phrase: 'La sécurité au niveau de EPM c…'
[Error] Attempt 1/10 failed for phrase: 'La sécurité au ni

                                                                       

MoviePy - Done.
Moviepy - Writing video 4.2.4_Configuration de la solution_Avr_10_Latest_run_20250504_095813\4.2.4_Configuration de la solution_Avr_10_Latest-french.mp4



                                                                  

Moviepy - Done !
Moviepy - video ready 4.2.4_Configuration de la solution_Avr_10_Latest_run_20250504_095813\4.2.4_Configuration de la solution_Avr_10_Latest-french.mp4
Process completed! Output video: 4.2.4_Configuration de la solution_Avr_10_Latest_run_20250504_095813\4.2.4_Configuration de la solution_Avr_10_Latest-french.mp4


code01MAY_424

In [1]:
import os
import re
import ffmpeg
import pysrt
import time
from deep_translator import GoogleTranslator
from pydub import AudioSegment
from moviepy.editor import VideoFileClip, AudioFileClip
from faster_whisper import WhisperModel
from shutil import which
import nest_asyncio
from datetime import datetime
import tempfile
import asyncio
import edge_tts
import aiohttp
import ssl
import random
from pydub.silence import detect_nonsilent
import spacy


nest_asyncio.apply()

# ----- Configuration -----
ffmpeg_path = which("ffmpeg")
if not ffmpeg_path:
    raise RuntimeError("ffmpeg not found. Please install ffmpeg first.")
print(f"✅ ffmpeg found at: {ffmpeg_path}")

input_video = "to translate/4.2.4_Configuration de la solution_Avr_10_Latest.mp4"
base_name = os.path.splitext(os.path.basename(input_video))[0]
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_dir = f"{base_name}_run_{timestamp}"
model_size = "small"
update_existing = True

# For this version we rely on cloud-based Edge TTS.
USE_EDGE_TTS = True

# Files and paths
os.makedirs(output_dir, exist_ok=True)
input_video_name = os.path.splitext(os.path.basename(input_video))[0]
extracted_audio = os.path.join(output_dir, f"{input_video_name}-extracted-audio.wav")
subtitle_file_en = os.path.join(output_dir, f"{input_video_name}-english.srt")
translated_audio = os.path.join(output_dir, f"{input_video_name}-french.wav")
output_video = os.path.join(output_dir, f"{input_video_name}-french.mp4")
review_file = os.path.join(output_dir, "translation_review.txt")
debug_log_file = os.path.join(output_dir, "translation_debug_log.txt")

# ============== Helper Functions (extract_audio, transcribe, etc.) ==============
def extract_audio():
    try:
        (ffmpeg
         .input(input_video)
         .output(extracted_audio, ac=1, ar=16000)
         .overwrite_output()
         .run(capture_stdout=True, capture_stderr=True)
        )
        return extracted_audio
    except ffmpeg.Error as e:
        print("STDOUT:", e.stdout.decode("utf8"))
        print("STDERR:", e.stderr.decode("utf8"))
        raise

def transcribe(audio_path):
    model = WhisperModel(model_size, device="cpu", compute_type="int8")
    segments, info = model.transcribe(audio_path, beam_size=5)
    language = info.language
    print(f"Detected language: {language}")
    transcript_segments = []
    for segment in segments:
        transcript_segments.append({
            "start": segment.start,
            "end": segment.end,
            "text": segment.text.strip()
        })
    return language, transcript_segments

def time_to_subrip(seconds: float) -> pysrt.SubRipTime:
    hours = int(seconds // 3600)
    seconds %= 3600
    minutes = int(seconds // 60)
    seconds %= 60
    milliseconds = int((seconds - int(seconds)) * 1000)
    return pysrt.SubRipTime(hours=hours, minutes=minutes, seconds=int(seconds), milliseconds=milliseconds)

def generate_subtitle_file(segments, output_path):
    subs = pysrt.SubRipFile()
    for i, segment in enumerate(segments, 1):
        sub = pysrt.SubRipItem(
            index=i,
            start=time_to_subrip(segment["start"]),
            end=time_to_subrip(segment["end"]),
            text=segment["text"]
        )
        subs.append(sub)
    subs.save(output_path, encoding="utf-8")
    return output_path

# ============== Translation & Review Functions ==============

def split_long_groups(groups, max_group_duration_secs):
    """
    For each group (list of SubRipItems), if its duration > max_group_duration_secs,
    split it at the *last* subtitle in that group whose text ends in punctuation
    (.,!? or comma) before the duration threshold.
    Falls back to a simple split if no such “safe” break exists.
    """
    new_groups = []
    for group in groups:
        start_s = group[0].start.ordinal / 1000
        end_s   = group[-1].end.ordinal   / 1000
        total   = end_s - start_s

        # if already shorter than threshold, keep it
        if total <= max_group_duration_secs:
            new_groups.append(group)
            continue

        # otherwise walk through, tracking safe_breaks
        temp = []
        temp_start = start_s
        last_safe_idx = None
        for idx, item in enumerate(group):
            temp.append(item)
            # mark this idx if it ends in punctuation or comma
            if re.search(r"[.,!?]$", item.text.strip()):
                last_safe_idx = idx

            current_end = item.end.ordinal / 1000
            if (current_end - temp_start) >= max_group_duration_secs:
                # if we have a safe break before or at idx, split there
                if last_safe_idx is not None:
                    # emit group up through last_safe_idx
                    safe_group = temp[: last_safe_idx+1 ]
                    new_groups.append(safe_group)
                    # restart temp from the items after safe_idx
                    temp = temp[last_safe_idx+1 :]
                    temp_start = temp[0].start.ordinal / 1000 if temp else current_end
                else:
                    # no safe break—just split at current idx
                    new_groups.append(temp)
                    temp = []
                    temp_start = current_end

                # reset safe marker
                last_safe_idx = None

        # anything left over
        if temp:
            new_groups.append(temp)

    return new_groups


def validate_audio_duration(original_segment, translated_audio):
    """Compares original video duration with generated audio"""
    video_dur = original_segment['end'] - original_segment['start']
    audio_dur = translated_audio.duration_seconds
    
    if abs(video_dur - audio_dur) > 0.5:  # 500ms tolerance
        compensation = (video_dur - audio_dur) * 1000  # ms
        if compensation > 0:
            return AudioSegment.silent(duration=compensation)
        else:
            return translated_audio[:int(compensation*1000)]  # ms to samples
    return translated_audio

def generate_phrase_audio(text, voice_speed):
    raw_audio = edge_tts.Communicate(text).audio
    processed = apply_speed_adjustment(raw_audio, voice_speed)
    
    # Detect and preserve natural phrase endings
    non_silent = detect_nonsilent(processed, min_silence_len=50, silence_thresh=-40)
    if non_silent:
        end_pad = 150  # Minimum ending padding
        new_end = max(non_silent[-1][1] + end_pad, len(processed))
        return processed[:new_end]
    return processed


def apply_speed_adjustment(raw_audio, speed_setting):
    speed_factor = 1 + (int(speed_setting.strip('%')) / 100)
    sped_up = raw_audio.speedup(
        playback_speed=speed_factor,
        chunk_size=150,
        crossfade=25
    )
    
    # Calculate duration difference
    original_dur = len(raw_audio)
    new_dur = len(sped_up)
    compensation = original_dur - new_dur
    
    if compensation > 0:
        return sped_up + AudioSegment.silent(duration=compensation)
    return sped_up



def parse_review_overrides(review_file_path):
    text   = open(review_file_path, "r", encoding="utf-8").read()
    # split on any line of 3+ hyphens
    blocks = re.split(r"(?m)^-{3,}\s*$", text)

    overrides = []
    for idx, blk in enumerate(blocks, start=1):
        blk = blk.strip()
        if not blk or blk.startswith("Translation Review File"):
            continue

        # defaults
        ft       = None
        vs       = "+0%"
        pre_ms   = 0.0
        post_ms  = 100.0
        inter_ms = []

        for line in blk.splitlines():
            if line.startswith("**Final Translation:**"):
                ft = line.split("**Final Translation:**",1)[1].strip()
            elif line.startswith("**Voice Speed:**"):
                vs = line.split("**Voice Speed:**",1)[1].strip()
            elif line.startswith("**Pre-Silence:**"):
                try: pre_ms = float(line.split("**Pre-Silence:**",1)[1])
                except: print(f"[Warn] Seg {idx}: bad Pre-Silence")
            elif line.startswith("**Post-Silence:**"):
                try: post_ms = float(line.split("**Post-Silence:**",1)[1])
                except: print(f"[Warn] Seg {idx}: bad Post-Silence")
            elif line.startswith("**Inter-Phrase-Silence:**"):
                            parts = line.split("**Inter-Phrase-Silence:**",1)[1].strip()
                            if parts:
                                try:
                                    # Force negative values to 0 and limit to 5000ms max
                                    raw = [float(x) for x in parts.split(",")]
                                    inter_ms = [ max(0, min(x, 5000)) for x in raw ]
                                except ValueError:
                                    print(f"[Warning] Segment {idx}: invalid Inter-Phrase-Silence list")
                                    inter_ms = []

        if ft is None:
            print(f"[Warn] Seg {idx}: no Final Translation—will use source text.")

        overrides.append({
            "final_translation":      ft,
            "voice_speed":            vs,
            "pre_silence":            pre_ms,
            "post_silence":           post_ms,
            "inter_phrase_silences":  inter_ms
        })

    print("Parsed review overrides:")
    for i,o in enumerate(overrides,1):
        print(f"  Seg {i}: final={'OK' if o['final_translation'] else '<none>'}, "
              f"speed={o['voice_speed']}, pre={o['pre_silence']}ms, post={o['post_silence']}ms, "
              f"inter={o['inter_phrase_silences']}")
    return overrides


def enforce_punctuation_boundaries(groups):
    """Ensure groups end with proper punctuation"""
    i = 0
    safe_punctuation = r"[.!?,;:]$"
    while i < len(groups):
        last_text = groups[i][-1].text.strip()
        if not re.search(safe_punctuation, last_text):
            if i+1 < len(groups):
                groups[i] += groups.pop(i+1)
            else:  # Add artificial pause for final group
                groups[i][-1].text += "."
        else:
            i += 1
    return groups



# ============== Audio Synchronization Functions ==============


def adjust_audio_duration(audio: AudioSegment, target_secs: float) -> AudioSegment:
    """
     Ajuste TTS clip pour qu'il tienne **exactement** dans target_secs :
     - Si l'audio est trop long, on le **tronque**.  
     - S'il est trop court, on ajoute du silence.  
    """
    target_ms = int(target_secs * 1000)
    curr_ms   = len(audio)
    if curr_ms > target_ms:
        # on coupe précisément à la durée allouée
        return audio[:target_ms]
    elif curr_ms < target_ms:
            # on complète par du silence
        return audio + AudioSegment.silent(duration=(target_ms - curr_ms))
    return audio


# ============== French Phrase Alignment Functions ==============
def split_french_phrases(text):
    phrases = re.split(r"(?<=[.!?])\s+(?=[A-Z])", text)
    return [phrase.strip() for phrase in phrases if phrase.strip()]

def calculate_phrase_weights(original_text, translated_phrases):
    fr_phrase_word_counts = [len(phrase.split()) for phrase in translated_phrases]
    total_fr_words = sum(fr_phrase_word_counts)
    if total_fr_words == 0:
        return [1 / len(translated_phrases)] * len(translated_phrases)
    return [count / total_fr_words for count in fr_phrase_word_counts]

# ============== TTS Functions: Edge TTS Only with Debug Logging ==============


def change_playback_speed(sound, speed=1.0):
    new_frame_rate = int(sound.frame_rate * speed)
    altered_sound = sound._spawn(sound.raw_data, overrides={"frame_rate": new_frame_rate})
    return altered_sound.set_frame_rate(sound.frame_rate)

# ============== Updated Async Audio Generation Function ==============


def validate_audio_timing(original_duration, translated_segment):
    total_audio_time = (
        translated_segment["pre_silence"] 
        + sum(translated_segment["inter_phrase_silences"]) 
        + translated_segment["post_silence"] 
        + (translated_segment["audio"].duration_seconds * 1000)
    )
    
    if total_audio_time > original_duration * 1000:
        raise ValueError(f"Audio overflow: {total_audio_time}ms vs {original_duration*1000}ms")
    elif total_audio_time < original_duration * 1000 * 0.95:
        print(f"Warning: Audio underflow by {original_duration*1000 - total_audio_time}ms")



def adjust_review_file_based_on_debug_log(debug_log_path: str, review_file_path: str):
    """
    Pour chaque segment i :
      - Si décal_end est négatif de D ms, on ajoute D ms à post_silence
      - Si décal_start est positif de D ms, on ajoute D ms à pre_silence
    On réécrit ensuite le review_file avec ces nouvelles valeurs.
    """
    # 1) Parse le debug log
    decalages = {}  # idx -> (d_start, d_end)
    pattern = re.compile(r"Segment (\d+).*décal_start=(-?\d+)ms, décal_end=(-?\d+)ms")
    for line in open(debug_log_path, encoding="utf-8"):
        m = pattern.search(line)
        if m:
            idx = int(m.group(1))
            d_start, d_end = int(m.group(2)), int(m.group(3))
            decalages[idx] = (d_start, d_end)

    # 2) Lit tout le review file en mémoire
    text = open(review_file_path, encoding="utf-8").read()
    blocks = re.split(r"(?m)^-{3,}\s*$", text)

    out = []
    for blk in blocks:
        if not blk.strip() or blk.startswith("Translation Review File"):
            out.append(blk)
            continue

        # trouve le segment
        header = re.search(r"Segment\s+(\d+)\s+\(", blk)
        if not header:
            out.append(blk); continue
        idx = int(header.group(1))
        d_start, d_end = decalages.get(idx, (0, 0))

        # remplace les lignes Pre-Silence / Post-Silence
        def repl_pre(m):
            old = float(m.group(1))
            new = max(0.0, old + d_start)
            return f"**Pre-Silence:** {new:.0f}"
        blk = re.sub(r"\*\*Pre-Silence:\*\*\s*([0-9.]+)", repl_pre, blk)

        def repl_post(m):
            old = float(m.group(1))
            # si d_end<0, audio est trop long => il a fallu tronquer => on ne réduit pas post
            # si d_end>0, audio trop court => on ajoute
            new = max(0.0, old + d_end)
            return f"**Post-Silence:** {new:.0f}"
        blk = re.sub(r"\*\*Post-Silence:\*\*\s*([0-9.]+)", repl_post, blk)

        out.append(blk)

    # 3) Réécriture du fichier
    with open(review_file_path, "w", encoding="utf-8") as f:
        f.write("\n---\n".join(out))
    print(f"✅ Review file ajusté selon {debug_log_path}")




def generate_translation_review_file(
    source_path, review_file_path,
    from_lang="en", to_lang="fr",
    max_group_duration_secs: float = 25.0
):
    """
    Génère un fichier de revue TTS avec :
     - votre Final Translation corrigé
     - une section Silence-Avant: pour chaque phrase (initialisée à 0)
    """
    print(f"🔍 Chargement des sous-titres depuis : {source_path}")
    try:
        subs = pysrt.open(source_path, encoding='utf-8')
    except UnicodeDecodeError:
        subs = pysrt.open(source_path, encoding='latin-1')

    translator = GoogleTranslator(source=from_lang, target=to_lang)
    nlp = spacy.load("fr_core_news_sm")

    # 1. Regroupement initial par ponctuation
     #sentence_end = re.compile(r"[.!?][\"')\]]?\s*$")
    sentence_end = re.compile(r"[.!?;,][\"')\]]?\s*$")
    groups, cur = [], []
    for sub in subs:
        cur.append(sub)
        if sentence_end.search(sub.text.strip()):
            groups.append(cur); cur = []
    if cur: groups.append(cur)

    # 2. Découpage des groupes trop longs (> max_group_duration_secs)
    def split_long(gs, max_s):
        def split_group(g):
            start = g[0].start.ordinal / 1000
            end   = g[-1].end.ordinal   / 1000
            dur   = end - start
            if dur <= max_s or len(g) == 1:
                return [g]
            # recherche de point de rupture sûr
            for i in range(len(g)-1, 0, -1):
                if re.search(r"[.?!,;:]$", g[i].text.strip()):
                    left, right = g[:i+1], g[i+1:]
                    if left and right:
                        return split_group(left) + split_group(right)
            mid = len(g)//2
            return split_group(g[:mid]) + split_group(g[mid:])
        out = []
        for g in gs:
            out.extend(split_group(g))
        return out

    groups = split_long(groups, max_group_duration_secs)

    # 3. Fusion si manque ponctuation finale
    i = 0
    while i < len(groups):
        if not re.search(r"[.!?,;:]$", groups[i][-1].text.strip()):
            if i+1 < len(groups):
                groups[i] += groups.pop(i+1)
                continue
            else:
                groups[i][-1].text += "."
        i += 1

    # 4. Écriture du fichier de revue
    print(f"✏️ Écriture du fichier de revue : {review_file_path}")
    with open(review_file_path, "w", encoding="utf-8") as f:
        f.write("Translation Review File\n")
        f.write("Le découpage ci-dessous est celui utilisé en TTS.\n")
        f.write("Corrigez **Final Translation**, puis renseignez **Silence-Avant**.\n")
        f.write("----------------------------------------------------------------\n\n")

        for idx, group in enumerate(groups, 1):
            start_s = group[0].start.ordinal / 1000
            end_s   = group[-1].end.ordinal   / 1000
            original = " ".join(s.text.strip() for s in group)

            try:
                auto_tr = translator.translate(text=original)
            except Exception:
                auto_tr = "[ERREUR DE TRADUCTION]"

            # segmentation en phrases sur texte traduit
            doc = nlp(auto_tr)
            phrases = [sent.text.strip() for sent in doc.sents if sent.text.strip()]

            total_ms = int((end_s - start_s) * 1000)
            pre_ms, post_ms = 0, 0

            f.write(f"Segment {idx} (start: {start_s:.2f}s, end: {end_s:.2f}s)\n")
            f.write(f"**Original:** {original}\n")
            f.write(f"**Auto Translated:** {auto_tr}\n")
            f.write(f"**Final Translation:** {auto_tr}\n\n")

            # Section Silence-Avant
            f.write("**Silence-Avant:**\n")
            for ph in phrases:
                f.write(f"- {ph} : 0\n")

            f.write(f"**Voice Speed:** +0%\n")
            f.write(f"**Pre-Silence:** {pre_ms}\n")
            f.write(f"**Post-Silence:** {post_ms}\n")
            f.write(f"**Start-Offset:** 0\n")
            f.write(f"**End-Offset:** 0\n")
            f.write(f"**Budget (ms):** {total_ms}\n")
            f.write("\n----------------------------------------------------------------\n\n")

    print(f"✅ Fichier de revue généré : {review_file_path} ({len(groups)} segments)")
    input("Tapez 'Y' pour continuer…")

import pysrt
import re
from deep_translator import GoogleTranslator
import spacy

def generate_translation_review_file_from_srt(
    source_srt, review_file_path,
    from_lang="en", to_lang="fr"
):
    """
    Génère un review file où chaque item du SRT anglais
    devient un segment TTS FR avec mêmes start/end.
    """
    # 1) Chargement
    subs = pysrt.open(source_srt, encoding='utf-8')
    translator = GoogleTranslator(source=from_lang, target=to_lang)
    nlp = spacy.load("fr_core_news_sm")

    # 2) Écriture du review file
    with open(review_file_path, "w", encoding="utf-8") as f:
        f.write("Translation Review File (basé sur le .srt anglais)\n")
        f.write("Chaque segment ci-dessous correspond à un item du SRT original.\n")
        f.write("Corrigez **Final Translation**, **Silence-Avant** ou **Silence-Après** si nécessaire.\n")
        f.write("----------------------------------------------------------------\n\n")

        for idx, sub in enumerate(subs, start=1):
            start_s = sub.start.ordinal / 1000
            end_s   = sub.end.ordinal   / 1000
            original_en = sub.text.replace("\n", " ")

            # traduction automatique
            try:
                auto_tr = translator.translate(text=original_en)
            except Exception:
                auto_tr = "[ERREUR DE TRADUCTION]"

            # segmentation FR pour info (ne décalera pas les times)
            doc = nlp(auto_tr)
            phrases = [sent.text.strip() for sent in doc.sents if sent.text.strip()]

            f.write(f"Segment {idx} (start: {start_s:.2f}s, end: {end_s:.2f}s)\n")
            f.write(f"**Original (EN):** {original_en}\n")
            f.write(f"**Auto Translated:** {auto_tr}\n")
            f.write(f"**Final Translation:** {auto_tr}\n")
            f.write(f"**Phrases (info) :**\n")
            for ph in phrases:
                f.write(f"- {ph}\n")
            f.write(f"**Pre-Silence:** 0\n")
            f.write(f"**Post-Silence:** 100\n")
            f.write(f"**Start-Offset:** 0\n")
            f.write(f"**End-Offset:** 0\n")
            f.write("----------------------------------------------------------------\n\n")

    print(f"✅ Review file généré ({len(subs)} segments) : {review_file_path}")
    input("⚠️  Veuillez maintenant éditer le fichier de revue si besoin, puis appuyez sur Entrée pour continuer…")

def parse_review_file(review_file_path):
    """
    Lit le review file généré ci-dessus et
    renvoie la liste des segments avec leurs silences manuels.
    """
    text   = open(review_file_path, encoding="utf-8").read()
    blocks = [b.strip() for b in re.split(r"(?m)^-{3,}\s*$", text) if b.strip()]
    segments = []
    header = re.compile(r"Segment\s+\d+\s+\(start:\s*([0-9.]+)s,\s*end:\s*([0-9.]+)s\)")

    for blk in blocks:
        if blk.startswith("Translation Review File"):
            continue
        m = header.search(blk)
        if not m:
            continue
        start_s, end_s = float(m.group(1)), float(m.group(2))

        # valeurs par défaut
        ft            = None
        vs            = "+0%"
        pre, post     = 0.0, 0.0
        start_offset  = 0
        end_offset    = 0
        phrases       = []
        orig          = None

        # 1) Lecture des champs fixes + collecte des phrases
        for line in blk.splitlines():
            line = line.strip()
            if line.startswith("**Original:**"):
                orig = line.split("**Original:**",1)[1].strip()
            elif line.startswith("**Final Translation:**"):
                ft = line.split("**Final Translation:**",1)[1].strip()
            elif line.startswith("**Voice Speed:**"):
                vs = line.split("**Voice Speed:**",1)[1].strip()
            elif line.startswith("**Pre-Silence:**"):
                pre = float(line.split("**Pre-Silence:**",1)[1])
            elif line.startswith("**Post-Silence:**"):
                post = float(line.split("**Post-Silence:**",1)[1])
            elif line.startswith("**Start-Offset:**"):
                start_offset = int(line.split("**Start-Offset:**",1)[1])
            elif line.startswith("**End-Offset:**"):
                end_offset = int(line.split("**End-Offset:**",1)[1])
            elif line.startswith("- "):
                phrases.append(line[2:].split(" : ")[0].strip())

        # 2) Prépare la liste des silences (initialisés à 0)
        n_inter = max(0, len(phrases) - 1)
        inter_silences = [0] * n_inter

        # 3) Lecture de la section Silence-Avant, si présente
        state = None
        for line in blk.splitlines():
            line = line.strip()
            if line.startswith("**Silence-Avant:**"):
                state = "silence"; continue
            if state == "silence":
                if not line:
                    state = None
                    continue
                if line.startswith("- ") and ":" in line:
                    ph_text, ms_str = line[2:].rsplit(":", 1)
                    ph_text, ms_str = ph_text.strip(), ms_str.strip()
                    try:
                        ms = int(ms_str)
                    except ValueError:
                        continue
                    if ph_text in phrases:
                        idx = phrases.index(ph_text)
                        if idx > 0:
                            inter_silences[idx-1] = ms
        # --- Nouveau bloc : calcul du budget et renommage des silences ---
        duration_ms = int((end_s - start_s) * 1000)
        # budget de base = durée segment, on y ajoutera silences internes
        budget_ms = duration_ms + sum(inter_silences)
        segments.append({
            "start_s":             start_s,
            "end_s":               end_s,
            "original":            orig,
            "final_translation":   ft or orig,
            "voice_speed":         vs,
            "pre_silence":         pre,
            "post_silence":        post,
            "start_offset_ms":     start_offset,
            "end_offset_ms":       end_offset,
            "silences_internal":   inter_silences,
            "budget_ms":           budget_ms,
            "phrases":             phrases
        })

    print(f"✅ Parsed {len(segments)} segments depuis le review file.")
    return segments


# ============== TTS Functions: Edge TTS Only with Debug Logging ==============
async def robust_synthesize_phrase(
    phrase: str,
    output_path: str,
    voice: str = "fr-FR-DeniseNeural",
    rate: str = "+0%",
    max_retries: int = 10
):
    """
    Synthesize speech using Edge TTS with robust retry logic.
    Detailed debug messages are printed for each attempt.
    """
    for attempt in range(1, max_retries+1):
        try:
            async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=30)) as session:
                communicate = edge_tts.Communicate(
                    text=phrase,
                    voice=voice,
                    rate=rate
                )
                print(f"[Debug] Attempt {attempt}/{max_retries}: Synthesizing phrase: '{phrase[:30]}…'")
                await communicate.save(output_path)
                print(f"[Debug] Phrase synthesized successfully to {output_path}")
                return
        except Exception as e:
            wait_time = 2 ** attempt + random.random()
            print(f"[Error] Attempt {attempt}/{max_retries} failed for phrase: '{phrase[:30]}…'. Exception: {e}")
            if attempt < max_retries:
                print(f"[Debug] Retrying in {wait_time:.1f}s…")
                await asyncio.sleep(wait_time)
    raise RuntimeError(f"Failed to synthesize phrase after {max_retries} attempts: {phrase[:30]}…")

async def synthesize_phrase_edge_hybrid(
    phrase: str,
    output_path: str,
    voice: str = "fr-FR-DeniseNeural",
    rate: str = "+0%"
):
    # Pour compatibilité, on redirige vers le robust_synthesize
    await robust_synthesize_phrase(phrase, output_path, voice, rate)


def merge_short_phrases(phrases, weights, min_chars=40, max_chars=None):
    new_ph, new_wt = [], []
    buf_ph, buf_wt = "", 0.0
    for ph, wt in zip(phrases, weights):
        if not buf_ph:
            buf_ph, buf_wt = ph, wt
        else:
            if len(buf_ph) < min_chars or len(ph) < min_chars:
                cand = buf_ph + " " + ph
                # si pas de max_chars défini, on fusionne sans condition
                cond = True if max_chars is None else (len(cand) <= max_chars)
                if cond:
                    buf_ph = cand
                    buf_wt += wt
                else:
                    new_ph.append(buf_ph)
                    new_wt.append(buf_wt)
                    buf_ph, buf_wt = ph, wt
            else:
                new_ph.append(buf_ph)
                new_wt.append(buf_wt)
                buf_ph, buf_wt = ph, wt
    if buf_ph:
        new_ph.append(buf_ph)
        new_wt.append(buf_wt)
    return new_ph, new_wt



def split_long_phrasesaaa(phrases, max_chars=80):
    new = []
    for p in phrases:
        if len(p) > max_chars:
            # on découpe au premier “,” ou “ et ” qu’on trouve
            parts = re.split(r",\s+| et ", p, maxsplit=1)
            new.extend([parts[0].strip(), parts[1].strip()] if len(parts)==2 else [p])
        else:
            new.append(p)
    return new

async def async_generate_translated_audio_with_sync_using_review(
    subtitle_file_en, output_audio_path,
    debug_log_path, review_file_path
):
    # ─── 1) Génération / mise à jour du review file ───
    # groups = generate_translation_review_file(
    #     subtitle_source_path,
    #     review_file_path,
    #     max_group_duration_secs=25.0
    # )


    generate_translation_review_file_from_srt(
        source_srt = subtitle_file_en,
        review_file_path=review_file
    )




    # ─── 2) Lecture du review file enrichi ───
    segments = parse_review_file(review_file_path)
    
        # ─── Injecter les start_offset basés sur le end_offset du segment précédent ───
    for i in range(1, len(segments)):
        prev = segments[i-1]
        segments[i]['start_offset_ms'] = - prev['end_offset_ms']


    combined = AudioSegment.silent(duration=0)
    debug    = []

    for idx, seg in enumerate(segments):
        # ─── 3) Récupérations de base ───
        start_s  = seg["start_s"]
        end_s    = seg["end_s"]
        total_ms = int((end_s - start_s) * 1000)

        text   = seg["final_translation"]
        rate   = seg["voice_speed"]
        pre_ms = seg["pre_silence"]
        post_ms= seg["post_silence"]

        # ─── 4) Utilisation des phrases définies dans le review file ───
        # si l'utilisateur a listé ses phrases, on les prend ; sinon fallback automatique
        # if seg.get("phrases"):
        #     phrases = seg["phrases"]
        # else:
        phrases = split_french_phrases(text)
        # calcul des poids sur texte final
        weights = calculate_phrase_weights(text, phrases)
        # fusion éventuelle de mini-phrases trop courtes
        phrases, weights = merge_short_phrases(phrases, weights, min_chars=40, max_chars=None)

        # ─── 6) Budget TTS versus silences pré/post ───
        content_ms = max(0, total_ms - pre_ms - post_ms)

        # ─── 7) Synthèse phrase par phrase avec retry ───
        phrase_audios = []
        for i, ph in enumerate(phrases):
            dur_s   = (content_ms * weights[i]) / 1000.0
            tmp_mp3 = os.path.join(tempfile.gettempdir(), f"tmp_{idx}_{i}.mp3")

            await robust_synthesize_phrase(
                ph, tmp_mp3,
                voice="fr-FR-DeniseNeural",
                rate=rate
            )
            # aud = AudioSegment.from_mp3(tmp_mp3)
            # os.remove(tmp_mp3)
            
            # 1) charger dans une variable
            aud = AudioSegment.from_mp3(tmp_mp3)
            # 2) fermer le handle et essayer de supprimer
            try:
                os.remove(tmp_mp3)
            except PermissionError:
                # si Windows bloque, on ignore : le tmp sera nettoyé par le système ou au prochain run
                pass
            # 3) continuer à travailler sur aud

            # On ajuste strictement à la durée allouée
            aud = adjust_audio_duration(aud, dur_s)
            phrase_audios.append(aud)

        # ─── 8) Anti-dépassement TTS seul ───
        sum_tts = sum(a.duration_seconds * 1000 for a in phrase_audios)
        if sum_tts > content_ms and sum_tts > 0:
            factor = content_ms / sum_tts
            phrase_audios = [
                change_playback_speed(a, factor)
                for a in phrase_audios
            ]
            sum_tts = sum(a.duration_seconds * 1000 for a in phrase_audios)

        # ─── 9) Silences internes (manuels ou automatiques) ───
        n_inter   = max(0, len(phrases) - 1)
        # si l'utilisateur a rempli inter_phrase_silences dans le review file, on l'utilise
        #manual_inters = seg.get("inter_phrase_silences", [])
        manual_inters = seg.get("silences_internal", [])
        if manual_inters and len(manual_inters) == n_inter:
            inter_applied = manual_inters
        else:
            # budget restant en ms pour inter-phrases
            available = content_ms - sum(a.duration_seconds * 1000 for a in phrase_audios)
            if n_inter > 0 and available > 0:
                sil_ms = available // n_inter
                inter_applied = [sil_ms] * n_inter
            else:
                inter_applied = [0] * n_inter
                
                

        # ─── 10) Reconstruction du segment ───
        seq = []
        for i, aud in enumerate(phrase_audios):
            seq.append(aud)
            if i < len(inter_applied):
                seq.append(AudioSegment.silent(duration=inter_applied[i]))

        seg_audio = AudioSegment.silent(duration=pre_ms)
        for clip in seq:
            seg_audio += clip
        seg_audio += AudioSegment.silent(duration=post_ms)

        # ─── 11) Strip des silences internes TTS indésirables ───
        nons = detect_nonsilent(seg_audio, min_silence_len=1,
                                silence_thresh=seg_audio.dBFS - 16)
        if nons:
            seg_audio = seg_audio[nons[0][0]:]
        seg_audio = AudioSegment.silent(duration=pre_ms) + seg_audio

        # ─── 12) Pad ou trim strict au total_ms ───
        if len(seg_audio) < total_ms:
            seg_audio += AudioSegment.silent(duration=(total_ms - len(seg_audio)))
        seg_audio = seg_audio[:total_ms]

        # ─── 13) Debug timings ───
        nons2 = detect_nonsilent(seg_audio, min_silence_len=1,
                                 silence_thresh=seg_audio.dBFS - 16)
        start_a = nons2[0][0] if nons2 else pre_ms
        end_a   = nons2[-1][1] if nons2 else (total_ms - post_ms)
        abs_s_a = int(start_s * 1000) + start_a
        abs_e_a = int(start_s * 1000) + end_a
        abs_s_v = int(start_s * 1000)
        abs_e_v = int(end_s   * 1000)
        decal_start = abs_s_a - abs_s_v
        decal_end   = abs_e_a - abs_e_v

        # ─── 14) Warp global si nécessaire ───
        gen_dur = seg_audio.duration_seconds
        diff    = (end_s - start_s) - gen_dur
        if abs(diff) > 0.20:
            seg_audio = change_playback_speed(seg_audio, (end_s - start_s) / gen_dur)

        # ─── 15) Mix sur la timeline avec positionnement absolu ───
        base_ms   = int(start_s * 1000)
        soff      = seg.get("start_offset_ms", 0)
        eoff      = seg.get("end_offset_ms",   0)

        # 1) Appliquer l’offset de fin sur seg_audio
        if eoff > 0:
            seg_audio = seg_audio + AudioSegment.silent(duration=eoff)
        elif eoff < 0:
            seg_audio = seg_audio[:eoff]  # tronque les |eoff| derniers ms

        # 2) Calculer le point de départ absolu
        start_ms = base_ms + soff

        # 3) Forcer combined à exactement start_ms
        if len(combined) < start_ms:
            # la timeline est trop courte → on padde jusqu’à start_ms
            combined += AudioSegment.silent(duration=(start_ms - len(combined)))
        elif len(combined) > start_ms:
            # un précédent segment a débordé → on tronque pour revenir à start_ms
            combined = combined[:start_ms]

        # 4) Coller le segment audio
        combined += seg_audio


        # ─── 16) Log debug ───
        debug.append(
            f"Segment {idx+1} ({start_s:.2f}-{end_s:.2f}s): "
            f"pre={pre_ms}ms, post={post_ms}ms, speed={rate}, "
            f"silences_internal={inter_applied}, "
            f"décal_start={decal_start}ms, décal_end={decal_end}ms, "
            f"phrases={phrases}\n"
        )


    # ─── 17) Export debug & wav ───
    with open(debug_log_path, "w", encoding="utf-8") as df:
        df.write("Translation Debug Log\n\n")
        df.writelines(debug)
    combined.export(output_audio_path, format="wav")

    return output_audio_path



# ============== Merge Audio and Video Function ==============
def merge_audio_video():
    video = VideoFileClip(input_video)
    audio = AudioFileClip(translated_audio)
    if audio.duration < video.duration:
        extra_silence = AudioSegment.silent(duration=(video.duration - audio.duration) * 1000)
        audio_path_temp = os.path.join(output_dir, "temp_full_audio.wav")
        audio_seg = AudioSegment.from_file(translated_audio, format="wav")
        full_audio = audio_seg + extra_silence
        full_audio.export(audio_path_temp, format="wav")
        audio = AudioFileClip(audio_path_temp)
    video = video.set_audio(audio)
    video.write_videofile(
        output_video,
        codec="libx264",
        audio_codec="aac",
        temp_audiofile="temp-audio.m4a",
        remove_temp=True,
        threads=4
    )

# ============== Main Asynchronous Flow ==============
async def async_main():
    print("Extracting audio...")
    audio_path = extract_audio()
    print("Transcribing audio...")
    language, segments = transcribe(audio_path)
    print("Generating English subtitles...")
    generate_subtitle_file(segments, subtitle_file_en)
    print("Generating French audio with synchronization and manual overrides...")
    await async_generate_translated_audio_with_sync_using_review(subtitle_file_en, translated_audio, debug_log_file, review_file)
    print("Merging audio and video...")
    merge_audio_video()
    print(f"Process completed! Output video: {output_video}")

if __name__ == "__main__":
    asyncio.run(async_main())




✅ ffmpeg found at: C:\ffmpeg\bin\ffmpeg.EXE
Extracting audio...
Transcribing audio...
Detected language: en
Generating English subtitles...
Generating French audio with synchronization and manual overrides...
✅ Review file généré (128 segments) : 4.2.4_Configuration de la solution_Avr_10_Latest_run_20250509_090058\translation_review.txt
✅ Parsed 128 segments depuis le review file.
[Debug] Attempt 1/10: Synthesizing phrase: 'Je vais jeter un œil aux confi…'
[Debug] Phrase synthesized successfully to C:\Users\061181~1\AppData\Local\Temp\tmp_0_0.mp3
[Debug] Attempt 1/10: Synthesizing phrase: 'créer un rôle commercial ou un…'
[Debug] Phrase synthesized successfully to C:\Users\061181~1\AppData\Local\Temp\tmp_1_0.mp3
[Debug] Attempt 1/10: Synthesizing phrase: 'Fonctionne dans l'application …'
[Debug] Phrase synthesized successfully to C:\Users\061181~1\AppData\Local\Temp\tmp_2_0.mp3
[Debug] Attempt 1/10: Synthesizing phrase: 'formulaires de données. La séc…'
[Error] Attempt 1/10 failed for 

                                                                        

MoviePy - Done.
Moviepy - Writing video 4.2.4_Configuration de la solution_Avr_10_Latest_run_20250509_090058\4.2.4_Configuration de la solution_Avr_10_Latest-french.mp4



                                                                   

Moviepy - Done !
Moviepy - video ready 4.2.4_Configuration de la solution_Avr_10_Latest_run_20250509_090058\4.2.4_Configuration de la solution_Avr_10_Latest-french.mp4
Process completed! Output video: 4.2.4_Configuration de la solution_Avr_10_Latest_run_20250509_090058\4.2.4_Configuration de la solution_Avr_10_Latest-french.mp4
