REPLACE ONLY FINAL TRANSLATION SECTION

TESTA 18APR TRANSLATE VIDEO 

26 APR mety seg2

In [None]:
import os
import re
import ffmpeg
import pysrt
import time
from deep_translator import GoogleTranslator
from pydub import AudioSegment
from moviepy.editor import VideoFileClip, AudioFileClip
from faster_whisper import WhisperModel
from shutil import which
import nest_asyncio
from datetime import datetime
import tempfile
import asyncio
import edge_tts
import aiohttp
import ssl
import random
from pydub.silence import detect_nonsilent

nest_asyncio.apply()

# ----- Configuration -----
ffmpeg_path = which("ffmpeg")
if not ffmpeg_path:
    raise RuntimeError("ffmpeg not found. Please install ffmpeg first.")
print(f"‚úÖ ffmpeg found at: {ffmpeg_path}")

input_video = "to translate/4.2.4_Configuration de la solution_Avr_10_Latest.mp4"
base_name = os.path.splitext(os.path.basename(input_video))[0]
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_dir = f"{base_name}_run_{timestamp}"
model_size = "small"
update_existing = True

# For this version we rely on cloud-based Edge TTS.
USE_EDGE_TTS = True

# Files and paths
os.makedirs(output_dir, exist_ok=True)
input_video_name = os.path.splitext(os.path.basename(input_video))[0]
extracted_audio = os.path.join(output_dir, f"{input_video_name}-extracted-audio.wav")
subtitle_file_en = os.path.join(output_dir, f"{input_video_name}-english.srt")
translated_audio = os.path.join(output_dir, f"{input_video_name}-french.wav")
output_video = os.path.join(output_dir, f"{input_video_name}-french.mp4")
review_file = os.path.join(output_dir, "translation_review.txt")
debug_log_file = os.path.join(output_dir, "translation_debug_log.txt")

# ============== Helper Functions (extract_audio, transcribe, etc.) ==============
def extract_audio():
    try:
        (ffmpeg
         .input(input_video)
         .output(extracted_audio, ac=1, ar=16000)
         .overwrite_output()
         .run(capture_stdout=True, capture_stderr=True)
        )
        return extracted_audio
    except ffmpeg.Error as e:
        print("STDOUT:", e.stdout.decode("utf8"))
        print("STDERR:", e.stderr.decode("utf8"))
        raise

def transcribe(audio_path):
    model = WhisperModel(model_size, device="cpu", compute_type="int8")
    segments, info = model.transcribe(audio_path, beam_size=5)
    language = info.language
    print(f"Detected language: {language}")
    transcript_segments = []
    for segment in segments:
        transcript_segments.append({
            "start": segment.start,
            "end": segment.end,
            "text": segment.text.strip()
        })
    return language, transcript_segments

def time_to_subrip(seconds: float) -> pysrt.SubRipTime:
    hours = int(seconds // 3600)
    seconds %= 3600
    minutes = int(seconds // 60)
    seconds %= 60
    milliseconds = int((seconds - int(seconds)) * 1000)
    return pysrt.SubRipTime(hours=hours, minutes=minutes, seconds=int(seconds), milliseconds=milliseconds)

def generate_subtitle_file(segments, output_path):
    subs = pysrt.SubRipFile()
    for i, segment in enumerate(segments, 1):
        sub = pysrt.SubRipItem(
            index=i,
            start=time_to_subrip(segment["start"]),
            end=time_to_subrip(segment["end"]),
            text=segment["text"]
        )
        subs.append(sub)
    subs.save(output_path, encoding="utf-8")
    return output_path

# ============== Translation & Review Functions ==============

def split_long_groups(groups, max_group_duration_secs):
    """
    For each group (list of SubRipItems), if its duration > max_group_duration_secs,
    split it at the *last* subtitle in that group whose text ends in punctuation
    (.,!? or comma) before the duration threshold.
    Falls back to a simple split if no such ‚Äúsafe‚Äù break exists.
    """
    new_groups = []
    for group in groups:
        start_s = group[0].start.ordinal / 1000
        end_s   = group[-1].end.ordinal   / 1000
        total   = end_s - start_s

        # if already shorter than threshold, keep it
        if total <= max_group_duration_secs:
            new_groups.append(group)
            continue

        # otherwise walk through, tracking safe_breaks
        temp = []
        temp_start = start_s
        last_safe_idx = None
        for idx, item in enumerate(group):
            temp.append(item)
            # mark this idx if it ends in punctuation or comma
            if re.search(r"[.,!?]$", item.text.strip()):
                last_safe_idx = idx

            current_end = item.end.ordinal / 1000
            if (current_end - temp_start) >= max_group_duration_secs:
                # if we have a safe break before or at idx, split there
                if last_safe_idx is not None:
                    # emit group up through last_safe_idx
                    safe_group = temp[: last_safe_idx+1 ]
                    new_groups.append(safe_group)
                    # restart temp from the items after safe_idx
                    temp = temp[last_safe_idx+1 :]
                    temp_start = temp[0].start.ordinal / 1000 if temp else current_end
                else:
                    # no safe break‚Äîjust split at current idx
                    new_groups.append(temp)
                    temp = []
                    temp_start = current_end

                # reset safe marker
                last_safe_idx = None

        # anything left over
        if temp:
            new_groups.append(temp)

    return new_groups


def validate_audio_duration(original_segment, translated_audio):
    """Compares original video duration with generated audio"""
    video_dur = original_segment['end'] - original_segment['start']
    audio_dur = translated_audio.duration_seconds
    
    if abs(video_dur - audio_dur) > 0.5:  # 500ms tolerance
        compensation = (video_dur - audio_dur) * 1000  # ms
        if compensation > 0:
            return AudioSegment.silent(duration=compensation)
        else:
            return translated_audio[:int(compensation*1000)]  # ms to samples
    return translated_audio

def generate_phrase_audio(text, voice_speed):
    raw_audio = edge_tts.Communicate(text).audio
    processed = apply_speed_adjustment(raw_audio, voice_speed)
    
    # Detect and preserve natural phrase endings
    non_silent = detect_nonsilent(processed, min_silence_len=50, silence_thresh=-40)
    if non_silent:
        end_pad = 150  # Minimum ending padding
        new_end = max(non_silent[-1][1] + end_pad, len(processed))
        return processed[:new_end]
    return processed


def apply_speed_adjustment(raw_audio, speed_setting):
    speed_factor = 1 + (int(speed_setting.strip('%')) / 100)
    sped_up = raw_audio.speedup(
        playback_speed=speed_factor,
        chunk_size=150,
        crossfade=25
    )
    
    # Calculate duration difference
    original_dur = len(raw_audio)
    new_dur = len(sped_up)
    compensation = original_dur - new_dur
    
    if compensation > 0:
        return sped_up + AudioSegment.silent(duration=compensation)
    return sped_up





def parse_review_overrides(review_file_path):
    text   = open(review_file_path, "r", encoding="utf-8").read()
    # split on any line of 3+ hyphens
    blocks = re.split(r"(?m)^-{3,}\s*$", text)

    overrides = []
    for idx, blk in enumerate(blocks, start=1):
        blk = blk.strip()
        if not blk or blk.startswith("Translation Review File"):
            continue

        # defaults
        ft       = None
        vs       = "+0%"
        pre_ms   = 0.0
        post_ms  = 100.0
        inter_ms = []

        for line in blk.splitlines():
            if line.startswith("**Final Translation:**"):
                ft = line.split("**Final Translation:**",1)[1].strip()
            elif line.startswith("**Voice Speed:**"):
                vs = line.split("**Voice Speed:**",1)[1].strip()
            elif line.startswith("**Pre-Silence:**"):
                try: pre_ms = float(line.split("**Pre-Silence:**",1)[1])
                except: print(f"[Warn] Seg {idx}: bad Pre-Silence")
            elif line.startswith("**Post-Silence:**"):
                try: post_ms = float(line.split("**Post-Silence:**",1)[1])
                except: print(f"[Warn] Seg {idx}: bad Post-Silence")
            elif line.startswith("**Inter-Phrase-Silence:**"):
                            parts = line.split("**Inter-Phrase-Silence:**",1)[1].strip()
                            if parts:
                                try:
                                    # Force negative values to 0 and limit to 5000ms max
                                    raw = [float(x) for x in parts.split(",")]
                                    inter_ms = [ max(0, min(x, 5000)) for x in raw ]
                                except ValueError:
                                    print(f"[Warning] Segment {idx}: invalid Inter-Phrase-Silence list")
                                    inter_ms = []

        if ft is None:
            print(f"[Warn] Seg {idx}: no Final Translation‚Äîwill use source text.")

        overrides.append({
            "final_translation":      ft,
            "voice_speed":            vs,
            "pre_silence":            pre_ms,
            "post_silence":           post_ms,
            "inter_phrase_silences":  inter_ms
        })

    print("Parsed review overrides:")
    for i,o in enumerate(overrides,1):
        print(f"  Seg {i}: final={'OK' if o['final_translation'] else '<none>'}, "
              f"speed={o['voice_speed']}, pre={o['pre_silence']}ms, post={o['post_silence']}ms, "
              f"inter={o['inter_phrase_silences']}")
    return overrides





def parse_review_file(review_file_path):
    """
    Lit le review file et retourne une liste de dicts, un par segment, avec :
      - start_s, end_s : d√©but / fin du segment en secondes
      - original : texte original en anglais
      - final_translation : texte final en fran√ßais
      - voice_speed : vitesse synth√®se (e.g. "+0%")
      - pre_silence, post_silence : en millisecondes
      - inter_phrase_silences : liste de silences entre phrases (en ms)
      - phrases : liste de phrases fran√ßaises √† synth√©tiser
    """
    text = open(review_file_path, encoding="utf-8").read()
    # split sur toute ligne de 3+ tirets
    blocks = [b.strip() for b in re.split(r"(?m)^-{3,}\s*$", text) if b.strip()]
    segments = []
    header_re = re.compile(
        r"Segment\s+\d+\s+\(start:\s*([0-9.]+)s,\s*end:\s*([0-9.]+)s\)", re.I
    )

    for blk in blocks:
        if blk.startswith("Translation Review File"):
            continue
        m = header_re.search(blk)
        if not m:
            continue

        start_s = float(m.group(1))
        end_s   = float(m.group(2))

        # valeurs par d√©faut
        orig    = None
        ft      = None
        vs      = "+0%"
        pre     = 0.0
        post    = 100.0
        inter   = []

        for line in blk.splitlines():
            line = line.strip()
            if line.startswith("**Original:**"):
                orig = line.split("**Original:**",1)[1].strip()
            elif line.startswith("**Final Translation:**"):
                ft   = line.split("**Final Translation:**",1)[1].strip()
            elif line.startswith("**Voice Speed:**"):
                vs   = line.split("**Voice Speed:**",1)[1].strip()
            elif line.startswith("**Pre-Silence:**"):
                try:
                    pre = float(line.split("**Pre-Silence:**",1)[1].strip())
                except ValueError:
                    pass
            elif line.startswith("**Post-Silence:**"):
                try:
                    post = float(line.split("**Post-Silence:**",1)[1].strip())
                except ValueError:
                    pass
            elif line.startswith("**Inter-Phrase-Silence:**"):
                vals = line.split("**Inter-Phrase-Silence:**",1)[1].strip()
                if vals:
                    try:
                        raw = [float(x) for x in vals.split(",")]
                        # clamp √† [0, 5000] ms pour √©viter les exc√®s
                        inter = [max(0, min(v, 5000)) for v in raw]
                    except ValueError:
                        inter = []

        if orig is None:
            raise RuntimeError(f"Segment sans **Original** dans {review_file_path}")
        if ft is None:
            ft = orig  # fallback

        # d√©coupe finale en phrases fran√ßaises (accents et majuscules prises en compte)
        phrases = re.split(r"(?<=[\.!?])\s+(?=[A-Z√Ä√Ç√â√à√ä√ã√é√è√î≈í√ô√õ√ú])", ft)
        phrases = [p.strip() for p in phrases if p.strip()]

        segments.append({
            "start_s":               start_s,
            "end_s":                 end_s,
            "original":              orig,
            "final_translation":     ft,
            "voice_speed":           vs,
            "pre_silence":           pre,
            "post_silence":          post,
            "inter_phrase_silences": inter,
            "phrases":               phrases
        })

    print(f"‚úÖ Parsed {len(segments)} segments from review file.")
    return segments

def enforce_punctuation_boundaries(groups):
    """Ensure groups end with proper punctuation"""
    i = 0
    safe_punctuation = r"[.!?,;:]$"
    while i < len(groups):
        last_text = groups[i][-1].text.strip()
        if not re.search(safe_punctuation, last_text):
            if i+1 < len(groups):
                groups[i] += groups.pop(i+1)
            else:  # Add artificial pause for final group
                groups[i][-1].text += "."
        else:
            i += 1
    return groups



# ============== Audio Synchronization Functions ==============


def adjust_audio_duration(audio: AudioSegment, target_secs: float) -> AudioSegment:
    """
     Ajuste TTS clip pour qu'il tienne **exactement** dans target_secs :
     - Si l'audio est trop long, on le **tronque**.  
     - S'il est trop court, on ajoute du silence.  
    """
    target_ms = int(target_secs * 1000)
    curr_ms   = len(audio)
    if curr_ms > target_ms:
        # on coupe pr√©cis√©ment √† la dur√©e allou√©e
        return audio[:target_ms]
    elif curr_ms < target_ms:
            # on compl√®te par du silence
        return audio + AudioSegment.silent(duration=(target_ms - curr_ms))
    return audio


# ============== French Phrase Alignment Functions ==============
def split_french_phrases(text):
    phrases = re.split(r"(?<=[.!?])\s+(?=[A-Z])", text)
    return [phrase.strip() for phrase in phrases if phrase.strip()]

def calculate_phrase_weights(original_text, translated_phrases):
    fr_phrase_word_counts = [len(phrase.split()) for phrase in translated_phrases]
    total_fr_words = sum(fr_phrase_word_counts)
    if total_fr_words == 0:
        return [1 / len(translated_phrases)] * len(translated_phrases)
    return [count / total_fr_words for count in fr_phrase_word_counts]

# ============== TTS Functions: Edge TTS Only with Debug Logging ==============
async def robust_synthesize_phrase(phrase: str, output_path: str, voice: str = "fr-FR-DeniseNeural", rate: str = "+0%", max_retries: int = 10):
    """
    Synthesize speech using Edge TTS with robust retry logic.
    Detailed debug messages are printed for each attempt.
    Note: In Edge TTS v7.0.0 the 'session' parameter is not supported.
    """
    for attempt in range(max_retries):
        try:
            # Create a new session for each attempt.
            async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=30)) as session:
                communicate = edge_tts.Communicate(
                    text=phrase,
                    voice=voice,
                    rate=rate
                )
                print(f"[Debug] Attempt {attempt+1}: Synthesizing phrase: '{phrase}'")
                await communicate.save(output_path)
                print(f"[Debug] Phrase synthesized successfully to {output_path}")
                return
        except Exception as e:
            wait_time = 2 ** attempt + random.uniform(0, 1)
            print(f"[Error] Attempt {attempt+1}/{max_retries} failed for phrase: '{phrase}'. Exception: {e}")
            print(f"[Debug] Retrying in {wait_time:.2f} seconds...")
            await asyncio.sleep(wait_time)
    raise Exception(f"Failed to synthesize phrase after {max_retries} attempts: {phrase}")

async def synthesize_phrase(phrase: str, output_path: str, voice: str = "fr-FR-DeniseNeural", rate: str = "+0%"):
    await robust_synthesize_phrase(phrase, output_path, voice, rate)

# For backward compatibility:
async def synthesize_phrase_edge_hybrid(phrase: str, output_path: str, voice: str = "fr-FR-DeniseNeural", rate: str = "+0%"):
    await synthesize_phrase(phrase, output_path, voice, rate)

def change_playback_speed(sound, speed=1.0):
    new_frame_rate = int(sound.frame_rate * speed)
    altered_sound = sound._spawn(sound.raw_data, overrides={"frame_rate": new_frame_rate})
    return altered_sound.set_frame_rate(sound.frame_rate)

# ============== Updated Async Audio Generation Function ==============


def validate_audio_timing(original_duration, translated_segment):
    total_audio_time = (
        translated_segment["pre_silence"] 
        + sum(translated_segment["inter_phrase_silences"]) 
        + translated_segment["post_silence"] 
        + (translated_segment["audio"].duration_seconds * 1000)
    )
    
    if total_audio_time > original_duration * 1000:
        raise ValueError(f"Audio overflow: {total_audio_time}ms vs {original_duration*1000}ms")
    elif total_audio_time < original_duration * 1000 * 0.95:
        print(f"Warning: Audio underflow by {original_duration*1000 - total_audio_time}ms")




def generate_translation_review_file(
    source_path, review_file_path,
    from_lang="en", to_lang="fr",
    max_group_duration_secs: float = 25.0
):
    """
    1) Regroupe les sous-titres par phrases.
    2) Scinde les groupes trop longs.
    3) √âcrit le fichier de review en indiquant :
       - pre / post silence
       - budget total pour inter-phrases
       - inter-phrase-silence par d√©faut (0 pour chaque intervalle)
    """
    translator = GoogleTranslator(source=from_lang, target=to_lang)
    subs = pysrt.open(source_path)

    # 1) Regroupement par phrase
    sentence_end = re.compile(r"[.!?]\s*$")
    groups, current = [], []
    for sub in subs:
        current.append(sub)
        if sentence_end.search(sub.text):
            groups.append(current)
            current = []
    if current:
        groups.append(current)

    # 2) D√©coupe si trop long & enforcement ponctuation
    from itertools import chain
    def split_long_groups(groups, max_s):
        new = []
        for g in groups:
            start, end = g[0].start.ordinal/1000, g[-1].end.ordinal/1000
            if end-start <= max_s:
                new.append(g)
            else:
                mid = len(g)//2
                new.extend([g[:mid], g[mid:]])
        return new
    groups = split_long_groups(groups, max_group_duration_secs)

    def enforce_punctuation_boundaries(groups):
        i = 0
        while i < len(groups):
            last = groups[i][-1].text.strip()
            if not re.search(r"[.!?,;:]$", last):
                if i+1 < len(groups):
                    groups[i] += groups.pop(i+1)
                else:
                    groups[i][-1].text += "."
            else:
                i += 1
        return groups
    groups = enforce_punctuation_boundaries(groups)

    # 3) Write review file
    with open(review_file_path, "w", encoding="utf-8") as f:
        f.write("Translation Review File\n")
        f.write("Edit the **Final Translation** below. You can also adjust:\n")
        f.write("- **Voice Speed:** -10% to +10%\n")
        f.write("- **Pre-Silence/Post-Silence:** in milliseconds\n")
        f.write("- **Inter-Phrase-Silence:** comma-separated ms between phrases\n")
        f.write("  (must have one fewer value than phrases)\n")
        f.write("----------------------------------------------------------------\n\n")

        for idx, group in enumerate(groups, 1):
            start_s = group[0].start.ordinal/1000
            end_s   = group[-1].end.ordinal/1000
            original = " ".join(s.text for s in group)
            auto_tr  = translator.translate(text=original)

            # d√©coupe en phrases (Final Translation automatique)
            phrases = re.split(
                r"(?<=[.!?])\s+(?=[A-Z√Ä√Ç√â√à√ä√ã√é√è√î≈í√ô√õ√ú])",
                auto_tr
            )
            phrases = [p.strip() for p in phrases if p.strip()]

            # budget inter-phrases
            total_ms     = int((end_s - start_s)*1000)
            pre_ms, post_ms = 100, 100
            budget_inter = max(0, total_ms - pre_ms - post_ms)

            # silences par d√©faut = 0
            inter_default = ",".join("0" for _ in range(len(phrases)-1))

            # √©criture
            f.write(f"Segment {idx} (start: {start_s:.2f}s, end: {end_s:.2f}s):\n")
            f.write(f"**Original:** {original}\n")
            f.write(f"**Auto Translated:** {auto_tr}\n")
            f.write(f"**Final Translation:** {auto_tr}\n")
            f.write(f"**Voice Speed:** +0%\n")
            f.write(f"**Pre-Silence:** {pre_ms}\n")
            f.write(f"**Post-Silence:** {post_ms}\n")
            f.write(f"**Budget total Inter-Phrase-Silence (ms):** {budget_inter}\n")
            f.write(f"**Inter-Phrase-Silence:** {inter_default}\n")
            f.write("----------------------------------------------------------------\n\n")

    print(f"‚úÖ Review file created at: {review_file_path}  ({len(groups)} segments)")
    input("Type 'Y' when ready to continue: ")
    return groups


async def async_generate_translated_audio_with_sync_using_review(
    subtitle_source_path, output_audio_path,
    debug_log_path, review_file_path
):
    # ‚îÄ‚îÄ‚îÄ 1) G√©n√©ration / mise √† jour du review file ‚îÄ‚îÄ‚îÄ
    # Cette fonction va demander √† l'utilisateur de valider ou modifier le fichier.
    groups = generate_translation_review_file(
        subtitle_source_path,
        review_file_path,
        max_group_duration_secs=25.0
    )

    # ‚îÄ‚îÄ‚îÄ 2) Lecture du review file enrichi ‚îÄ‚îÄ‚îÄ
    # parse_review_file r√©cup√®re start_s, end_s, final_translation, voice_speed, pre/post et inter-phrase.
    segments = parse_review_file(review_file_path)

    # ‚îÄ‚îÄ‚îÄ 3) On s'assure d'avoir autant d'overrides que de groupes ‚îÄ‚îÄ‚îÄ
    default_ov = {
        "final_translation":    None,
        "voice_speed":          "+0%",
        "pre_silence":          0.0,
        "post_silence":         100.0,
        "inter_phrase_silences": []
    }
    # segments contient d√©j√† tous les champs dont on a besoin
    # Si vous voulez quand m√™me un fallback, vous pouvez l'ajouter ici.

    combined = AudioSegment.silent(duration=0)
    debug    = []

    for idx, seg in enumerate(segments):
        # on r√©cup√®re tout directement depuis seg dict
        start_s = seg["start_s"]
        end_s   = seg["end_s"]
        total_ms= int((end_s - start_s)*1000)

        text       = seg.get("final_translation", seg.get("original", ""))
        rate       = seg.get("voice_speed", "+0%")
        pre_ms     = seg.get("pre_silence", 0.0)
        post_ms    = seg.get("post_silence", 100.0)
        inter_user = seg.get("inter_phrase_silences", [])
                # d√©coupage en phrases + poids
        phrases   = split_french_phrases(text)
        weights   = calculate_phrase_weights(text, phrases)
        content_ms = max(0, total_ms - pre_ms - post_ms)

        # synth + ajustements identiques √† votre pipeline
        phrase_audios = []
        for i, ph in enumerate(phrases):
            dur      = content_ms * weights[i] / 1000.0
            tmp_path = os.path.join(tempfile.gettempdir(), f"tmp_{idx}_{i}.mp3")
            await synthesize_phrase_edge_hybrid(ph, tmp_path, voice="fr-FR-DeniseNeural", rate=rate)
            aud = AudioSegment.from_mp3(tmp_path); os.remove(tmp_path)
            aud = adjust_audio_duration(aud, dur)
            phrase_audios.append(aud)

        # # anti-d√©passement TTS seul
        # sum_tts = sum(a.duration_seconds*1000 for a in phrase_audios)
        # if sum_tts > content_ms and sum_tts>0:
        #     factor = content_ms / sum_tts
        #     phrase_audios = [change_playback_speed(a, factor) for a in phrase_audios]
        #     sum_tts = sum(a.duration_seconds*1000 for a in phrase_audios)

        # # anti-d√©passement silences users
        # available = content_ms - sum_tts
        # if available <= 0:
        #     inter_applied = [0]*len(inter_user)
        # elif sum(inter_user) <= available:
        #     inter_applied = inter_user.copy()
        # else:
        #     factor = available / sum(inter_user)
        #     inter_applied = [int(ms*factor) for ms in inter_user]
        
        # ‚îÄ‚îÄ‚îÄ ANTI-D√âPASSEMENT : PRIORIT√â AUX SILENCES UTILISATEUR ‚îÄ‚îÄ‚îÄ
        sum_tts = sum(a.duration_seconds*1000 for a in phrase_audios)
        sum_int = sum(inter_user)

        # Si TTS + silences > budget, on r√©duit d'abord le TTS
        if sum_tts + sum_int > content_ms and sum_tts > 0:
            factor_tts = max(0.0, (content_ms - sum_int) / sum_tts)
            phrase_audios = [
                change_playback_speed(aud, factor_tts)
                for aud in phrase_audios
            ]

        # On applique exactement les silences demand√©s par l'utilisateur
        inter_applied = inter_user.copy()

        # rebuild + pre/post silence + strip internal + pad/trim identique...
        seq = []
        for i, aud in enumerate(phrase_audios):
            seq.append(aud)
            if i < len(inter_applied):
                seq.append(AudioSegment.silent(duration=inter_applied[i]))

        seg_audio = AudioSegment.silent(duration=pre_ms)
        for clip in seq: seg_audio += clip
        seg_audio += AudioSegment.silent(duration=post_ms)

        non = detect_nonsilent(seg_audio, min_silence_len=1, silence_thresh=seg_audio.dBFS-16)
        if non:
            seg_audio = seg_audio[non[0][0]:]
        seg_audio = AudioSegment.silent(duration=pre_ms) + seg_audio

        if len(seg_audio) < total_ms:
            seg_audio += AudioSegment.silent(duration=(total_ms - len(seg_audio)))
        seg_audio = seg_audio[:total_ms]

        # mesures pour debug
        non2 = detect_nonsilent(seg_audio, min_silence_len=1, silence_thresh=seg_audio.dBFS-16)
        start_audio_ms = non2[0][0] if non2 else pre_ms
        end_audio_ms   = non2[-1][1] if non2 else (total_ms-post_ms)
        abs_start_a = int(start_s*1000)+start_audio_ms
        abs_end_a   = int(start_s*1000)+end_audio_ms
        abs_start_v = int(start_s*1000)
        abs_end_v   = int(end_s*1000)
        decal_start = abs_start_a - abs_start_v
        decal_end   = abs_end_a   - abs_end_v

        # √©ventuel warp global
        gen_dur = seg_audio.duration_seconds
        diff    = (end_s-start_s) - gen_dur
        if abs(diff) > 0.20:
            seg_audio = change_playback_speed(seg_audio, (end_s-start_s)/gen_dur)

        # mix sur timeline
        start_ms = int(start_s*1000)
        if len(combined) < start_ms:
            combined += AudioSegment.silent(duration=(start_ms-len(combined)))
        combined += seg_audio

        debug.append(
            f"Segment {idx+1} ({start_s:.2f}-{end_s:.2f}s): "
            f"pre={pre_ms}ms, post={post_ms}ms, speed={rate}, "
            f"inter_user={inter_user} ‚Üí inter_applied={inter_applied}, "
            f"d√©cal_start={decal_start}ms, d√©cal_end={decal_end}ms, "
            f"phrases={phrases}\n"
        )

    # export
    with open(debug_log_path, "w", encoding="utf-8") as df:
        df.write("Translation Debug Log\n\n")
        df.writelines(debug)
    combined.export(output_audio_path, format="wav")
    return output_audio_path

# ============== Merge Audio and Video Function ==============
def merge_audio_video():
    video = VideoFileClip(input_video)
    audio = AudioFileClip(translated_audio)
    if audio.duration < video.duration:
        extra_silence = AudioSegment.silent(duration=(video.duration - audio.duration) * 1000)
        audio_path_temp = os.path.join(output_dir, "temp_full_audio.wav")
        audio_seg = AudioSegment.from_file(translated_audio, format="wav")
        full_audio = audio_seg + extra_silence
        full_audio.export(audio_path_temp, format="wav")
        audio = AudioFileClip(audio_path_temp)
    video = video.set_audio(audio)
    video.write_videofile(
        output_video,
        codec="libx264",
        audio_codec="aac",
        temp_audiofile="temp-audio.m4a",
        remove_temp=True,
        threads=4
    )

# ============== Main Asynchronous Flow ==============
async def async_main():
    print("Extracting audio...")
    audio_path = extract_audio()
    print("Transcribing audio...")
    language, segments = transcribe(audio_path)
    print("Generating English subtitles...")
    generate_subtitle_file(segments, subtitle_file_en)
    print("Generating French audio with synchronization and manual overrides...")
    await async_generate_translated_audio_with_sync_using_review(subtitle_file_en, translated_audio, debug_log_file, review_file)
    print("Merging audio and video...")
    merge_audio_video()
    print(f"Process completed! Output video: {output_video}")

if __name__ == "__main__":
    asyncio.run(async_main())
 

‚úÖ ffmpeg found at: C:\ffmpeg\bin\ffmpeg.EXE
Extracting audio...
Transcribing audio...
Detected language: en
Generating English subtitles...
Generating French audio with synchronization and manual overrides...
‚úÖ Review file created at: 4.2.4_Configuration de la solution_Avr_10_Latest_run_20250426_083624\translation_review.txt  (27 segments)
‚úÖ Parsed 27 segments from review file.
[Debug] Attempt 1: Synthesizing phrase: 'Nous allons voir les configurations de l'application EPM.'
[Debug] Phrase synthesized successfully to C:\Users\061181~1\AppData\Local\Temp\tmp_0_0.mp3
[Debug] Attempt 1: Synthesizing phrase: 'Nous verrons comment cr√©er une r√®gle m√©tier ou une formule de membre.'
[Error] Attempt 1/10 failed for phrase: 'Nous verrons comment cr√©er une r√®gle m√©tier ou une formule de membre.'. Exception: Cannot connect to host speech.platform.bing.com:443 ssl:<ssl.SSLContext object at 0x00000187EB5B0320> [Une connexion existante a d√ª √™tre ferm√©e par l‚Äôh√¥te distant]
[Debug] R

                                                                        

MoviePy - Done.
Moviepy - Writing video 4.2.4_Configuration de la solution_Avr_10_Latest_run_20250426_083624\4.2.4_Configuration de la solution_Avr_10_Latest-french.mp4



                                                                  

Moviepy - Done !
Moviepy - video ready 4.2.4_Configuration de la solution_Avr_10_Latest_run_20250426_083624\4.2.4_Configuration de la solution_Avr_10_Latest-french.mp4
Process completed! Output video: 4.2.4_Configuration de la solution_Avr_10_Latest_run_20250426_083624\4.2.4_Configuration de la solution_Avr_10_Latest-french.mp4


26 apr to solve poids par phrase

In [None]:
import os
import re
import ffmpeg
import pysrt
import time
from deep_translator import GoogleTranslator
from pydub import AudioSegment
from moviepy.editor import VideoFileClip, AudioFileClip
from faster_whisper import WhisperModel
from shutil import which
import nest_asyncio
from datetime import datetime
import tempfile
import asyncio
import edge_tts
import aiohttp
import ssl
import random
from pydub.silence import detect_nonsilent

nest_asyncio.apply()

# ----- Configuration -----
ffmpeg_path = which("ffmpeg")
if not ffmpeg_path:
    raise RuntimeError("ffmpeg not found. Please install ffmpeg first.")
print(f"‚úÖ ffmpeg found at: {ffmpeg_path}")

input_video = "to translate/4.2.4_Configuration de la solution_Avr_10_Latest.mp4"
base_name = os.path.splitext(os.path.basename(input_video))[0]
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_dir = f"{base_name}_run_{timestamp}"
model_size = "small"
update_existing = True

# For this version we rely on cloud-based Edge TTS.
USE_EDGE_TTS = True

# Files and paths
os.makedirs(output_dir, exist_ok=True)
input_video_name = os.path.splitext(os.path.basename(input_video))[0]
extracted_audio = os.path.join(output_dir, f"{input_video_name}-extracted-audio.wav")
subtitle_file_en = os.path.join(output_dir, f"{input_video_name}-english.srt")
translated_audio = os.path.join(output_dir, f"{input_video_name}-french.wav")
output_video = os.path.join(output_dir, f"{input_video_name}-french.mp4")
review_file = os.path.join(output_dir, "translation_review.txt")
debug_log_file = os.path.join(output_dir, "translation_debug_log.txt")

# ============== Helper Functions (extract_audio, transcribe, etc.) ==============
def extract_audio():
    try:
        (ffmpeg
         .input(input_video)
         .output(extracted_audio, ac=1, ar=16000)
         .overwrite_output()
         .run(capture_stdout=True, capture_stderr=True)
        )
        return extracted_audio
    except ffmpeg.Error as e:
        print("STDOUT:", e.stdout.decode("utf8"))
        print("STDERR:", e.stderr.decode("utf8"))
        raise

def transcribe(audio_path):
    model = WhisperModel(model_size, device="cpu", compute_type="int8")
    segments, info = model.transcribe(audio_path, beam_size=5)
    language = info.language
    print(f"Detected language: {language}")
    transcript_segments = []
    for segment in segments:
        transcript_segments.append({
            "start": segment.start,
            "end": segment.end,
            "text": segment.text.strip()
        })
    return language, transcript_segments

def time_to_subrip(seconds: float) -> pysrt.SubRipTime:
    hours = int(seconds // 3600)
    seconds %= 3600
    minutes = int(seconds // 60)
    seconds %= 60
    milliseconds = int((seconds - int(seconds)) * 1000)
    return pysrt.SubRipTime(hours=hours, minutes=minutes, seconds=int(seconds), milliseconds=milliseconds)

def generate_subtitle_file(segments, output_path):
    subs = pysrt.SubRipFile()
    for i, segment in enumerate(segments, 1):
        sub = pysrt.SubRipItem(
            index=i,
            start=time_to_subrip(segment["start"]),
            end=time_to_subrip(segment["end"]),
            text=segment["text"]
        )
        subs.append(sub)
    subs.save(output_path, encoding="utf-8")
    return output_path

# ============== Translation & Review Functions ==============

def split_long_groups(groups, max_group_duration_secs):
    """
    For each group (list of SubRipItems), if its duration > max_group_duration_secs,
    split it at the *last* subtitle in that group whose text ends in punctuation
    (.,!? or comma) before the duration threshold.
    Falls back to a simple split if no such ‚Äúsafe‚Äù break exists.
    """
    new_groups = []
    for group in groups:
        start_s = group[0].start.ordinal / 1000
        end_s   = group[-1].end.ordinal   / 1000
        total   = end_s - start_s

        # if already shorter than threshold, keep it
        if total <= max_group_duration_secs:
            new_groups.append(group)
            continue

        # otherwise walk through, tracking safe_breaks
        temp = []
        temp_start = start_s
        last_safe_idx = None
        for idx, item in enumerate(group):
            temp.append(item)
            # mark this idx if it ends in punctuation or comma
            if re.search(r"[.,!?]$", item.text.strip()):
                last_safe_idx = idx

            current_end = item.end.ordinal / 1000
            if (current_end - temp_start) >= max_group_duration_secs:
                # if we have a safe break before or at idx, split there
                if last_safe_idx is not None:
                    # emit group up through last_safe_idx
                    safe_group = temp[: last_safe_idx+1 ]
                    new_groups.append(safe_group)
                    # restart temp from the items after safe_idx
                    temp = temp[last_safe_idx+1 :]
                    temp_start = temp[0].start.ordinal / 1000 if temp else current_end
                else:
                    # no safe break‚Äîjust split at current idx
                    new_groups.append(temp)
                    temp = []
                    temp_start = current_end

                # reset safe marker
                last_safe_idx = None

        # anything left over
        if temp:
            new_groups.append(temp)

    return new_groups


def validate_audio_duration(original_segment, translated_audio):
    """Compares original video duration with generated audio"""
    video_dur = original_segment['end'] - original_segment['start']
    audio_dur = translated_audio.duration_seconds
    
    if abs(video_dur - audio_dur) > 0.5:  # 500ms tolerance
        compensation = (video_dur - audio_dur) * 1000  # ms
        if compensation > 0:
            return AudioSegment.silent(duration=compensation)
        else:
            return translated_audio[:int(compensation*1000)]  # ms to samples
    return translated_audio

def generate_phrase_audio(text, voice_speed):
    raw_audio = edge_tts.Communicate(text).audio
    processed = apply_speed_adjustment(raw_audio, voice_speed)
    
    # Detect and preserve natural phrase endings
    non_silent = detect_nonsilent(processed, min_silence_len=50, silence_thresh=-40)
    if non_silent:
        end_pad = 150  # Minimum ending padding
        new_end = max(non_silent[-1][1] + end_pad, len(processed))
        return processed[:new_end]
    return processed


def apply_speed_adjustment(raw_audio, speed_setting):
    speed_factor = 1 + (int(speed_setting.strip('%')) / 100)
    sped_up = raw_audio.speedup(
        playback_speed=speed_factor,
        chunk_size=150,
        crossfade=25
    )
    
    # Calculate duration difference
    original_dur = len(raw_audio)
    new_dur = len(sped_up)
    compensation = original_dur - new_dur
    
    if compensation > 0:
        return sped_up + AudioSegment.silent(duration=compensation)
    return sped_up





def parse_review_overrides(review_file_path):
    text   = open(review_file_path, "r", encoding="utf-8").read()
    # split on any line of 3+ hyphens
    blocks = re.split(r"(?m)^-{3,}\s*$", text)

    overrides = []
    for idx, blk in enumerate(blocks, start=1):
        blk = blk.strip()
        if not blk or blk.startswith("Translation Review File"):
            continue

        # defaults
        ft       = None
        vs       = "+0%"
        pre_ms   = 0.0
        post_ms  = 0.0
        inter_ms = []

        for line in blk.splitlines():
            if line.startswith("**Final Translation:**"):
                ft = line.split("**Final Translation:**",1)[1].strip()
            elif line.startswith("**Voice Speed:**"):
                vs = line.split("**Voice Speed:**",1)[1].strip()
            elif line.startswith("**Pre-Silence:**"):
                try: pre_ms = float(line.split("**Pre-Silence:**",1)[1])
                except: print(f"[Warn] Seg {idx}: bad Pre-Silence")
            elif line.startswith("**Post-Silence:**"):
                try: post_ms = float(line.split("**Post-Silence:**",1)[1])
                except: print(f"[Warn] Seg {idx}: bad Post-Silence")
            elif line.startswith("**Inter-Phrase-Silence:**"):
                            parts = line.split("**Inter-Phrase-Silence:**",1)[1].strip()
                            if parts:
                                try:
                                    # Force negative values to 0 and limit to 5000ms max
                                    raw = [float(x) for x in parts.split(",")]
                                    inter_ms = [ max(0, min(x, 5000)) for x in raw ]
                                except ValueError:
                                    print(f"[Warning] Segment {idx}: invalid Inter-Phrase-Silence list")
                                    inter_ms = []

        if ft is None:
            print(f"[Warn] Seg {idx}: no Final Translation‚Äîwill use source text.")

        overrides.append({
            "final_translation":      ft,
            "voice_speed":            vs,
            "pre_silence":            pre_ms,
            "post_silence":           post_ms,
            "inter_phrase_silences":  inter_ms
        })

    print("Parsed review overrides:")
    for i,o in enumerate(overrides,1):
        print(f"  Seg {i}: final={'OK' if o['final_translation'] else '<none>'}, "
              f"speed={o['voice_speed']}, pre={o['pre_silence']}ms, post={o['post_silence']}ms, "
              f"inter={o['inter_phrase_silences']}")
    return overrides





def parse_review_file(review_file_path):
    """
    Lit le review file et retourne une liste de dicts, un par segment, avec :
      - start_s, end_s : d√©but / fin du segment en secondes
      - original : texte original en anglais
      - final_translation : texte final en fran√ßais
      - voice_speed : vitesse synth√®se (e.g. "+0%")
      - pre_silence, post_silence : en millisecondes
      - inter_phrase_silences : liste de silences entre phrases (en ms)
      - phrases : liste de phrases fran√ßaises √† synth√©tiser
    """
    text = open(review_file_path, encoding="utf-8").read()
    # split sur toute ligne de 3+ tirets
    blocks = [b.strip() for b in re.split(r"(?m)^-{3,}\s*$", text) if b.strip()]
    segments = []
    header_re = re.compile(
        r"Segment\s+\d+\s+\(start:\s*([0-9.]+)s,\s*end:\s*([0-9.]+)s\)", re.I
    )

    for blk in blocks:
        if blk.startswith("Translation Review File"):
            continue
        m = header_re.search(blk)
        if not m:
            continue

        start_s = float(m.group(1))
        end_s   = float(m.group(2))

        # valeurs par d√©faut
        orig    = None
        ft      = None
        vs      = "+0%"
        pre     = 0.0
        post    = 100.0
        inter   = []

        for line in blk.splitlines():
            line = line.strip()
            if line.startswith("**Original:**"):
                orig = line.split("**Original:**",1)[1].strip()
            elif line.startswith("**Final Translation:**"):
                ft   = line.split("**Final Translation:**",1)[1].strip()
            elif line.startswith("**Voice Speed:**"):
                vs   = line.split("**Voice Speed:**",1)[1].strip()
            elif line.startswith("**Pre-Silence:**"):
                try:
                    pre = float(line.split("**Pre-Silence:**",1)[1].strip())
                except ValueError:
                    pass
            elif line.startswith("**Post-Silence:**"):
                try:
                    post = float(line.split("**Post-Silence:**",1)[1].strip())
                except ValueError:
                    pass
            elif line.startswith("**Inter-Phrase-Silence:**"):
                vals = line.split("**Inter-Phrase-Silence:**",1)[1].strip()
                if vals:
                    try:
                        raw = [float(x) for x in vals.split(",")]
                        # clamp √† [0, 5000] ms pour √©viter les exc√®s
                        inter = [max(0, min(v, 5000)) for v in raw]
                    except ValueError:
                        inter = []

        if orig is None:
            raise RuntimeError(f"Segment sans **Original** dans {review_file_path}")
        if ft is None:
            ft = orig  # fallback

        # d√©coupe finale en phrases fran√ßaises (accents et majuscules prises en compte)
        phrases = re.split(r"(?<=[\.!?])\s+(?=[A-Z√Ä√Ç√â√à√ä√ã√é√è√î≈í√ô√õ√ú])", ft)
        phrases = [p.strip() for p in phrases if p.strip()]

        segments.append({
            "start_s":               start_s,
            "end_s":                 end_s,
            "original":              orig,
            "final_translation":     ft,
            "voice_speed":           vs,
            "pre_silence":           pre,
            "post_silence":          post,
            "inter_phrase_silences": inter,
            "phrases":               phrases
        })

    print(f"‚úÖ Parsed {len(segments)} segments from review file.")
    return segments

def enforce_punctuation_boundaries(groups):
    """Ensure groups end with proper punctuation"""
    i = 0
    safe_punctuation = r"[.!?,;:]$"
    while i < len(groups):
        last_text = groups[i][-1].text.strip()
        if not re.search(safe_punctuation, last_text):
            if i+1 < len(groups):
                groups[i] += groups.pop(i+1)
            else:  # Add artificial pause for final group
                groups[i][-1].text += "."
        else:
            i += 1
    return groups



# ============== Audio Synchronization Functions ==============


def adjust_audio_duration(audio: AudioSegment, target_secs: float) -> AudioSegment:
    """
     Ajuste TTS clip pour qu'il tienne **exactement** dans target_secs :
     - Si l'audio est trop long, on le **tronque**.  
     - S'il est trop court, on ajoute du silence.  
    """
    target_ms = int(target_secs * 1000)
    curr_ms   = len(audio)
    if curr_ms > target_ms:
        # on coupe pr√©cis√©ment √† la dur√©e allou√©e
        return audio[:target_ms]
    elif curr_ms < target_ms:
            # on compl√®te par du silence
        return audio + AudioSegment.silent(duration=(target_ms - curr_ms))
    return audio


# ============== French Phrase Alignment Functions ==============
def split_french_phrases(text):
    phrases = re.split(r"(?<=[.!?])\s+(?=[A-Z])", text)
    return [phrase.strip() for phrase in phrases if phrase.strip()]

def calculate_phrase_weights(original_text, translated_phrases):
    fr_phrase_word_counts = [len(phrase.split()) for phrase in translated_phrases]
    total_fr_words = sum(fr_phrase_word_counts)
    if total_fr_words == 0:
        return [1 / len(translated_phrases)] * len(translated_phrases)
    return [count / total_fr_words for count in fr_phrase_word_counts]

# ============== TTS Functions: Edge TTS Only with Debug Logging ==============
async def robust_synthesize_phrase(phrase: str, output_path: str, voice: str = "fr-FR-DeniseNeural", rate: str = "+0%", max_retries: int = 10):
    """
    Synthesize speech using Edge TTS with robust retry logic.
    Detailed debug messages are printed for each attempt.
    Note: In Edge TTS v7.0.0 the 'session' parameter is not supported.
    """
    for attempt in range(max_retries):
        try:
            # Create a new session for each attempt.
            async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=30)) as session:
                communicate = edge_tts.Communicate(
                    text=phrase,
                    voice=voice,
                    rate=rate
                )
                print(f"[Debug] Attempt {attempt+1}: Synthesizing phrase: '{phrase}'")
                await communicate.save(output_path)
                print(f"[Debug] Phrase synthesized successfully to {output_path}")
                return
        except Exception as e:
            wait_time = 2 ** attempt + random.uniform(0, 1)
            print(f"[Error] Attempt {attempt+1}/{max_retries} failed for phrase: '{phrase}'. Exception: {e}")
            print(f"[Debug] Retrying in {wait_time:.2f} seconds...")
            await asyncio.sleep(wait_time)
    raise Exception(f"Failed to synthesize phrase after {max_retries} attempts: {phrase}")

async def synthesize_phrase(phrase: str, output_path: str, voice: str = "fr-FR-DeniseNeural", rate: str = "+0%"):
    await robust_synthesize_phrase(phrase, output_path, voice, rate)

# For backward compatibility:
async def synthesize_phrase_edge_hybrid(phrase: str, output_path: str, voice: str = "fr-FR-DeniseNeural", rate: str = "+0%"):
    await synthesize_phrase(phrase, output_path, voice, rate)

def change_playback_speed(sound, speed=1.0):
    new_frame_rate = int(sound.frame_rate * speed)
    altered_sound = sound._spawn(sound.raw_data, overrides={"frame_rate": new_frame_rate})
    return altered_sound.set_frame_rate(sound.frame_rate)

# ============== Updated Async Audio Generation Function ==============


def validate_audio_timing(original_duration, translated_segment):
    total_audio_time = (
        translated_segment["pre_silence"] 
        + sum(translated_segment["inter_phrase_silences"]) 
        + translated_segment["post_silence"] 
        + (translated_segment["audio"].duration_seconds * 1000)
    )
    
    if total_audio_time > original_duration * 1000:
        raise ValueError(f"Audio overflow: {total_audio_time}ms vs {original_duration*1000}ms")
    elif total_audio_time < original_duration * 1000 * 0.95:
        print(f"Warning: Audio underflow by {original_duration*1000 - total_audio_time}ms")


def merge_short_phrases(phrases: list[str],
                        weights: list[float],
                        min_chars: int = 30
                       ) -> tuple[list[str], list[float]]:
    """
    Parcourt la liste `phrases` et si une phrase fait moins de `min_chars`
    caract√®res, on la fusionne avec la phrase pr√©c√©dente, en ajustant les poids.
    Retourne (phrases_fusionnees, poids_normalises).
    """
    if not phrases:
        return [], []

    merged_phrases = [phrases[0]]
    merged_weights = [weights[0]]

    for ph, w in zip(phrases[1:], weights[1:]):
        if len(ph) < min_chars:
            # on fusionne dans l'√©l√©ment courant
            merged_phrases[-1] += " " + ph
            merged_weights[-1] += w
        else:
            merged_phrases.append(ph)
            merged_weights.append(w)

    # renormalisation
    total = sum(merged_weights)
    if total > 0:
        merged_weights = [w/total for w in merged_weights]

    return merged_phrases, merged_weights



def generate_translation_review_file(
    source_path, review_file_path,
    from_lang="en", to_lang="fr",
    max_group_duration_secs: float = 25.0
):
    """
    1) Regroupe les sous-titres par phrases.
    2) Scinde les groupes trop longs.
    3) √âcrit le fichier de review en indiquant :
       - pre / post silence
       - budget total pour inter-phrases
       - inter-phrase-silence par d√©faut (0 pour chaque intervalle)
    """
    translator = GoogleTranslator(source=from_lang, target=to_lang)
    subs = pysrt.open(source_path)

    # 1) Regroupement par phrase
    sentence_end = re.compile(r"[.!?]\s*$")
    groups, current = [], []
    for sub in subs:
        current.append(sub)
        if sentence_end.search(sub.text):
            groups.append(current)
            current = []
    if current:
        groups.append(current)

    # 2) D√©coupe si trop long & enforcement ponctuation
    from itertools import chain
    def split_long_groups(groups, max_s):
        new = []
        for g in groups:
            start, end = g[0].start.ordinal/1000, g[-1].end.ordinal/1000
            if end-start <= max_s:
                new.append(g)
            else:
                mid = len(g)//2
                new.extend([g[:mid], g[mid:]])
        return new
    groups = split_long_groups(groups, max_group_duration_secs)

    def enforce_punctuation_boundaries(groups):
        i = 0
        while i < len(groups):
            last = groups[i][-1].text.strip()
            if not re.search(r"[.!?,;:]$", last):
                if i+1 < len(groups):
                    groups[i] += groups.pop(i+1)
                else:
                    groups[i][-1].text += "."
            else:
                i += 1
        return groups
    groups = enforce_punctuation_boundaries(groups)

    # 3) Write review file
    with open(review_file_path, "w", encoding="utf-8") as f:
        f.write("Translation Review File\n")
        f.write("Edit the **Final Translation** below. You can also adjust:\n")
        f.write("- **Voice Speed:** -10% to +10%\n")
        f.write("- **Pre-Silence/Post-Silence:** in milliseconds\n")
        f.write("- **Inter-Phrase-Silence:** comma-separated ms between phrases\n")
        f.write("  (must have one fewer value than phrases)\n")
        f.write("----------------------------------------------------------------\n\n")

        for idx, group in enumerate(groups, 1):
            start_s = group[0].start.ordinal/1000
            end_s   = group[-1].end.ordinal/1000
            original = " ".join(s.text for s in group)
            auto_tr  = translator.translate(text=original)

            # d√©coupe en phrases (Final Translation automatique)
            phrases = re.split(
                r"(?<=[.!?])\s+(?=[A-Z√Ä√Ç√â√à√ä√ã√é√è√î≈í√ô√õ√ú])",
                auto_tr
            )
            phrases = [p.strip() for p in phrases if p.strip()]

            # budget inter-phrases
            total_ms     = int((end_s - start_s)*1000)
            pre_ms, post_ms = 0, 0
            budget_inter = max(0, total_ms - pre_ms - post_ms)

            # silences par d√©faut = 0
            inter_default = ",".join("0" for _ in range(len(phrases)-1))

            # √©criture
            f.write(f"Segment {idx} (start: {start_s:.2f}s, end: {end_s:.2f}s):\n")
            f.write(f"**Original:** {original}\n")
            f.write(f"**Auto Translated:** {auto_tr}\n")
            f.write(f"**Final Translation:** {auto_tr}\n")
            f.write(f"**Voice Speed:** +0%\n")
            f.write(f"**Pre-Silence:** {pre_ms}\n")
            f.write(f"**Post-Silence:** {post_ms}\n")
            f.write(f"**Budget total Inter-Phrase-Silence (ms):** {budget_inter}\n")
            f.write(f"**Inter-Phrase-Silence:** {inter_default}\n")
            f.write("----------------------------------------------------------------\n\n")

    print(f"‚úÖ Review file created at: {review_file_path}  ({len(groups)} segments)")
    input("Type 'Y' when ready to continue: ")
    return groups


async def async_generate_translated_audio_with_sync_using_review(
    subtitle_source_path, output_audio_path,
    debug_log_path, review_file_path
):
    # ‚îÄ‚îÄ‚îÄ 1) G√©n√©ration / mise √† jour du review file ‚îÄ‚îÄ‚îÄ
    # Cette fonction va demander √† l'utilisateur de valider ou modifier le fichier.
    groups = generate_translation_review_file(
        subtitle_source_path,
        review_file_path,
        max_group_duration_secs=25.0
    )

    # ‚îÄ‚îÄ‚îÄ 2) Lecture du review file enrichi ‚îÄ‚îÄ‚îÄ
    # parse_review_file r√©cup√®re start_s, end_s, final_translation, voice_speed, pre/post et inter-phrase.
    segments = parse_review_file(review_file_path)

    # ‚îÄ‚îÄ‚îÄ 3) On s'assure d'avoir autant d'overrides que de groupes ‚îÄ‚îÄ‚îÄ
    default_ov = {
        "final_translation":    None,
        "voice_speed":          "+0%",
        "pre_silence":          0.0,
        "post_silence":         100.0,
        "inter_phrase_silences": []
    }
    # segments contient d√©j√† tous les champs dont on a besoin
    # Si vous voulez quand m√™me un fallback, vous pouvez l'ajouter ici.

    combined = AudioSegment.silent(duration=0)
    debug    = []

    for idx, seg in enumerate(segments):
        # on r√©cup√®re tout directement depuis seg dict
        start_s = seg["start_s"]
        end_s   = seg["end_s"]
        total_ms= int((end_s - start_s)*1000)

        text       = seg.get("final_translation", seg.get("original", ""))
        rate       = seg.get("voice_speed", "+0%")
        pre_ms     = seg.get("pre_silence", 0.0)
        post_ms    = seg.get("post_silence", 100.0)
        inter_user = seg.get("inter_phrase_silences", [])
                # d√©coupage en phrases + poids
        # phrases   = split_french_phrases(text)
        # weights   = calculate_phrase_weights(text, phrases)
        
        # ‚Äî 3) split en phrases + calcul de poids initial
        phrases = split_french_phrases(text)
        weights = calculate_phrase_weights(text, phrases)

        # ‚Äî 4) fusion it√©rative de TOUTES les phrases trop courtes
        #     min_chars = seuil en nombre de caract√®res pour consid√©rer
        #     qu'une phrase est "trop petite" et doit fusionner
        phrases, weights = merge_short_phrases(phrases, weights, min_chars=30)

        # ‚Äî 5) recalcul de content_ms (inchang√©)
        content_ms = max(0, total_ms - pre_ms - post_ms)
        

        # synth + ajustements identiques √† votre pipeline
        phrase_audios = []
        for i, ph in enumerate(phrases):
            dur      = content_ms * weights[i] / 1000.0
            tmp_path = os.path.join(tempfile.gettempdir(), f"tmp_{idx}_{i}.mp3")
            await synthesize_phrase_edge_hybrid(ph, tmp_path, voice="fr-FR-DeniseNeural", rate=rate)
            aud = AudioSegment.from_mp3(tmp_path); os.remove(tmp_path)
            aud = adjust_audio_duration(aud, dur)
            phrase_audios.append(aud)

        # # anti-d√©passement TTS seul
        # sum_tts = sum(a.duration_seconds*1000 for a in phrase_audios)
        # if sum_tts > content_ms and sum_tts>0:
        #     factor = content_ms / sum_tts
        #     phrase_audios = [change_playback_speed(a, factor) for a in phrase_audios]
        #     sum_tts = sum(a.duration_seconds*1000 for a in phrase_audios)

        # # anti-d√©passement silences users
        # available = content_ms - sum_tts
        # if available <= 0:
        #     inter_applied = [0]*len(inter_user)
        # elif sum(inter_user) <= available:
        #     inter_applied = inter_user.copy()
        # else:
        #     factor = available / sum(inter_user)
        #     inter_applied = [int(ms*factor) for ms in inter_user]
        
        # ‚îÄ‚îÄ‚îÄ ANTI-D√âPASSEMENT : PRIORIT√â AUX SILENCES UTILISATEUR ‚îÄ‚îÄ‚îÄ
        sum_tts = sum(a.duration_seconds*1000 for a in phrase_audios)
        sum_int = sum(inter_user)

        # Si TTS + silences > budget, on r√©duit d'abord le TTS
        if sum_tts + sum_int > content_ms and sum_tts > 0:
            factor_tts = max(0.0, (content_ms - sum_int) / sum_tts)
            phrase_audios = [
                change_playback_speed(aud, factor_tts)
                for aud in phrase_audios
            ]

        # On applique exactement les silences demand√©s par l'utilisateur
        inter_applied = inter_user.copy()

        # rebuild + pre/post silence + strip internal + pad/trim identique...
        seq = []
        for i, aud in enumerate(phrase_audios):
            seq.append(aud)
            if i < len(inter_applied):
                seq.append(AudioSegment.silent(duration=inter_applied[i]))

        seg_audio = AudioSegment.silent(duration=pre_ms)
        for clip in seq: seg_audio += clip
        seg_audio += AudioSegment.silent(duration=post_ms)

        non = detect_nonsilent(seg_audio, min_silence_len=1, silence_thresh=seg_audio.dBFS-16)
        if non:
            seg_audio = seg_audio[non[0][0]:]
        seg_audio = AudioSegment.silent(duration=pre_ms) + seg_audio

        if len(seg_audio) < total_ms:
            seg_audio += AudioSegment.silent(duration=(total_ms - len(seg_audio)))
        seg_audio = seg_audio[:total_ms]

        # mesures pour debug
        non2 = detect_nonsilent(seg_audio, min_silence_len=1, silence_thresh=seg_audio.dBFS-16)
        start_audio_ms = non2[0][0] if non2 else pre_ms
        end_audio_ms   = non2[-1][1] if non2 else (total_ms-post_ms)
        abs_start_a = int(start_s*1000)+start_audio_ms
        abs_end_a   = int(start_s*1000)+end_audio_ms
        abs_start_v = int(start_s*1000)
        abs_end_v   = int(end_s*1000)
        decal_start = abs_start_a - abs_start_v
        decal_end   = abs_end_a   - abs_end_v

        # √©ventuel warp global
        gen_dur = seg_audio.duration_seconds
        diff    = (end_s-start_s) - gen_dur
        if abs(diff) > 0.20:
            seg_audio = change_playback_speed(seg_audio, (end_s-start_s)/gen_dur)

        # mix sur timeline
        start_ms = int(start_s*1000)
        if len(combined) < start_ms:
            combined += AudioSegment.silent(duration=(start_ms-len(combined)))
        combined += seg_audio

        debug.append(
            f"Segment {idx+1} ({start_s:.2f}-{end_s:.2f}s): "
            f"pre={pre_ms}ms, post={post_ms}ms, speed={rate}, "
            f"inter_user={inter_user} ‚Üí inter_applied={inter_applied}, "
            f"d√©cal_start={decal_start}ms, d√©cal_end={decal_end}ms, "
            f"phrases={phrases}\n"
        )

    # export
    with open(debug_log_path, "w", encoding="utf-8") as df:
        df.write("Translation Debug Log\n\n")
        df.writelines(debug)
    combined.export(output_audio_path, format="wav")
    return output_audio_path

# ============== Merge Audio and Video Function ==============
def merge_audio_video():
    video = VideoFileClip(input_video)
    audio = AudioFileClip(translated_audio)
    if audio.duration < video.duration:
        extra_silence = AudioSegment.silent(duration=(video.duration - audio.duration) * 1000)
        audio_path_temp = os.path.join(output_dir, "temp_full_audio.wav")
        audio_seg = AudioSegment.from_file(translated_audio, format="wav")
        full_audio = audio_seg + extra_silence
        full_audio.export(audio_path_temp, format="wav")
        audio = AudioFileClip(audio_path_temp)
    video = video.set_audio(audio)
    video.write_videofile(
        output_video,
        codec="libx264",
        audio_codec="aac",
        temp_audiofile="temp-audio.m4a",
        remove_temp=True,
        threads=4
    )

# ============== Main Asynchronous Flow ==============
async def async_main():
    print("Extracting audio...")
    audio_path = extract_audio()
    print("Transcribing audio...")
    language, segments = transcribe(audio_path)
    print("Generating English subtitles...")
    generate_subtitle_file(segments, subtitle_file_en)
    print("Generating French audio with synchronization and manual overrides...")
    await async_generate_translated_audio_with_sync_using_review(subtitle_file_en, translated_audio, debug_log_file, review_file)
    print("Merging audio and video...")
    merge_audio_video()
    print(f"Process completed! Output video: {output_video}")

if __name__ == "__main__":
    asyncio.run(async_main())


‚úÖ ffmpeg found at: C:\ffmpeg\bin\ffmpeg.EXE
Extracting audio...
Transcribing audio...
Detected language: en
Generating English subtitles...
Generating French audio with synchronization and manual overrides...
‚úÖ Review file created at: 4.2.4_Configuration de la solution_Avr_10_Latest_run_20250427_131337\translation_review.txt  (27 segments)
‚úÖ Parsed 27 segments from review file.
[Debug] Attempt 1: Synthesizing phrase: 'Nous allons voir les configurations de l'application EPM.'
[Debug] Phrase synthesized successfully to C:\Users\061181~1\AppData\Local\Temp\tmp_0_0.mp3
[Debug] Attempt 1: Synthesizing phrase: 'Nous verrons comment cr√©er une r√®gle m√©tier ou une formule de membre.'
[Debug] Phrase synthesized successfully to C:\Users\061181~1\AppData\Local\Temp\tmp_0_1.mp3
[Debug] Attempt 1: Synthesizing phrase: 'Nous verrons comment la s√©curit√© fonctionne dans l'application EPM et nous couvrirons comment cr√©er et configurer des formulaires de donn√©es.'
[Debug] Phrase synthesized

                                                                        

MoviePy - Done.
Moviepy - Writing video 4.2.4_Configuration de la solution_Avr_10_Latest_run_20250427_131337\4.2.4_Configuration de la solution_Avr_10_Latest-french.mp4



                                                                  

Moviepy - Done !
Moviepy - video ready 4.2.4_Configuration de la solution_Avr_10_Latest_run_20250427_131337\4.2.4_Configuration de la solution_Avr_10_Latest-french.mp4
Process completed! Output video: 4.2.4_Configuration de la solution_Avr_10_Latest_run_20250427_131337\4.2.4_Configuration de la solution_Avr_10_Latest-french.mp4


28 APR 15h30

In [1]:
import os
import re
import ffmpeg
import pysrt
import time
from deep_translator import GoogleTranslator
from pydub import AudioSegment
from moviepy.editor import VideoFileClip, AudioFileClip
from faster_whisper import WhisperModel
from shutil import which
import nest_asyncio
from datetime import datetime
import tempfile
import asyncio
import edge_tts
import aiohttp
import ssl
import random
from pydub.silence import detect_nonsilent

nest_asyncio.apply()

# ----- Configuration -----
ffmpeg_path = which("ffmpeg")
if not ffmpeg_path:
    raise RuntimeError("ffmpeg not found. Please install ffmpeg first.")
print(f"‚úÖ ffmpeg found at: {ffmpeg_path}")

input_video = "to translate/4.2.2_Flux de navigation_Avr_08_Latest.mp4"
base_name = os.path.splitext(os.path.basename(input_video))[0]
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_dir = f"{base_name}_run_{timestamp}"
model_size = "small"
update_existing = True

# For this version we rely on cloud-based Edge TTS.
USE_EDGE_TTS = True

# Files and paths
os.makedirs(output_dir, exist_ok=True)
input_video_name = os.path.splitext(os.path.basename(input_video))[0]
extracted_audio = os.path.join(output_dir, f"{input_video_name}-extracted-audio.wav")
subtitle_file_en = os.path.join(output_dir, f"{input_video_name}-english.srt")
translated_audio = os.path.join(output_dir, f"{input_video_name}-french.wav")
output_video = os.path.join(output_dir, f"{input_video_name}-french.mp4")
review_file = os.path.join(output_dir, "translation_review.txt")
debug_log_file = os.path.join(output_dir, "translation_debug_log.txt")

# ============== Helper Functions (extract_audio, transcribe, etc.) ==============
def extract_audio():
    try:
        (ffmpeg
         .input(input_video)
         .output(extracted_audio, ac=1, ar=16000)
         .overwrite_output()
         .run(capture_stdout=True, capture_stderr=True)
        )
        return extracted_audio
    except ffmpeg.Error as e:
        print("STDOUT:", e.stdout.decode("utf8"))
        print("STDERR:", e.stderr.decode("utf8"))
        raise

def transcribe(audio_path):
    model = WhisperModel(model_size, device="cpu", compute_type="int8")
    segments, info = model.transcribe(audio_path, beam_size=5)
    language = info.language
    print(f"Detected language: {language}")
    transcript_segments = []
    for segment in segments:
        transcript_segments.append({
            "start": segment.start,
            "end": segment.end,
            "text": segment.text.strip()
        })
    return language, transcript_segments

def time_to_subrip(seconds: float) -> pysrt.SubRipTime:
    hours = int(seconds // 3600)
    seconds %= 3600
    minutes = int(seconds // 60)
    seconds %= 60
    milliseconds = int((seconds - int(seconds)) * 1000)
    return pysrt.SubRipTime(hours=hours, minutes=minutes, seconds=int(seconds), milliseconds=milliseconds)

def generate_subtitle_file(segments, output_path):
    subs = pysrt.SubRipFile()
    for i, segment in enumerate(segments, 1):
        sub = pysrt.SubRipItem(
            index=i,
            start=time_to_subrip(segment["start"]),
            end=time_to_subrip(segment["end"]),
            text=segment["text"]
        )
        subs.append(sub)
    subs.save(output_path, encoding="utf-8")
    return output_path

# ============== Translation & Review Functions ==============

def split_long_groups(groups, max_group_duration_secs):
    """
    For each group (list of SubRipItems), if its duration > max_group_duration_secs,
    split it at the *last* subtitle in that group whose text ends in punctuation
    (.,!? or comma) before the duration threshold.
    Falls back to a simple split if no such ‚Äúsafe‚Äù break exists.
    """
    new_groups = []
    for group in groups:
        start_s = group[0].start.ordinal / 1000
        end_s   = group[-1].end.ordinal   / 1000
        total   = end_s - start_s

        # if already shorter than threshold, keep it
        if total <= max_group_duration_secs:
            new_groups.append(group)
            continue

        # otherwise walk through, tracking safe_breaks
        temp = []
        temp_start = start_s
        last_safe_idx = None
        for idx, item in enumerate(group):
            temp.append(item)
            # mark this idx if it ends in punctuation or comma
            if re.search(r"[.,!?]$", item.text.strip()):
                last_safe_idx = idx

            current_end = item.end.ordinal / 1000
            if (current_end - temp_start) >= max_group_duration_secs:
                # if we have a safe break before or at idx, split there
                if last_safe_idx is not None:
                    # emit group up through last_safe_idx
                    safe_group = temp[: last_safe_idx+1 ]
                    new_groups.append(safe_group)
                    # restart temp from the items after safe_idx
                    temp = temp[last_safe_idx+1 :]
                    temp_start = temp[0].start.ordinal / 1000 if temp else current_end
                else:
                    # no safe break‚Äîjust split at current idx
                    new_groups.append(temp)
                    temp = []
                    temp_start = current_end

                # reset safe marker
                last_safe_idx = None

        # anything left over
        if temp:
            new_groups.append(temp)

    return new_groups


def validate_audio_duration(original_segment, translated_audio):
    """Compares original video duration with generated audio"""
    video_dur = original_segment['end'] - original_segment['start']
    audio_dur = translated_audio.duration_seconds
    
    if abs(video_dur - audio_dur) > 0.5:  # 500ms tolerance
        compensation = (video_dur - audio_dur) * 1000  # ms
        if compensation > 0:
            return AudioSegment.silent(duration=compensation)
        else:
            return translated_audio[:int(compensation*1000)]  # ms to samples
    return translated_audio

def generate_phrase_audio(text, voice_speed):
    raw_audio = edge_tts.Communicate(text).audio
    processed = apply_speed_adjustment(raw_audio, voice_speed)
    
    # Detect and preserve natural phrase endings
    non_silent = detect_nonsilent(processed, min_silence_len=50, silence_thresh=-40)
    if non_silent:
        end_pad = 150  # Minimum ending padding
        new_end = max(non_silent[-1][1] + end_pad, len(processed))
        return processed[:new_end]
    return processed


def apply_speed_adjustment(raw_audio, speed_setting):
    speed_factor = 1 + (int(speed_setting.strip('%')) / 100)
    sped_up = raw_audio.speedup(
        playback_speed=speed_factor,
        chunk_size=150,
        crossfade=25
    )
    
    # Calculate duration difference
    original_dur = len(raw_audio)
    new_dur = len(sped_up)
    compensation = original_dur - new_dur
    
    if compensation > 0:
        return sped_up + AudioSegment.silent(duration=compensation)
    return sped_up



def parse_review_overrides(review_file_path):
    text   = open(review_file_path, "r", encoding="utf-8").read()
    # split on any line of 3+ hyphens
    blocks = re.split(r"(?m)^-{3,}\s*$", text)

    overrides = []
    for idx, blk in enumerate(blocks, start=1):
        blk = blk.strip()
        if not blk or blk.startswith("Translation Review File"):
            continue

        # defaults
        ft       = None
        vs       = "+0%"
        pre_ms   = 0.0
        post_ms  = 100.0
        inter_ms = []

        for line in blk.splitlines():
            if line.startswith("**Final Translation:**"):
                ft = line.split("**Final Translation:**",1)[1].strip()
            elif line.startswith("**Voice Speed:**"):
                vs = line.split("**Voice Speed:**",1)[1].strip()
            elif line.startswith("**Pre-Silence:**"):
                try: pre_ms = float(line.split("**Pre-Silence:**",1)[1])
                except: print(f"[Warn] Seg {idx}: bad Pre-Silence")
            elif line.startswith("**Post-Silence:**"):
                try: post_ms = float(line.split("**Post-Silence:**",1)[1])
                except: print(f"[Warn] Seg {idx}: bad Post-Silence")
            elif line.startswith("**Inter-Phrase-Silence:**"):
                            parts = line.split("**Inter-Phrase-Silence:**",1)[1].strip()
                            if parts:
                                try:
                                    # Force negative values to 0 and limit to 5000ms max
                                    raw = [float(x) for x in parts.split(",")]
                                    inter_ms = [ max(0, min(x, 5000)) for x in raw ]
                                except ValueError:
                                    print(f"[Warning] Segment {idx}: invalid Inter-Phrase-Silence list")
                                    inter_ms = []

        if ft is None:
            print(f"[Warn] Seg {idx}: no Final Translation‚Äîwill use source text.")

        overrides.append({
            "final_translation":      ft,
            "voice_speed":            vs,
            "pre_silence":            pre_ms,
            "post_silence":           post_ms,
            "inter_phrase_silences":  inter_ms
        })

    print("Parsed review overrides:")
    for i,o in enumerate(overrides,1):
        print(f"  Seg {i}: final={'OK' if o['final_translation'] else '<none>'}, "
              f"speed={o['voice_speed']}, pre={o['pre_silence']}ms, post={o['post_silence']}ms, "
              f"inter={o['inter_phrase_silences']}")
    return overrides


def enforce_punctuation_boundaries(groups):
    """Ensure groups end with proper punctuation"""
    i = 0
    safe_punctuation = r"[.!?,;:]$"
    while i < len(groups):
        last_text = groups[i][-1].text.strip()
        if not re.search(safe_punctuation, last_text):
            if i+1 < len(groups):
                groups[i] += groups.pop(i+1)
            else:  # Add artificial pause for final group
                groups[i][-1].text += "."
        else:
            i += 1
    return groups



# ============== Audio Synchronization Functions ==============


def adjust_audio_duration(audio: AudioSegment, target_secs: float) -> AudioSegment:
    """
     Ajuste TTS clip pour qu'il tienne **exactement** dans target_secs :
     - Si l'audio est trop long, on le **tronque**.  
     - S'il est trop court, on ajoute du silence.  
    """
    target_ms = int(target_secs * 1000)
    curr_ms   = len(audio)
    if curr_ms > target_ms:
        # on coupe pr√©cis√©ment √† la dur√©e allou√©e
        return audio[:target_ms]
    elif curr_ms < target_ms:
            # on compl√®te par du silence
        return audio + AudioSegment.silent(duration=(target_ms - curr_ms))
    return audio


# ============== French Phrase Alignment Functions ==============
def split_french_phrases(text):
    phrases = re.split(r"(?<=[.!?])\s+(?=[A-Z])", text)
    return [phrase.strip() for phrase in phrases if phrase.strip()]

def calculate_phrase_weights(original_text, translated_phrases):
    fr_phrase_word_counts = [len(phrase.split()) for phrase in translated_phrases]
    total_fr_words = sum(fr_phrase_word_counts)
    if total_fr_words == 0:
        return [1 / len(translated_phrases)] * len(translated_phrases)
    return [count / total_fr_words for count in fr_phrase_word_counts]

# ============== TTS Functions: Edge TTS Only with Debug Logging ==============


def change_playback_speed(sound, speed=1.0):
    new_frame_rate = int(sound.frame_rate * speed)
    altered_sound = sound._spawn(sound.raw_data, overrides={"frame_rate": new_frame_rate})
    return altered_sound.set_frame_rate(sound.frame_rate)

# ============== Updated Async Audio Generation Function ==============


def validate_audio_timing(original_duration, translated_segment):
    total_audio_time = (
        translated_segment["pre_silence"] 
        + sum(translated_segment["inter_phrase_silences"]) 
        + translated_segment["post_silence"] 
        + (translated_segment["audio"].duration_seconds * 1000)
    )
    
    if total_audio_time > original_duration * 1000:
        raise ValueError(f"Audio overflow: {total_audio_time}ms vs {original_duration*1000}ms")
    elif total_audio_time < original_duration * 1000 * 0.95:
        print(f"Warning: Audio underflow by {original_duration*1000 - total_audio_time}ms")



def adjust_review_file_based_on_debug_log(debug_log_path: str, review_file_path: str):
    """
    Pour chaque segment i :
      - Si d√©cal_end est n√©gatif de D ms, on ajoute D ms √† post_silence
      - Si d√©cal_start est positif de D ms, on ajoute D ms √† pre_silence
    On r√©√©crit ensuite le review_file avec ces nouvelles valeurs.
    """
    # 1) Parse le debug log
    decalages = {}  # idx -> (d_start, d_end)
    pattern = re.compile(r"Segment (\d+).*d√©cal_start=(-?\d+)ms, d√©cal_end=(-?\d+)ms")
    for line in open(debug_log_path, encoding="utf-8"):
        m = pattern.search(line)
        if m:
            idx = int(m.group(1))
            d_start, d_end = int(m.group(2)), int(m.group(3))
            decalages[idx] = (d_start, d_end)

    # 2) Lit tout le review file en m√©moire
    text = open(review_file_path, encoding="utf-8").read()
    blocks = re.split(r"(?m)^-{3,}\s*$", text)

    out = []
    for blk in blocks:
        if not blk.strip() or blk.startswith("Translation Review File"):
            out.append(blk)
            continue

        # trouve le segment
        header = re.search(r"Segment\s+(\d+)\s+\(", blk)
        if not header:
            out.append(blk); continue
        idx = int(header.group(1))
        d_start, d_end = decalages.get(idx, (0, 0))

        # remplace les lignes Pre-Silence / Post-Silence
        def repl_pre(m):
            old = float(m.group(1))
            new = max(0.0, old + d_start)
            return f"**Pre-Silence:** {new:.0f}"
        blk = re.sub(r"\*\*Pre-Silence:\*\*\s*([0-9.]+)", repl_pre, blk)

        def repl_post(m):
            old = float(m.group(1))
            # si d_end<0, audio est trop long => il a fallu tronquer => on ne r√©duit pas post
            # si d_end>0, audio trop court => on ajoute
            new = max(0.0, old + d_end)
            return f"**Post-Silence:** {new:.0f}"
        blk = re.sub(r"\*\*Post-Silence:\*\*\s*([0-9.]+)", repl_post, blk)

        out.append(blk)

    # 3) R√©√©criture du fichier
    with open(review_file_path, "w", encoding="utf-8") as f:
        f.write("\n---\n".join(out))
    print(f"‚úÖ Review file ajust√© selon {debug_log_path}")



def generate_translation_review_file(
    source_path, review_file_path,
    from_lang="en", to_lang="fr",
    max_group_duration_secs: float = 25.0
):
    """
    1) On regroupe et on split/merge les sous-titres exactement
       comme le fera l'audio.
    2) On √©crit un review file o√π l'on affiche :
       - phrase par phrase (la liste exacte)
       - pre / post silence
       - voice speed
    Pas d‚ÄôInter-Phrase-Silence √† remplir par l‚Äôutilisateur.
    """
    translator = GoogleTranslator(source=from_lang, target=to_lang)
    subs = pysrt.open(source_path)

    # 1) regrouper par phrase
    sentence_end = re.compile(r"[.!?]\s*$")
    groups, cur = [], []
    for sub in subs:
        cur.append(sub)
        if sentence_end.search(sub.text):
            groups.append(cur); cur = []
    if cur: groups.append(cur)

    # 2) split long + enforce punctuation
    def split_long(gs, max_s):
        out = []
        for g in gs:
            start, end = g[0].start.ordinal/1000, g[-1].end.ordinal/1000
            if end - start <= max_s:
                out.append(g)
            else:
                mid = len(g)//2
                out.extend([g[:mid], g[mid:]])
        return out
    groups = split_long(groups, max_group_duration_secs)

    i = 0
    while i < len(groups):
        if not re.search(r"[.!?,;:]$", groups[i][-1].text.strip()):
            if i+1 < len(groups):
                groups[i] += groups.pop(i+1)
                continue
            else:
                groups[i][-1].text += "."
        i += 1

    # 3) √©crire le fichier de review
    with open(review_file_path, "w", encoding="utf-8") as f:
        f.write("Translation Review File\n")
        f.write("Le d√©coupage en phrases ci-dessous est **celui utilis√©** en TTS.\n")
        f.write("Ajustez si besoin **Final Translation**, **Voice Speed**, **Pre/Post-Silence**, **Start-Offset:**, **End-Offset:** \n")
        f.write("mais **ne touchez pas** √† la liste des phrases.\n")
        f.write("----------------------------------------------------------------\n\n")

        for idx, group in enumerate(groups, 1):
            start_s = group[0].start.ordinal/1000
            end_s   = group[-1].end.ordinal/1000
            original = " ".join(s.text for s in group)
            auto_tr  = translator.translate(text=original)

            # d√©coupe en phrases & merge_short_phrases
            phrases = re.split(r"(?<=[.!?])\s+(?=[A-Z√Ä√Ç√â√à√ä√ã√é√è√î≈í√ô√õ√ú])", auto_tr)
            phrases = [p.strip() for p in phrases if p.strip()]
            # appliquer merge_short_phrases avec vos seuils
            weights = calculate_phrase_weights(auto_tr, phrases)
            phrases, weights = merge_short_phrases(phrases, weights, min_chars=30, max_chars=90)

            total_ms = int((end_s - start_s)*1000)
            pre_ms, post_ms = 0, 100
            f.write(f"Segment {idx} (start: {start_s:.2f}s, end: {end_s:.2f}s)\n")
            f.write(f"**Original:** {original}\n")
            f.write(f"**Auto Translated:** {auto_tr}\n")
            f.write(f"**Final Translation:** {auto_tr}\n")
            f.write(f"**Voice Speed:** +0%\n")
            f.write(f"**Pre-Silence:** {pre_ms}\n")
            f.write(f"**Post-Silence:** {post_ms}\n")
            # champs d‚Äôoffset manuels, initialis√©s √† 0
            f.write(f"**Start-Offset:** 0\n")
            f.write(f"**End-Offset:** 0\n")
            # f.write("**Phrases:**\n")
            # for ph in phrases:
            #     f.write(f"- {ph}\n")
            f.write("\n----------------------------------------------------------------\n\n")

    print(f"‚úÖ Review file cr√©√© : {review_file_path} ({len(groups)} segments)")
    input("Tapez 'Y' pour continuer‚Ä¶")
    return  # on n‚Äôa plus besoin de renvoyer les groupes bruts


def parse_review_file(review_file_path):
    """
    Lit le review file √©crit ci-dessus et
    renvoie une liste de dicts avec :
      - start_s, end_s, final_translation, voice_speed
      - pre_silence, post_silence, phrases (list)
    """
    text = open(review_file_path, encoding="utf-8").read()
    blocks = [b.strip() for b in re.split(r"(?m)^-{3,}\s*$", text) if b.strip()]
    segments = []
    header = re.compile(r"Segment\s+\d+\s+\(start:\s*([0-9.]+)s,\s*end:\s*([0-9.]+)s\)")
    for blk in blocks:
        m = header.search(blk)
        if not m or blk.startswith("Translation Review File"): continue
        start_s, end_s = float(m.group(1)), float(m.group(2))

        ft, vs, pre, post = None, "+0%", 0.0, 100.0
        orig = None
        start_offset = 0 
        phrases = []
        for line in blk.splitlines():
            line = line.strip()
            if line.startswith("**Final Translation:**"):
                ft = line.split("**Final Translation:**",1)[1].strip()
            elif line.startswith("**Voice Speed:**"):
                vs = line.split("**Voice Speed:**",1)[1].strip()
            elif line.startswith("**Pre-Silence:**"):
                pre = float(line.split("**Pre-Silence:**",1)[1])
            elif line.startswith("**Post-Silence:**"):
                post = float(line.split("**Post-Silence:**",1)[1])
            elif line.startswith("**Start-Offset:**"):
                 # offset en millisecondes √† ajouter au start
                start_offset = int(line.split("**Start-Offset:**",1)[1])
            elif line.startswith("**End-Offset:**"):
                end_offset = int(line.split("**End-Offset:**",1)[1])                
            elif line.startswith("- "):
                phrases.append(line[2:].strip())
            elif line.startswith("**Original:**"):
                orig = line.split("**Original:**",1)[1].strip()

        segments.append({
            "start_s":           start_s,
            "end_s":             end_s,
            "original":          orig,
            "final_translation": ft or orig,
            "voice_speed":       vs,
            "pre_silence":       pre,
            "post_silence":      post,
            "start_offset_ms":   start_offset,
            "end_offset_ms":     end_offset,
            "phrases":           phrases
        })

    print(f"‚úÖ Parsed {len(segments)} segments depuis le review file.")
    return segments

# ============== TTS Functions: Edge TTS Only with Debug Logging ==============
async def robust_synthesize_phrase(
    phrase: str,
    output_path: str,
    voice: str = "fr-FR-DeniseNeural",
    rate: str = "+0%",
    max_retries: int = 10
):
    """
    Synthesize speech using Edge TTS with robust retry logic.
    Detailed debug messages are printed for each attempt.
    """
    for attempt in range(1, max_retries+1):
        try:
            async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=30)) as session:
                communicate = edge_tts.Communicate(
                    text=phrase,
                    voice=voice,
                    rate=rate
                )
                print(f"[Debug] Attempt {attempt}/{max_retries}: Synthesizing phrase: '{phrase[:30]}‚Ä¶'")
                await communicate.save(output_path)
                print(f"[Debug] Phrase synthesized successfully to {output_path}")
                return
        except Exception as e:
            wait_time = 2 ** attempt + random.random()
            print(f"[Error] Attempt {attempt}/{max_retries} failed for phrase: '{phrase[:30]}‚Ä¶'. Exception: {e}")
            if attempt < max_retries:
                print(f"[Debug] Retrying in {wait_time:.1f}s‚Ä¶")
                await asyncio.sleep(wait_time)
    raise RuntimeError(f"Failed to synthesize phrase after {max_retries} attempts: {phrase[:30]}‚Ä¶")

async def synthesize_phrase_edge_hybrid(
    phrase: str,
    output_path: str,
    voice: str = "fr-FR-DeniseNeural",
    rate: str = "+0%"
):
    # Pour compatibilit√©, on redirige vers le robust_synthesize
    await robust_synthesize_phrase(phrase, output_path, voice, rate)


def merge_short_phrases(phrases, weights, min_chars=40, max_chars=None):
    new_ph, new_wt = [], []
    buf_ph, buf_wt = "", 0.0
    for ph, wt in zip(phrases, weights):
        if not buf_ph:
            buf_ph, buf_wt = ph, wt
        else:
            if len(buf_ph) < min_chars or len(ph) < min_chars:
                cand = buf_ph + " " + ph
                # si pas de max_chars d√©fini, on fusionne sans condition
                cond = True if max_chars is None else (len(cand) <= max_chars)
                if cond:
                    buf_ph = cand
                    buf_wt += wt
                else:
                    new_ph.append(buf_ph)
                    new_wt.append(buf_wt)
                    buf_ph, buf_wt = ph, wt
            else:
                new_ph.append(buf_ph)
                new_wt.append(buf_wt)
                buf_ph, buf_wt = ph, wt
    if buf_ph:
        new_ph.append(buf_ph)
        new_wt.append(buf_wt)
    return new_ph, new_wt



def split_long_phrasesaaa(phrases, max_chars=80):
    new = []
    for p in phrases:
        if len(p) > max_chars:
            # on d√©coupe au premier ‚Äú,‚Äù ou ‚Äú et ‚Äù qu‚Äôon trouve
            parts = re.split(r",\s+| et ", p, maxsplit=1)
            new.extend([parts[0].strip(), parts[1].strip()] if len(parts)==2 else [p])
        else:
            new.append(p)
    return new

async def async_generate_translated_audio_with_sync_using_review(
    subtitle_source_path, output_audio_path,
    debug_log_path, review_file_path
):
    # ‚îÄ‚îÄ‚îÄ 1) G√©n√©ration / mise √† jour du review file ‚îÄ‚îÄ‚îÄ
    groups = generate_translation_review_file(
        subtitle_source_path,
        review_file_path,
        max_group_duration_secs=25.0
    )

    # ‚îÄ‚îÄ‚îÄ 2) Lecture du review file enrichi ‚îÄ‚îÄ‚îÄ
    segments = parse_review_file(review_file_path)

    combined = AudioSegment.silent(duration=0)
    debug    = []

    for idx, seg in enumerate(segments):
        # ‚îÄ‚îÄ‚îÄ 3) R√©cup√©rations de base ‚îÄ‚îÄ‚îÄ
        start_s  = seg["start_s"]
        end_s    = seg["end_s"]
        total_ms = int((end_s - start_s) * 1000)

        text   = seg["final_translation"]
        rate   = seg["voice_speed"]
        pre_ms = seg["pre_silence"]
        post_ms= seg["post_silence"]

        # ‚îÄ‚îÄ‚îÄ 4) D√©coupage en phrases + poids ‚îÄ‚îÄ‚îÄ
        #phrases = split_french_phrases(text)
        #weights = calculate_phrase_weights(text, phrases)

        # ‚îÄ‚îÄ‚îÄ 5) Fusion des mini-phrases trop courtes ‚îÄ‚îÄ‚îÄ
        #phrases, weights = merge_short_phrases(phrases, weights, min_chars=30)
        #phrases, weights = merge_short_phrases(phrases, weights, min_chars=40)
        
        phrases = split_french_phrases(text)
        #phrases = split_long_phrases(phrases, max_chars=80)
        weights = calculate_phrase_weights(text, phrases)
        phrases, weights = merge_short_phrases(phrases, weights, min_chars=40, max_chars=None)

        # ‚îÄ‚îÄ‚îÄ 6) Budget TTS versus silences pr√©/post ‚îÄ‚îÄ‚îÄ
        content_ms = max(0, total_ms - pre_ms - post_ms)

        # ‚îÄ‚îÄ‚îÄ 7) Synth√®se phrase par phrase avec retry ‚îÄ‚îÄ‚îÄ
        phrase_audios = []
        for i, ph in enumerate(phrases):
            dur_s   = (content_ms * weights[i]) / 1000.0
            tmp_mp3 = os.path.join(tempfile.gettempdir(), f"tmp_{idx}_{i}.mp3")

            await robust_synthesize_phrase(
                ph, tmp_mp3,
                voice="fr-FR-DeniseNeural",
                rate=rate
            )
            aud = AudioSegment.from_mp3(tmp_mp3)
            os.remove(tmp_mp3)

            # On ajuste strictement √† la dur√©e allou√©e
            aud = adjust_audio_duration(aud, dur_s)
            phrase_audios.append(aud)

        # ‚îÄ‚îÄ‚îÄ 8) Anti-d√©passement TTS seul ‚îÄ‚îÄ‚îÄ
        sum_tts = sum(a.duration_seconds * 1000 for a in phrase_audios)
        if sum_tts > content_ms and sum_tts > 0:
            factor = content_ms / sum_tts
            phrase_audios = [
                change_playback_speed(a, factor)
                for a in phrase_audios
            ]
            sum_tts = sum(a.duration_seconds * 1000 for a in phrase_audios)

        # ‚îÄ‚îÄ‚îÄ 9) Calcul AUTOMATIQUE des silences internes ‚îÄ‚îÄ‚îÄ
        n_inter   = max(0, len(phrases) - 1)
        available = content_ms - sum_tts

        if n_inter > 0 and available > 0:
            sil_ms = available // n_inter
            inter_applied = [sil_ms] * n_inter
        else:
            inter_applied = [0] * n_inter

        # ‚îÄ‚îÄ‚îÄ 10) Reconstruction du segment ‚îÄ‚îÄ‚îÄ
        seq = []
        for i, aud in enumerate(phrase_audios):
            seq.append(aud)
            if i < len(inter_applied):
                seq.append(AudioSegment.silent(duration=inter_applied[i]))

        seg_audio = AudioSegment.silent(duration=pre_ms)
        for clip in seq:
            seg_audio += clip
        seg_audio += AudioSegment.silent(duration=post_ms)

        # ‚îÄ‚îÄ‚îÄ 11) Strip des silences internes TTS ind√©sirables ‚îÄ‚îÄ‚îÄ
        nons = detect_nonsilent(seg_audio, min_silence_len=1,
                                silence_thresh=seg_audio.dBFS - 16)
        if nons:
            seg_audio = seg_audio[nons[0][0]:]
        seg_audio = AudioSegment.silent(duration=pre_ms) + seg_audio

        # ‚îÄ‚îÄ‚îÄ 12) Pad ou trim strict au total_ms ‚îÄ‚îÄ‚îÄ
        if len(seg_audio) < total_ms:
            seg_audio += AudioSegment.silent(duration=(total_ms - len(seg_audio)))
        seg_audio = seg_audio[:total_ms]

        # ‚îÄ‚îÄ‚îÄ 13) Debug timings ‚îÄ‚îÄ‚îÄ
        nons2 = detect_nonsilent(seg_audio, min_silence_len=1,
                                 silence_thresh=seg_audio.dBFS - 16)
        start_a = nons2[0][0] if nons2 else pre_ms
        end_a   = nons2[-1][1] if nons2 else (total_ms - post_ms)
        abs_s_a = int(start_s * 1000) + start_a
        abs_e_a = int(start_s * 1000) + end_a
        abs_s_v = int(start_s * 1000)
        abs_e_v = int(end_s   * 1000)
        decal_start = abs_s_a - abs_s_v
        decal_end   = abs_e_a - abs_e_v

        # ‚îÄ‚îÄ‚îÄ 14) Warp global si n√©cessaire ‚îÄ‚îÄ‚îÄ
        gen_dur = seg_audio.duration_seconds
        diff    = (end_s - start_s) - gen_dur
        if abs(diff) > 0.20:
            seg_audio = change_playback_speed(seg_audio, (end_s - start_s) / gen_dur)

        # ‚îÄ‚îÄ‚îÄ 15) Mix sur la timeline avec Start-Offset et End-Offset ‚îÄ‚îÄ‚îÄ
        base_ms    = int(start_s * 1000)
        soff       = seg.get("start_offset_ms", 0)
        eoff       = seg.get("end_offset_ms",   0)

        # 1) Appliquer l‚Äôoffset de fin sur seg_audio
        if eoff > 0:
            seg_audio = seg_audio + AudioSegment.silent(duration=eoff)
        elif eoff < 0:
            seg_audio = seg_audio[:eoff]  # tronque les |eoff| derniers ms

        # 2) Calculer le placement en tenant compte du start_offset
        start_ms = base_ms + soff

        # 3) Coller sur la timeline
        if len(combined) < start_ms:
            combined += AudioSegment.silent(duration=(start_ms - len(combined)))
        combined += seg_audio

        # ‚îÄ‚îÄ‚îÄ 16) Log debug ‚îÄ‚îÄ‚îÄ
        debug.append(
            f"Segment {idx+1} ({start_s:.2f}-{end_s:.2f}s): "
            f"pre={pre_ms}ms, post={post_ms}ms, speed={rate}, "
            f"silences_internal={inter_applied}, "
            f"d√©cal_start={decal_start}ms, d√©cal_end={decal_end}ms, "
            f"phrases={phrases}\n"
        )


    # ‚îÄ‚îÄ‚îÄ 17) Export debug & wav ‚îÄ‚îÄ‚îÄ
    with open(debug_log_path, "w", encoding="utf-8") as df:
        df.write("Translation Debug Log\n\n")
        df.writelines(debug)
    combined.export(output_audio_path, format="wav")

    return output_audio_path






# ============== Merge Audio and Video Function ==============
def merge_audio_video():
    video = VideoFileClip(input_video)
    audio = AudioFileClip(translated_audio)
    if audio.duration < video.duration:
        extra_silence = AudioSegment.silent(duration=(video.duration - audio.duration) * 1000)
        audio_path_temp = os.path.join(output_dir, "temp_full_audio.wav")
        audio_seg = AudioSegment.from_file(translated_audio, format="wav")
        full_audio = audio_seg + extra_silence
        full_audio.export(audio_path_temp, format="wav")
        audio = AudioFileClip(audio_path_temp)
    video = video.set_audio(audio)
    video.write_videofile(
        output_video,
        codec="libx264",
        audio_codec="aac",
        temp_audiofile="temp-audio.m4a",
        remove_temp=True,
        threads=4
    )

# ============== Main Asynchronous Flow ==============
async def async_main():
    print("Extracting audio...")
    audio_path = extract_audio()
    print("Transcribing audio...")
    language, segments = transcribe(audio_path)
    print("Generating English subtitles...")
    generate_subtitle_file(segments, subtitle_file_en)
    print("Generating French audio with synchronization and manual overrides...")
    await async_generate_translated_audio_with_sync_using_review(subtitle_file_en, translated_audio, debug_log_file, review_file)
    print("Merging audio and video...")
    merge_audio_video()
    print(f"Process completed! Output video: {output_video}")

if __name__ == "__main__":
    asyncio.run(async_main())




‚úÖ ffmpeg found at: C:\ffmpeg\bin\ffmpeg.EXE
Extracting audio...
Transcribing audio...
Detected language: en
Generating English subtitles...
Generating French audio with synchronization and manual overrides...
‚úÖ Review file cr√©√© : 4.2.2_Flux de navigation_Avr_08_Latest_run_20250430_085112\translation_review.txt (48 segments)
‚úÖ Parsed 48 segments depuis le review file.
[Debug] Attempt 1/10: Synthesizing phrase: 'Dans cette d√©mo, nous explorer‚Ä¶'
[Debug] Phrase synthesized successfully to C:\Users\061181~1\AppData\Local\Temp\tmp_0_0.mp3
[Debug] Attempt 1/10: Synthesizing phrase: 'Le flux de navigation am√©liore‚Ä¶'
[Debug] Phrase synthesized successfully to C:\Users\061181~1\AppData\Local\Temp\tmp_1_0.mp3
[Debug] Attempt 1/10: Synthesizing phrase: 'Il permet une transition fluid‚Ä¶'
[Error] Attempt 1/10 failed for phrase: 'Il permet une transition fluid‚Ä¶'. Exception: Cannot connect to host speech.platform.bing.com:443 ssl:<ssl.SSLContext object at 0x000002364ABFD640> [Une conn

                                                                      

MoviePy - Done.
Moviepy - Writing video 4.2.2_Flux de navigation_Avr_08_Latest_run_20250430_085112\4.2.2_Flux de navigation_Avr_08_Latest-french.mp4



                                                                  

Moviepy - Done !
Moviepy - video ready 4.2.2_Flux de navigation_Avr_08_Latest_run_20250430_085112\4.2.2_Flux de navigation_Avr_08_Latest-french.mp4
Process completed! Output video: 4.2.2_Flux de navigation_Avr_08_Latest_run_20250430_085112\4.2.2_Flux de navigation_Avr_08_Latest-french.mp4


30 APR 

In [None]:
import os
import re
import ffmpeg
import pysrt
import time
from deep_translator import GoogleTranslator
from pydub import AudioSegment
from moviepy.editor import VideoFileClip, AudioFileClip
from faster_whisper import WhisperModel
from shutil import which
import nest_asyncio
from datetime import datetime
import tempfile
import asyncio
import edge_tts
import aiohttp
import ssl
import random
from pydub.silence import detect_nonsilent
import spacy


nest_asyncio.apply()

# ----- Configuration -----
ffmpeg_path = which("ffmpeg")
if not ffmpeg_path:
    raise RuntimeError("ffmpeg not found. Please install ffmpeg first.")
print(f"‚úÖ ffmpeg found at: {ffmpeg_path}")

input_video = "to translate/4.2.4_Configuration de la solution_Avr_10_Latest.mp4"
base_name = os.path.splitext(os.path.basename(input_video))[0]
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_dir = f"{base_name}_run_{timestamp}"
model_size = "small"
update_existing = True

# For this version we rely on cloud-based Edge TTS.
USE_EDGE_TTS = True

# Files and paths
os.makedirs(output_dir, exist_ok=True)
input_video_name = os.path.splitext(os.path.basename(input_video))[0]
extracted_audio = os.path.join(output_dir, f"{input_video_name}-extracted-audio.wav")
subtitle_file_en = os.path.join(output_dir, f"{input_video_name}-english.srt")
translated_audio = os.path.join(output_dir, f"{input_video_name}-french.wav")
output_video = os.path.join(output_dir, f"{input_video_name}-french.mp4")
review_file = os.path.join(output_dir, "translation_review.txt")
debug_log_file = os.path.join(output_dir, "translation_debug_log.txt")

# ============== Helper Functions (extract_audio, transcribe, etc.) ==============
def extract_audio():
    try:
        (ffmpeg
         .input(input_video)
         .output(extracted_audio, ac=1, ar=16000)
         .overwrite_output()
         .run(capture_stdout=True, capture_stderr=True)
        )
        return extracted_audio
    except ffmpeg.Error as e:
        print("STDOUT:", e.stdout.decode("utf8"))
        print("STDERR:", e.stderr.decode("utf8"))
        raise

def transcribe(audio_path):
    model = WhisperModel(model_size, device="cpu", compute_type="int8")
    segments, info = model.transcribe(audio_path, beam_size=5)
    language = info.language
    print(f"Detected language: {language}")
    transcript_segments = []
    for segment in segments:
        transcript_segments.append({
            "start": segment.start,
            "end": segment.end,
            "text": segment.text.strip()
        })
    return language, transcript_segments

def time_to_subrip(seconds: float) -> pysrt.SubRipTime:
    hours = int(seconds // 3600)
    seconds %= 3600
    minutes = int(seconds // 60)
    seconds %= 60
    milliseconds = int((seconds - int(seconds)) * 1000)
    return pysrt.SubRipTime(hours=hours, minutes=minutes, seconds=int(seconds), milliseconds=milliseconds)

def generate_subtitle_file(segments, output_path):
    subs = pysrt.SubRipFile()
    for i, segment in enumerate(segments, 1):
        sub = pysrt.SubRipItem(
            index=i,
            start=time_to_subrip(segment["start"]),
            end=time_to_subrip(segment["end"]),
            text=segment["text"]
        )
        subs.append(sub)
    subs.save(output_path, encoding="utf-8")
    return output_path

# ============== Translation & Review Functions ==============

def split_long_groups(groups, max_group_duration_secs):
    """
    For each group (list of SubRipItems), if its duration > max_group_duration_secs,
    split it at the *last* subtitle in that group whose text ends in punctuation
    (.,!? or comma) before the duration threshold.
    Falls back to a simple split if no such ‚Äúsafe‚Äù break exists.
    """
    new_groups = []
    for group in groups:
        start_s = group[0].start.ordinal / 1000
        end_s   = group[-1].end.ordinal   / 1000
        total   = end_s - start_s

        # if already shorter than threshold, keep it
        if total <= max_group_duration_secs:
            new_groups.append(group)
            continue

        # otherwise walk through, tracking safe_breaks
        temp = []
        temp_start = start_s
        last_safe_idx = None
        for idx, item in enumerate(group):
            temp.append(item)
            # mark this idx if it ends in punctuation or comma
            if re.search(r"[.,!?]$", item.text.strip()):
                last_safe_idx = idx

            current_end = item.end.ordinal / 1000
            if (current_end - temp_start) >= max_group_duration_secs:
                # if we have a safe break before or at idx, split there
                if last_safe_idx is not None:
                    # emit group up through last_safe_idx
                    safe_group = temp[: last_safe_idx+1 ]
                    new_groups.append(safe_group)
                    # restart temp from the items after safe_idx
                    temp = temp[last_safe_idx+1 :]
                    temp_start = temp[0].start.ordinal / 1000 if temp else current_end
                else:
                    # no safe break‚Äîjust split at current idx
                    new_groups.append(temp)
                    temp = []
                    temp_start = current_end

                # reset safe marker
                last_safe_idx = None

        # anything left over
        if temp:
            new_groups.append(temp)

    return new_groups


def validate_audio_duration(original_segment, translated_audio):
    """Compares original video duration with generated audio"""
    video_dur = original_segment['end'] - original_segment['start']
    audio_dur = translated_audio.duration_seconds
    
    if abs(video_dur - audio_dur) > 0.5:  # 500ms tolerance
        compensation = (video_dur - audio_dur) * 1000  # ms
        if compensation > 0:
            return AudioSegment.silent(duration=compensation)
        else:
            return translated_audio[:int(compensation*1000)]  # ms to samples
    return translated_audio

def generate_phrase_audio(text, voice_speed):
    raw_audio = edge_tts.Communicate(text).audio
    processed = apply_speed_adjustment(raw_audio, voice_speed)
    
    # Detect and preserve natural phrase endings
    non_silent = detect_nonsilent(processed, min_silence_len=50, silence_thresh=-40)
    if non_silent:
        end_pad = 150  # Minimum ending padding
        new_end = max(non_silent[-1][1] + end_pad, len(processed))
        return processed[:new_end]
    return processed


def apply_speed_adjustment(raw_audio, speed_setting):
    speed_factor = 1 + (int(speed_setting.strip('%')) / 100)
    sped_up = raw_audio.speedup(
        playback_speed=speed_factor,
        chunk_size=150,
        crossfade=25
    )
    
    # Calculate duration difference
    original_dur = len(raw_audio)
    new_dur = len(sped_up)
    compensation = original_dur - new_dur
    
    if compensation > 0:
        return sped_up + AudioSegment.silent(duration=compensation)
    return sped_up



def parse_review_overrides(review_file_path):
    text   = open(review_file_path, "r", encoding="utf-8").read()
    # split on any line of 3+ hyphens
    blocks = re.split(r"(?m)^-{3,}\s*$", text)

    overrides = []
    for idx, blk in enumerate(blocks, start=1):
        blk = blk.strip()
        if not blk or blk.startswith("Translation Review File"):
            continue

        # defaults
        ft       = None
        vs       = "+0%"
        pre_ms   = 0.0
        post_ms  = 100.0
        inter_ms = []

        for line in blk.splitlines():
            if line.startswith("**Final Translation:**"):
                ft = line.split("**Final Translation:**",1)[1].strip()
            elif line.startswith("**Voice Speed:**"):
                vs = line.split("**Voice Speed:**",1)[1].strip()
            elif line.startswith("**Pre-Silence:**"):
                try: pre_ms = float(line.split("**Pre-Silence:**",1)[1])
                except: print(f"[Warn] Seg {idx}: bad Pre-Silence")
            elif line.startswith("**Post-Silence:**"):
                try: post_ms = float(line.split("**Post-Silence:**",1)[1])
                except: print(f"[Warn] Seg {idx}: bad Post-Silence")
            elif line.startswith("**Inter-Phrase-Silence:**"):
                            parts = line.split("**Inter-Phrase-Silence:**",1)[1].strip()
                            if parts:
                                try:
                                    # Force negative values to 0 and limit to 5000ms max
                                    raw = [float(x) for x in parts.split(",")]
                                    inter_ms = [ max(0, min(x, 5000)) for x in raw ]
                                except ValueError:
                                    print(f"[Warning] Segment {idx}: invalid Inter-Phrase-Silence list")
                                    inter_ms = []

        if ft is None:
            print(f"[Warn] Seg {idx}: no Final Translation‚Äîwill use source text.")

        overrides.append({
            "final_translation":      ft,
            "voice_speed":            vs,
            "pre_silence":            pre_ms,
            "post_silence":           post_ms,
            "inter_phrase_silences":  inter_ms
        })

    print("Parsed review overrides:")
    for i,o in enumerate(overrides,1):
        print(f"  Seg {i}: final={'OK' if o['final_translation'] else '<none>'}, "
              f"speed={o['voice_speed']}, pre={o['pre_silence']}ms, post={o['post_silence']}ms, "
              f"inter={o['inter_phrase_silences']}")
    return overrides


def enforce_punctuation_boundaries(groups):
    """Ensure groups end with proper punctuation"""
    i = 0
    safe_punctuation = r"[.!?,;:]$"
    while i < len(groups):
        last_text = groups[i][-1].text.strip()
        if not re.search(safe_punctuation, last_text):
            if i+1 < len(groups):
                groups[i] += groups.pop(i+1)
            else:  # Add artificial pause for final group
                groups[i][-1].text += "."
        else:
            i += 1
    return groups



# ============== Audio Synchronization Functions ==============


def adjust_audio_duration(audio: AudioSegment, target_secs: float) -> AudioSegment:
    """
     Ajuste TTS clip pour qu'il tienne **exactement** dans target_secs :
     - Si l'audio est trop long, on le **tronque**.  
     - S'il est trop court, on ajoute du silence.  
    """
    target_ms = int(target_secs * 1000)
    curr_ms   = len(audio)
    if curr_ms > target_ms:
        # on coupe pr√©cis√©ment √† la dur√©e allou√©e
        return audio[:target_ms]
    elif curr_ms < target_ms:
            # on compl√®te par du silence
        return audio + AudioSegment.silent(duration=(target_ms - curr_ms))
    return audio


# ============== French Phrase Alignment Functions ==============
def split_french_phrases(text):
    phrases = re.split(r"(?<=[.!?])\s+(?=[A-Z])", text)
    return [phrase.strip() for phrase in phrases if phrase.strip()]

def calculate_phrase_weights(original_text, translated_phrases):
    fr_phrase_word_counts = [len(phrase.split()) for phrase in translated_phrases]
    total_fr_words = sum(fr_phrase_word_counts)
    if total_fr_words == 0:
        return [1 / len(translated_phrases)] * len(translated_phrases)
    return [count / total_fr_words for count in fr_phrase_word_counts]

# ============== TTS Functions: Edge TTS Only with Debug Logging ==============


def change_playback_speed(sound, speed=1.0):
    new_frame_rate = int(sound.frame_rate * speed)
    altered_sound = sound._spawn(sound.raw_data, overrides={"frame_rate": new_frame_rate})
    return altered_sound.set_frame_rate(sound.frame_rate)

# ============== Updated Async Audio Generation Function ==============


def validate_audio_timing(original_duration, translated_segment):
    total_audio_time = (
        translated_segment["pre_silence"] 
        + sum(translated_segment["inter_phrase_silences"]) 
        + translated_segment["post_silence"] 
        + (translated_segment["audio"].duration_seconds * 1000)
    )
    
    if total_audio_time > original_duration * 1000:
        raise ValueError(f"Audio overflow: {total_audio_time}ms vs {original_duration*1000}ms")
    elif total_audio_time < original_duration * 1000 * 0.95:
        print(f"Warning: Audio underflow by {original_duration*1000 - total_audio_time}ms")



def adjust_review_file_based_on_debug_log(debug_log_path: str, review_file_path: str):
    """
    Pour chaque segment i :
      - Si d√©cal_end est n√©gatif de D ms, on ajoute D ms √† post_silence
      - Si d√©cal_start est positif de D ms, on ajoute D ms √† pre_silence
    On r√©√©crit ensuite le review_file avec ces nouvelles valeurs.
    """
    # 1) Parse le debug log
    decalages = {}  # idx -> (d_start, d_end)
    pattern = re.compile(r"Segment (\d+).*d√©cal_start=(-?\d+)ms, d√©cal_end=(-?\d+)ms")
    for line in open(debug_log_path, encoding="utf-8"):
        m = pattern.search(line)
        if m:
            idx = int(m.group(1))
            d_start, d_end = int(m.group(2)), int(m.group(3))
            decalages[idx] = (d_start, d_end)

    # 2) Lit tout le review file en m√©moire
    text = open(review_file_path, encoding="utf-8").read()
    blocks = re.split(r"(?m)^-{3,}\s*$", text)

    out = []
    for blk in blocks:
        if not blk.strip() or blk.startswith("Translation Review File"):
            out.append(blk)
            continue

        # trouve le segment
        header = re.search(r"Segment\s+(\d+)\s+\(", blk)
        if not header:
            out.append(blk); continue
        idx = int(header.group(1))
        d_start, d_end = decalages.get(idx, (0, 0))

        # remplace les lignes Pre-Silence / Post-Silence
        def repl_pre(m):
            old = float(m.group(1))
            new = max(0.0, old + d_start)
            return f"**Pre-Silence:** {new:.0f}"
        blk = re.sub(r"\*\*Pre-Silence:\*\*\s*([0-9.]+)", repl_pre, blk)

        def repl_post(m):
            old = float(m.group(1))
            # si d_end<0, audio est trop long => il a fallu tronquer => on ne r√©duit pas post
            # si d_end>0, audio trop court => on ajoute
            new = max(0.0, old + d_end)
            return f"**Post-Silence:** {new:.0f}"
        blk = re.sub(r"\*\*Post-Silence:\*\*\s*([0-9.]+)", repl_post, blk)

        out.append(blk)

    # 3) R√©√©criture du fichier
    with open(review_file_path, "w", encoding="utf-8") as f:
        f.write("\n---\n".join(out))
    print(f"‚úÖ Review file ajust√© selon {debug_log_path}")




def generate_translation_review_file(
    source_path, review_file_path,
    from_lang="en", to_lang="fr",
    max_group_duration_secs: float = 25.0
):
    """
    Version robuste : g√©n√®re un fichier de revue pour la traduction vocale.
    - G√®re les erreurs, le d√©coupage, la traduction, les cas limites.
    """
    print(f"üîç Chargement des sous-titres depuis : {source_path}")
    try:
        subs = pysrt.open(source_path, encoding='utf-8')
    except UnicodeDecodeError:
        subs = pysrt.open(source_path, encoding='latin-1')
    except Exception as e:
        print(f"‚ùå Erreur lors de l'ouverture du fichier : {e}")
        return

    translator = GoogleTranslator(source=from_lang, target=to_lang)
    nlp = spacy.load("fr_core_news_sm")

    # 1. Regrouper en phrases
    print("üß† Regroupement initial par ponctuation finale")
    sentence_end = re.compile(r"[.!?][\"')\]]?\s*$")
    groups, cur = [], []
    for sub in subs:
        cur.append(sub)
        if sentence_end.search(sub.text.strip()):
            groups.append(cur)
            cur = []
    if cur:
        groups.append(cur)

    # 2. Split les groupes trop longs
    def split_long(gs, max_s):
        def split_group(g):
            start = g[0].start.ordinal / 1000
            end = g[-1].end.ordinal / 1000
            duration = end - start

            if duration <= max_s or len(g) == 1:
                return [g]

            for i in range(len(g) - 1, 0, -1):
                if re.search(r"[.?!,;:]$", g[i].text.strip()):
                    left, right = g[:i + 1], g[i + 1:]
                    if len(left) == 0 or len(right) == 0:
                        break
                    return split_group(left) + split_group(right)

            mid = len(g) // 2
            if mid == 0 or mid == len(g):
                return [g]
            return split_group(g[:mid]) + split_group(g[mid:])

        out = []
        for g in gs:
            out.extend(split_group(g))
        return out

    print("‚úÇÔ∏è D√©coupage des groupes trop longs‚Ä¶")
    groups = split_long(groups, max_group_duration_secs)

    # 3. Merge finaux si fin de phrase manquante
    i = 0
    while i < len(groups):
        if not re.search(r"[.!?,;:]$", groups[i][-1].text.strip()):
            if i + 1 < len(groups):
                print(f"‚ö†Ô∏è Merge groupe {i} et {i+1} car fin de phrase absente")
                groups[i] += groups.pop(i + 1)
                continue
            else:
                groups[i][-1].text += "."
        i += 1

    # 4. G√©n√©ration du fichier de revue
    print(f"‚úèÔ∏è √âcriture du fichier de revue : {review_file_path}")
    with open(review_file_path, "w", encoding="utf-8") as f:
        f.write("Translation Review File\n")
        f.write("Le d√©coupage en phrases ci-dessous est **celui utilis√©** en TTS.\n")
        f.write("Ajustez si besoin **Final Translation**, **Voice Speed**, **Pre/Post-Silence**, **Inter-Phrase-Silence:**, **Start-Offset**, **End-Offset**\n")
        f.write("Mais **ne touchez pas** √† la liste des phrases.\n")
        f.write("----------------------------------------------------------------\n\n")

        for idx, group in enumerate(groups, 1):
            start_s = group[0].start.ordinal / 1000
            end_s = group[-1].end.ordinal / 1000
            original = " ".join(s.text.strip() for s in group)

            try:
                auto_tr = translator.translate(text=original)
            except Exception as e:
                print(f"‚ùå Erreur traduction segment {idx} : {e}")
                auto_tr = "[ERREUR DE TRADUCTION]"

            # Segmentation avec spaCy
            doc = nlp(auto_tr)
            phrases = [sent.text.strip() for sent in doc.sents if sent.text.strip()]
            inter_ms = [0 for _ in range(max(0, len(phrases)-1))]
            total_ms = int((end_s - start_s) * 1000)
            pre_ms, post_ms = 0, 100
           

            f.write(f"Segment {idx} (start: {start_s:.2f}s, end: {end_s:.2f}s)\n")
            f.write(f"**Original:** {original}\n")
            f.write(f"**Auto Translated:** {auto_tr}\n")
            f.write(f"**Final Translation:** {auto_tr}\n")
            f.write("**Phrases (ne modifiez pas) :**\n")
            for ph in phrases:
                f.write(f"- {ph}\\n")
            f.write(f"**Voice Speed:** +0%\n")
            f.write(f"**Pre-Silence:** {pre_ms}\n")
            f.write(f"**Post-Silence:** {post_ms}\n")
            f.write(f"**Inter-Phrase-Silence:** {','.join(str(ms) for ms in inter_ms)}\n")
            f.write(f"**Start-Offset:** 0\n")
            f.write(f"**End-Offset:** 0\n")
            f.write(f"**Budget (ms)** : {total_ms}\n")
            f.write("\n----------------------------------------------------------------\n\n")

    print(f"‚úÖ Fichier de revue g√©n√©r√© : {review_file_path} ({len(groups)} segments)")
    input("Tapez 'Y' pour continuer‚Ä¶")




def parse_review_file(review_file_path):
    """
    Lit le review file √©crit ci-dessus et
    renvoie une liste de dicts avec :
      - start_s, end_s, final_translation, voice_speed
      - pre_silence, post_silence, start_offset_ms, end_offset_ms, inter_phrase_silences, phrases (list)
    """
    text = open(review_file_path, encoding="utf-8").read()
    # S√©paration des blocs par lignes de ---
    blocks = [b.strip() for b in re.split(r"(?m)^-{3,}\s*$", text) if b.strip()]
    segments = []
    header = re.compile(r"Segment\s+\d+\s+\(start:\s*([0-9.]+)s,\s*end:\s*([0-9.]+)s\)")

    for blk in blocks:
        # ignorer l'en-t√™te global
        if blk.startswith("Translation Review File"): continue
        m = header.search(blk)
        if not m: continue
        start_s, end_s = float(m.group(1)), float(m.group(2))

        # valeurs par d√©faut
        ft = None
        vs = "+0%"
        pre, post = 0.0, 0.0
        start_offset = 0
        end_offset = 0
        inter_ms = []
        phrases = []
        orig = None

        for line in blk.splitlines():
            line = line.strip()
            if line.startswith("**Original:**"):
                orig = line.split("**Original:**",1)[1].strip()
            elif line.startswith("**Final Translation:**"):
                ft = line.split("**Final Translation:**",1)[1].strip()
            elif line.startswith("**Voice Speed:**"):
                vs = line.split("**Voice Speed:**",1)[1].strip()
            elif line.startswith("**Pre-Silence:**"):
                pre = float(line.split("**Pre-Silence:**",1)[1])
            elif line.startswith("**Post-Silence:**"):
                post = float(line.split("**Post-Silence:**",1)[1])
            elif line.startswith("**Inter-Phrase-Silence:**"):
                vals = line.split("**Inter-Phrase-Silence:**",1)[1].strip()
                inter_ms = [float(x) for x in vals.split(",") if x.strip()]
            elif line.startswith("**Start-Offset:**"):
                start_offset = int(line.split("**Start-Offset:**",1)[1])
            elif line.startswith("**End-Offset:**"):
                end_offset = int(line.split("**End-Offset:**",1)[1])
            elif line.startswith("- "):
                phrases.append(line[2:].strip())

        segments.append({
            "start_s":           start_s,
            "end_s":             end_s,
            "original":          orig,
            "final_translation": ft or orig,
            "voice_speed":       vs,
            "pre_silence":       pre,
            "post_silence":      post,
            "start_offset_ms":   start_offset,
            "end_offset_ms":     end_offset,
            "inter_phrase_silences": inter_ms,
            "phrases":           phrases
        })

    print(f"‚úÖ Parsed {len(segments)} segments depuis le review file.")
    return segments

# ============== TTS Functions: Edge TTS Only with Debug Logging ==============
async def robust_synthesize_phrase(
    phrase: str,
    output_path: str,
    voice: str = "fr-FR-DeniseNeural",
    rate: str = "+0%",
    max_retries: int = 10
):
    """
    Synthesize speech using Edge TTS with robust retry logic.
    Detailed debug messages are printed for each attempt.
    """
    for attempt in range(1, max_retries+1):
        try:
            async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=30)) as session:
                communicate = edge_tts.Communicate(
                    text=phrase,
                    voice=voice,
                    rate=rate
                )
                print(f"[Debug] Attempt {attempt}/{max_retries}: Synthesizing phrase: '{phrase[:30]}‚Ä¶'")
                await communicate.save(output_path)
                print(f"[Debug] Phrase synthesized successfully to {output_path}")
                return
        except Exception as e:
            wait_time = 2 ** attempt + random.random()
            print(f"[Error] Attempt {attempt}/{max_retries} failed for phrase: '{phrase[:30]}‚Ä¶'. Exception: {e}")
            if attempt < max_retries:
                print(f"[Debug] Retrying in {wait_time:.1f}s‚Ä¶")
                await asyncio.sleep(wait_time)
    raise RuntimeError(f"Failed to synthesize phrase after {max_retries} attempts: {phrase[:30]}‚Ä¶")

async def synthesize_phrase_edge_hybrid(
    phrase: str,
    output_path: str,
    voice: str = "fr-FR-DeniseNeural",
    rate: str = "+0%"
):
    # Pour compatibilit√©, on redirige vers le robust_synthesize
    await robust_synthesize_phrase(phrase, output_path, voice, rate)


def merge_short_phrases(phrases, weights, min_chars=40, max_chars=None):
    new_ph, new_wt = [], []
    buf_ph, buf_wt = "", 0.0
    for ph, wt in zip(phrases, weights):
        if not buf_ph:
            buf_ph, buf_wt = ph, wt
        else:
            if len(buf_ph) < min_chars or len(ph) < min_chars:
                cand = buf_ph + " " + ph
                # si pas de max_chars d√©fini, on fusionne sans condition
                cond = True if max_chars is None else (len(cand) <= max_chars)
                if cond:
                    buf_ph = cand
                    buf_wt += wt
                else:
                    new_ph.append(buf_ph)
                    new_wt.append(buf_wt)
                    buf_ph, buf_wt = ph, wt
            else:
                new_ph.append(buf_ph)
                new_wt.append(buf_wt)
                buf_ph, buf_wt = ph, wt
    if buf_ph:
        new_ph.append(buf_ph)
        new_wt.append(buf_wt)
    return new_ph, new_wt



def split_long_phrasesaaa(phrases, max_chars=80):
    new = []
    for p in phrases:
        if len(p) > max_chars:
            # on d√©coupe au premier ‚Äú,‚Äù ou ‚Äú et ‚Äù qu‚Äôon trouve
            parts = re.split(r",\s+| et ", p, maxsplit=1)
            new.extend([parts[0].strip(), parts[1].strip()] if len(parts)==2 else [p])
        else:
            new.append(p)
    return new

async def async_generate_translated_audio_with_sync_using_review(
    subtitle_source_path, output_audio_path,
    debug_log_path, review_file_path
):
    # ‚îÄ‚îÄ‚îÄ 1) G√©n√©ration / mise √† jour du review file ‚îÄ‚îÄ‚îÄ
    groups = generate_translation_review_file(
        subtitle_source_path,
        review_file_path,
        max_group_duration_secs=25.0
    )

    # ‚îÄ‚îÄ‚îÄ 2) Lecture du review file enrichi ‚îÄ‚îÄ‚îÄ
    segments = parse_review_file(review_file_path)

    combined = AudioSegment.silent(duration=0)
    debug    = []

    for idx, seg in enumerate(segments):
        # ‚îÄ‚îÄ‚îÄ 3) R√©cup√©rations de base ‚îÄ‚îÄ‚îÄ
        start_s  = seg["start_s"]
        end_s    = seg["end_s"]
        total_ms = int((end_s - start_s) * 1000)

        text   = seg["final_translation"]
        rate   = seg["voice_speed"]
        pre_ms = seg["pre_silence"]
        post_ms= seg["post_silence"]

        # ‚îÄ‚îÄ‚îÄ 4) Utilisation des phrases d√©finies dans le review file ‚îÄ‚îÄ‚îÄ
        # si l'utilisateur a list√© ses phrases, on les prend ; sinon fallback automatique
        # if seg.get("phrases"):
        #     phrases = seg["phrases"]
        # else:
        phrases = split_french_phrases(text)
        # calcul des poids sur texte final
        weights = calculate_phrase_weights(text, phrases)
        # fusion √©ventuelle de mini-phrases trop courtes
        phrases, weights = merge_short_phrases(phrases, weights, min_chars=40, max_chars=None)

        # ‚îÄ‚îÄ‚îÄ 6) Budget TTS versus silences pr√©/post ‚îÄ‚îÄ‚îÄ
        content_ms = max(0, total_ms - pre_ms - post_ms)

        # ‚îÄ‚îÄ‚îÄ 7) Synth√®se phrase par phrase avec retry ‚îÄ‚îÄ‚îÄ
        phrase_audios = []
        for i, ph in enumerate(phrases):
            dur_s   = (content_ms * weights[i]) / 1000.0
            tmp_mp3 = os.path.join(tempfile.gettempdir(), f"tmp_{idx}_{i}.mp3")

            await robust_synthesize_phrase(
                ph, tmp_mp3,
                voice="fr-FR-DeniseNeural",
                rate=rate
            )
            aud = AudioSegment.from_mp3(tmp_mp3)
            os.remove(tmp_mp3)

            # On ajuste strictement √† la dur√©e allou√©e
            aud = adjust_audio_duration(aud, dur_s)
            phrase_audios.append(aud)

        # ‚îÄ‚îÄ‚îÄ 8) Anti-d√©passement TTS seul ‚îÄ‚îÄ‚îÄ
        sum_tts = sum(a.duration_seconds * 1000 for a in phrase_audios)
        if sum_tts > content_ms and sum_tts > 0:
            factor = content_ms / sum_tts
            phrase_audios = [
                change_playback_speed(a, factor)
                for a in phrase_audios
            ]
            sum_tts = sum(a.duration_seconds * 1000 for a in phrase_audios)

        # ‚îÄ‚îÄ‚îÄ 9) Silences internes (manuels ou automatiques) ‚îÄ‚îÄ‚îÄ
        n_inter   = max(0, len(phrases) - 1)
        # si l'utilisateur a rempli inter_phrase_silences dans le review file, on l'utilise
        manual_inters = seg.get("inter_phrase_silences", [])
        if manual_inters and len(manual_inters) == n_inter:
            inter_applied = manual_inters
        else:
            # budget restant en ms pour inter-phrases
            available = content_ms - sum(a.duration_seconds * 1000 for a in phrase_audios)
            if n_inter > 0 and available > 0:
                sil_ms = available // n_inter
                inter_applied = [sil_ms] * n_inter
            else:
                inter_applied = [0] * n_inter
                
                

        # ‚îÄ‚îÄ‚îÄ 10) Reconstruction du segment ‚îÄ‚îÄ‚îÄ
        seq = []
        for i, aud in enumerate(phrase_audios):
            seq.append(aud)
            if i < len(inter_applied):
                seq.append(AudioSegment.silent(duration=inter_applied[i]))

        seg_audio = AudioSegment.silent(duration=pre_ms)
        for clip in seq:
            seg_audio += clip
        seg_audio += AudioSegment.silent(duration=post_ms)

        # ‚îÄ‚îÄ‚îÄ 11) Strip des silences internes TTS ind√©sirables ‚îÄ‚îÄ‚îÄ
        nons = detect_nonsilent(seg_audio, min_silence_len=1,
                                silence_thresh=seg_audio.dBFS - 16)
        if nons:
            seg_audio = seg_audio[nons[0][0]:]
        seg_audio = AudioSegment.silent(duration=pre_ms) + seg_audio

        # ‚îÄ‚îÄ‚îÄ 12) Pad ou trim strict au total_ms ‚îÄ‚îÄ‚îÄ
        if len(seg_audio) < total_ms:
            seg_audio += AudioSegment.silent(duration=(total_ms - len(seg_audio)))
        seg_audio = seg_audio[:total_ms]

        # ‚îÄ‚îÄ‚îÄ 13) Debug timings ‚îÄ‚îÄ‚îÄ
        nons2 = detect_nonsilent(seg_audio, min_silence_len=1,
                                 silence_thresh=seg_audio.dBFS - 16)
        start_a = nons2[0][0] if nons2 else pre_ms
        end_a   = nons2[-1][1] if nons2 else (total_ms - post_ms)
        abs_s_a = int(start_s * 1000) + start_a
        abs_e_a = int(start_s * 1000) + end_a
        abs_s_v = int(start_s * 1000)
        abs_e_v = int(end_s   * 1000)
        decal_start = abs_s_a - abs_s_v
        decal_end   = abs_e_a - abs_e_v

        # ‚îÄ‚îÄ‚îÄ 14) Warp global si n√©cessaire ‚îÄ‚îÄ‚îÄ
        gen_dur = seg_audio.duration_seconds
        diff    = (end_s - start_s) - gen_dur
        if abs(diff) > 0.20:
            seg_audio = change_playback_speed(seg_audio, (end_s - start_s) / gen_dur)

        # ‚îÄ‚îÄ‚îÄ 15) Mix sur la timeline avec positionnement absolu ‚îÄ‚îÄ‚îÄ
        base_ms   = int(start_s * 1000)
        soff      = seg.get("start_offset_ms", 0)
        eoff      = seg.get("end_offset_ms",   0)

        # 1) Appliquer l‚Äôoffset de fin sur seg_audio
        if eoff > 0:
            seg_audio = seg_audio + AudioSegment.silent(duration=eoff)
        elif eoff < 0:
            seg_audio = seg_audio[:eoff]  # tronque les |eoff| derniers ms

        # 2) Calculer le point de d√©part absolu
        start_ms = base_ms + soff

        # 3) Forcer combined √† exactement start_ms
        if len(combined) < start_ms:
            # la timeline est trop courte ‚Üí on padde jusqu‚Äô√† start_ms
            combined += AudioSegment.silent(duration=(start_ms - len(combined)))
        elif len(combined) > start_ms:
            # un pr√©c√©dent segment a d√©bord√© ‚Üí on tronque pour revenir √† start_ms
            combined = combined[:start_ms]

        # 4) Coller le segment audio
        combined += seg_audio


        # ‚îÄ‚îÄ‚îÄ 16) Log debug ‚îÄ‚îÄ‚îÄ
        debug.append(
            f"Segment {idx+1} ({start_s:.2f}-{end_s:.2f}s): "
            f"pre={pre_ms}ms, post={post_ms}ms, speed={rate}, "
            f"silences_internal={inter_applied}, "
            f"d√©cal_start={decal_start}ms, d√©cal_end={decal_end}ms, "
            f"phrases={phrases}\n"
        )


    # ‚îÄ‚îÄ‚îÄ 17) Export debug & wav ‚îÄ‚îÄ‚îÄ
    with open(debug_log_path, "w", encoding="utf-8") as df:
        df.write("Translation Debug Log\n\n")
        df.writelines(debug)
    combined.export(output_audio_path, format="wav")

    return output_audio_path



# ============== Merge Audio and Video Function ==============
def merge_audio_video():
    video = VideoFileClip(input_video)
    audio = AudioFileClip(translated_audio)
    if audio.duration < video.duration:
        extra_silence = AudioSegment.silent(duration=(video.duration - audio.duration) * 1000)
        audio_path_temp = os.path.join(output_dir, "temp_full_audio.wav")
        audio_seg = AudioSegment.from_file(translated_audio, format="wav")
        full_audio = audio_seg + extra_silence
        full_audio.export(audio_path_temp, format="wav")
        audio = AudioFileClip(audio_path_temp)
    video = video.set_audio(audio)
    video.write_videofile(
        output_video,
        codec="libx264",
        audio_codec="aac",
        temp_audiofile="temp-audio.m4a",
        remove_temp=True,
        threads=4
    )

# ============== Main Asynchronous Flow ==============
async def async_main():
    print("Extracting audio...")
    audio_path = extract_audio()
    print("Transcribing audio...")
    language, segments = transcribe(audio_path)
    print("Generating English subtitles...")
    generate_subtitle_file(segments, subtitle_file_en)
    print("Generating French audio with synchronization and manual overrides...")
    await async_generate_translated_audio_with_sync_using_review(subtitle_file_en, translated_audio, debug_log_file, review_file)
    print("Merging audio and video...")
    merge_audio_video()
    print(f"Process completed! Output video: {output_video}")

if __name__ == "__main__":
    asyncio.run(async_main())




‚úÖ ffmpeg found at: C:\ffmpeg\bin\ffmpeg.EXE
Extracting audio...
Transcribing audio...
Detected language: en
Generating English subtitles...
Generating French audio with synchronization and manual overrides...
üîç Chargement des sous-titres depuis : 4.2.4_Configuration de la solution_Avr_10_Latest_run_20250430_175831\4.2.4_Configuration de la solution_Avr_10_Latest-english.srt
üß† Regroupement initial par ponctuation finale
‚úÇÔ∏è D√©coupage des groupes trop longs‚Ä¶
‚ö†Ô∏è Merge groupe 0 et 1 car fin de phrase absente
‚ö†Ô∏è Merge groupe 0 et 1 car fin de phrase absente
‚ö†Ô∏è Merge groupe 1 et 2 car fin de phrase absente
‚ö†Ô∏è Merge groupe 1 et 2 car fin de phrase absente
‚ö†Ô∏è Merge groupe 1 et 2 car fin de phrase absente
‚ö†Ô∏è Merge groupe 2 et 3 car fin de phrase absente
‚ö†Ô∏è Merge groupe 2 et 3 car fin de phrase absente
‚ö†Ô∏è Merge groupe 2 et 3 car fin de phrase absente
‚ö†Ô∏è Merge groupe 2 et 3 car fin de phrase absente
‚ö†Ô∏è Merge groupe 2 et 3 car fin de phrase abs

                                                                        

MoviePy - Done.
Moviepy - Writing video 4.2.4_Configuration de la solution_Avr_10_Latest_run_20250430_175831\4.2.4_Configuration de la solution_Avr_10_Latest-french.mp4



                                                                  

Moviepy - Done !
Moviepy - video ready 4.2.4_Configuration de la solution_Avr_10_Latest_run_20250430_175831\4.2.4_Configuration de la solution_Avr_10_Latest-french.mp4
Process completed! Output video: 4.2.4_Configuration de la solution_Avr_10_Latest_run_20250430_175831\4.2.4_Configuration de la solution_Avr_10_Latest-french.mp4


24apr perplexity

In [None]:
import os
import re
import ffmpeg
import pysrt
import time
from deep_translator import GoogleTranslator
from pydub import AudioSegment
from moviepy.editor import VideoFileClip, AudioFileClip
from faster_whisper import WhisperModel
from shutil import which
import nest_asyncio
from datetime import datetime
import tempfile
import asyncio
import edge_tts
import aiohttp
import ssl
import random
from pydub.silence import detect_nonsilent

nest_asyncio.apply()

# ----- Configuration -----
ffmpeg_path = which("ffmpeg")
if not ffmpeg_path:
    raise RuntimeError("ffmpeg not found. Please install ffmpeg first.")
print(f"‚úÖ ffmpeg found at: {ffmpeg_path}")

input_video = "to translate/4.2.4_Configuration de la solution_Avr_10_Latest.mp4"
base_name = os.path.splitext(os.path.basename(input_video))[0]
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_dir = f"{base_name}_run_{timestamp}"
model_size = "small"
update_existing = True

# Files and paths
os.makedirs(output_dir, exist_ok=True)
input_video_name = os.path.splitext(os.path.basename(input_video))[0]
extracted_audio = os.path.join(output_dir, f"{input_video_name}-extracted-audio.wav")
subtitle_file_en = os.path.join(output_dir, f"{input_video_name}-english.srt")
translated_audio = os.path.join(output_dir, f"{input_video_name}-french.wav")
output_video = os.path.join(output_dir, f"{input_video_name}-french.mp4")
review_file = os.path.join(output_dir, "translation_review.txt")
debug_log_file = os.path.join(output_dir, "translation_debug_log.txt")


# ============== Helper Functions ==============

def extract_audio():
    try:
        (ffmpeg
         .input(input_video)
         .output(extracted_audio, ac=1, ar=16000)
         .overwrite_output()
         .run(capture_stdout=True, capture_stderr=True)
        )
        return extracted_audio
    except ffmpeg.Error as e:
        print("STDOUT:", e.stdout.decode("utf8"))
        print("STDERR:", e.stderr.decode("utf8"))
        raise

def transcribe(audio_path):
    model = WhisperModel(model_size, device="cpu", compute_type="int8")
    segments, info = model.transcribe(audio_path, beam_size=5)
    language = info.language
    print(f"Detected language: {language}")
    transcript_segments = []
    for segment in segments:
        transcript_segments.append({
            "start": segment.start,
            "end": segment.end,
            "text": segment.text.strip()
        })
    return language, transcript_segments

def time_to_subrip(seconds: float) -> pysrt.SubRipTime:
    hours = int(seconds // 3600)
    seconds %= 3600
    minutes = int(seconds // 60)
    seconds %= 60
    milliseconds = int((seconds - int(seconds)) * 1000)
    return pysrt.SubRipTime(hours=hours, minutes=minutes, seconds=int(seconds), milliseconds=milliseconds)

def generate_subtitle_file(segments, output_path):
    subs = pysrt.SubRipFile()
    for i, segment in enumerate(segments, 1):
        sub = pysrt.SubRipItem(
            index=i,
            start=time_to_subrip(segment["start"]),
            end=time_to_subrip(segment["end"]),
            text=segment["text"]
        )
        subs.append(sub)
    subs.save(output_path, encoding="utf-8")
    return output_path


# ============== Translation & Review Functions ==============

def split_long_groups(groups, max_group_duration_secs):
    new_groups = []
    for group in groups:
        start_s = group[0].start.ordinal / 1000
        end_s   = group[-1].end.ordinal   / 1000
        total   = end_s - start_s
        if total <= max_group_duration_secs:
            new_groups.append(group)
            continue

        temp = []
        temp_start = start_s
        last_safe_idx = None
        for idx, item in enumerate(group):
            temp.append(item)
            if re.search(r"[.,!?]$", item.text.strip()):
                last_safe_idx = idx

            current_end = item.end.ordinal / 1000
            if (current_end - temp_start) >= max_group_duration_secs:
                if last_safe_idx is not None:
                    safe_group = temp[: last_safe_idx+1 ]
                    new_groups.append(safe_group)
                    temp = temp[last_safe_idx+1 :]
                    temp_start = temp[0].start.ordinal / 1000 if temp else current_end
                else:
                    new_groups.append(temp)
                    temp = []
                    temp_start = current_end
                last_safe_idx = None

        if temp:
            new_groups.append(temp)

    return new_groups

def validate_audio_duration(original_segment, translated_audio):
    video_dur = original_segment['end'] - original_segment['start']
    audio_dur = translated_audio.duration_seconds
    
    if abs(video_dur - audio_dur) > 0.5:  # 500ms tolerance
        compensation = (video_dur - audio_dur) * 1000  # ms
        if compensation > 0:
            return translated_audio + AudioSegment.silent(duration=compensation)
        else:
            return translated_audio[:int(compensation*1000)]  # ms to samples
    return translated_audio

def adjust_audio_duration(audio: AudioSegment, target_secs: float) -> AudioSegment:
    target_ms = int(target_secs * 1000)
    curr_ms   = len(audio)
    if curr_ms > target_ms:
        return audio[:target_ms]
    elif curr_ms < target_ms:
        return audio + AudioSegment.silent(duration=(target_ms - curr_ms))
    return audio

def split_french_phrases(text):
    # Improved splitting on punctuation followed by uppercase or accented uppercase letters
    phrases = re.split(r"(?<=[.!?])\s+(?=[A-Z√Ä√Ç√â√à√ä√ã√é√è√î≈í√ô√õ√ú])", text)
    return [phrase.strip() for phrase in phrases if phrase.strip()]

def calculate_phrase_weights(original_text, translated_phrases):
    fr_phrase_word_counts = [len(phrase.split()) for phrase in translated_phrases]
    total_fr_words = sum(fr_phrase_word_counts)
    if total_fr_words == 0:
        return [1 / len(translated_phrases)] * len(translated_phrases)
    return [count / total_fr_words for count in fr_phrase_word_counts]

async def robust_synthesize_phrase(phrase: str, output_path: str, voice: str = "fr-FR-DeniseNeural", rate: str = "+0%", max_retries: int = 5):
    for attempt in range(max_retries):
        try:
            async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=30)) as session:
                communicate = edge_tts.Communicate(
                    text=phrase,
                    voice=voice,
                    rate=rate
                )
                print(f"[Debug] Attempt {attempt+1}: Synthesizing phrase: '{phrase}'")
                await communicate.save(output_path)
                print(f"[Debug] Phrase synthesized successfully to {output_path}")
                return
        except Exception as e:
            wait_time = 2 ** attempt + random.uniform(0, 1)
            print(f"[Warning] Synthesis failed on attempt {attempt+1} for phrase '{phrase}': {e}. Retrying in {wait_time:.1f}s...")
            await asyncio.sleep(wait_time)
    raise RuntimeError(f"Failed to synthesize phrase after {max_retries} attempts: {phrase}")

def change_playback_speed(sound: AudioSegment, speed=1.0):
    # Change playback speed without altering pitch drastically
    # Using pydub speedup with small chunk size and crossfade
    if speed == 1.0:
        return sound
    return sound.speedup(playback_speed=speed, chunk_size=150, crossfade=25)

# ============== Main Function with Improved Inter-Phrase Silence Handling ==============

async def async_generate_translated_audio_with_sync_using_review(
    subtitle_source_path, output_audio_path,
    debug_log_path, review_file_path
):
    # Step 1: Generate or update review file (user validation/modification)
    groups = generate_translation_review_file(
        subtitle_source_path,
        review_file_path,
        from_lang="en", to_lang="fr",
        max_group_duration_secs=25.0
    )
    
   
    print(f"Generated {len(groups)} groups for review.")
    # Step 2: Parse enriched review file
    segments = parse_review_file(review_file_path)

    combined = AudioSegment.silent(duration=0)
    debug    = []

    for idx, seg in enumerate(segments):
        start_s = seg["start_s"]
        end_s   = seg["end_s"]
        total_ms= int((end_s - start_s)*1000)

        text       = seg.get("final_translation", seg.get("original", ""))
        rate       = seg.get("voice_speed", "+0%")
        pre_ms     = seg.get("pre_silence", 0.0)
        post_ms    = seg.get("post_silence", 100.0)
        inter_user = seg.get("inter_phrase_silences", [])

        phrases   = split_french_phrases(text)
        if not phrases:
            print(f"[Warning] No phrases found for segment {idx+1}. Skipping.")
            debug.append(f"Segment {idx+1}: No phrases found. Skipping.\n")
            continue

        weights   = calculate_phrase_weights(text, phrases)
        content_ms = max(0, total_ms - pre_ms - post_ms)

        phrase_audios = []
        for i, ph in enumerate(phrases):
            dur      = content_ms * weights[i] / 1000.0
            tmp_path = os.path.join(tempfile.gettempdir(), f"tmp_{idx}_{i}.mp3")
            await robust_synthesize_phrase(ph, tmp_path, voice="fr-FR-DeniseNeural", rate=rate)
            aud = AudioSegment.from_mp3(tmp_path)
            os.remove(tmp_path)
            aud = adjust_audio_duration(aud, dur)
            phrase_audios.append(aud)

        # Prevent TTS audio from exceeding content duration
        sum_tts = sum(a.duration_seconds*1000 for a in phrase_audios)
        if sum_tts > content_ms and sum_tts > 0:
            factor = content_ms / sum_tts
            phrase_audios = [change_playback_speed(a, factor) for a in phrase_audios]
            sum_tts = sum(a.duration_seconds*1000 for a in phrase_audios)

        # Adjust inter-phrase silences to fit within available time
        available = content_ms - sum_tts
        if available <= 0:
            inter_applied = [0]*len(inter_user)
        elif sum(inter_user) <= available:
            inter_applied = inter_user.copy()
        else:
            factor = available / sum(inter_user)
            inter_applied = [int(ms*factor) for ms in inter_user]

        # Rebuild audio sequence with applied silences
        seq = []
        for i, aud in enumerate(phrase_audios):
            seq.append(aud)
            if i < len(inter_applied):
                seq.append(AudioSegment.silent(duration=inter_applied[i]))

        seg_audio = AudioSegment.silent(duration=pre_ms)
        for clip in seq:
            seg_audio += clip
        seg_audio += AudioSegment.silent(duration=post_ms)

        # Trim leading silence to first non-silent audio
        non = detect_nonsilent(seg_audio, min_silence_len=1, silence_thresh=seg_audio.dBFS-16)
        if non:
            seg_audio = seg_audio[non[0][0]:]
        seg_audio = AudioSegment.silent(duration=pre_ms) + seg_audio

        # Pad or trim to exact segment duration
        if len(seg_audio) < total_ms:
            seg_audio += AudioSegment.silent(duration=(total_ms - len(seg_audio)))
        seg_audio = seg_audio[:total_ms]

        # Debug timing info
        non2 = detect_nonsilent(seg_audio, min_silence_len=1, silence_thresh=seg_audio.dBFS-16)
        start_audio_ms = non2[0][0] if non2 else pre_ms
        end_audio_ms   = non2[-1][1] if non2 else (total_ms-post_ms)
        abs_start_a = int(start_s*1000)+start_audio_ms
        abs_end_a   = int(start_s*1000)+end_audio_ms
        abs_start_v = int(start_s*1000)
        abs_end_v   = int(end_s*1000)
        decal_start = abs_start_a - abs_start_v
        decal_end   = abs_end_a   - abs_end_v

        # Global speed warp if needed
        gen_dur = seg_audio.duration_seconds
        diff    = (end_s-start_s) - gen_dur
        if abs(diff) > 0.20:
            seg_audio = change_playback_speed(seg_audio, (end_s-start_s)/gen_dur)

        # Mix on timeline
        start_ms = int(start_s*1000)
        if len(combined) < start_ms:
            combined += AudioSegment.silent(duration=(start_ms-len(combined)))
        combined += seg_audio

        debug.append(
            f"Segment {idx+1} ({start_s:.2f}-{end_s:.2f}s): "
            f"pre={pre_ms}ms, post={post_ms}ms, speed={rate}, "
            f"inter_user={inter_user} ‚Üí inter_applied={inter_applied}, "
            f"d√©cal_start={decal_start}ms, d√©cal_end={decal_end}ms, "
            f"phrases={phrases}\n"
        )

    # Export debug log and final audio
    with open(debug_log_path, "w", encoding="utf-8") as df:
        df.write("Translation Debug Log\n\n")
        df.writelines(debug)
    combined.export(output_audio_path, format="wav")
    return output_audio_path


# ============== Review File Parsing ==============

def parse_review_file(review_file_path):
    text = open(review_file_path, encoding="utf-8").read()
    blocks = [b.strip() for b in re.split(r"(?m)^-{3,}\s*$", text) if b.strip()]
    segments = []
    header_re = re.compile(
        r"Segment\s+\d+\s+\(start:\s*([0-9.]+)s,\s*end:\s*([0-9.]+)s\)", re.I
    )

    for blk in blocks:
        if blk.startswith("Translation Review File"):
            continue
        m = header_re.search(blk)
        if not m:
            continue

        start_s = float(m.group(1))
        end_s   = float(m.group(2))

        orig    = None
        ft      = None
        vs      = "+0%"
        pre     = 0.0
        post    = 100.0
        inter   = []

        for line in blk.splitlines():
            line = line.strip()
            if line.startswith("**Original:**"):
                orig = line.split("**Original:**",1)[1].strip()
            elif line.startswith("**Final Translation:**"):
                ft   = line.split("**Final Translation:**",1)[1].strip()
            elif line.startswith("**Voice Speed:**"):
                vs   = line.split("**Voice Speed:**",1)[1].strip()
            elif line.startswith("**Pre-Silence:**"):
                try:
                    pre = float(line.split("**Pre-Silence:**",1)[1].strip())
                except ValueError:
                    pass
            elif line.startswith("**Post-Silence:**"):
                try:
                    post = float(line.split("**Post-Silence:**",1)[1].strip())
                except ValueError:
                    pass
            elif line.startswith("**Inter-Phrase-Silence:**"):
                vals = line.split("**Inter-Phrase-Silence:**",1)[1].strip()
                if vals:
                    try:
                        raw = [float(x) for x in vals.split(",")]
                        inter = [max(0, min(v, 5000)) for v in raw]
                    except ValueError:
                        inter = []

        if orig is None:
            raise RuntimeError(f"Segment sans **Original** dans {review_file_path}")
        if ft is None:
            ft = orig

        # Improved phrase splitting
        phrases = split_french_phrases(ft)

        segments.append({
            "start_s":               start_s,
            "end_s":                 end_s,
            "original":              orig,
            "final_translation":     ft,
            "voice_speed":           vs,
            "pre_silence":           pre,
            "post_silence":          post,
            "inter_phrase_silences": inter,
            "phrases":               phrases
        })

    print(f"‚úÖ Parsed {len(segments)} segments from review file.")
    return segments


# ============== Placeholder for generate_translation_review_file (must be implemented) ==============


def generate_translation_review_file(
    source_path, review_file_path,
    from_lang="en", to_lang="fr",
    max_group_duration_secs: float = 25.0
):
    """
    1) Regroupe les sous-titres par phrases.
    2) Scinde les groupes trop longs.
    3) √âcrit le fichier de review en indiquant :
       - pre / post silence
       - budget total pour inter-phrases
       - inter-phrase-silence par d√©faut (0 pour chaque intervalle)
    """
    translator = GoogleTranslator(source=from_lang, target=to_lang)
    subs = pysrt.open(source_path)

    # 1) Regroupement par phrase
    sentence_end = re.compile(r"[.!?]\s*$")
    groups, current = [], []
    for sub in subs:
        current.append(sub)
        if sentence_end.search(sub.text):
            groups.append(current)
            current = []
    if current:
        groups.append(current)

    # 2) D√©coupe si trop long & enforcement ponctuation
    from itertools import chain
    def split_long_groups(groups, max_s):
        new = []
        for g in groups:
            start, end = g[0].start.ordinal/1000, g[-1].end.ordinal/1000
            if end-start <= max_s:
                new.append(g)
            else:
                # simple d√©coupe au milieu
                mid = len(g)//2
                new.extend([g[:mid], g[mid:]])
        return new
    groups = split_long_groups(groups, max_group_duration_secs)

    def enforce_punctuation_boundaries(groups):
        i = 0
        while i < len(groups):
            last = groups[i][-1].text.strip()
            if not re.search(r"[.!?,;:]$", last):
                if i+1 < len(groups):
                    groups[i] += groups.pop(i+1)
                else:
                    groups[i][-1].text += "."
            else:
                i += 1
        return groups
    groups = enforce_punctuation_boundaries(groups)

    # 3) Write review file
    with open(review_file_path, "w", encoding="utf-8") as f:
        f.write("Translation Review File\n")
        f.write("Edit the **Final Translation** below. You can also adjust:\n")
        f.write("- **Voice Speed:** -10% to +10%\n")
        f.write("- **Pre-Silence/Post-Silence:** in milliseconds\n")
        f.write("- **Inter-Phrase-Silence:** comma-separated ms between phrases\n")
        f.write("  (must have one fewer value than phrases)\n")
        f.write("----------------------------------------------------------------\n\n")

        for idx, group in enumerate(groups, 1):
            start_s = group[0].start.ordinal/1000
            end_s   = group[-1].end.ordinal/1000
            original = " ".join(s.text for s in group)
            auto_tr  = translator.translate(text=original)

            # d√©coupe en phrases
            phrases = re.split(r"(?<=[.!?])\s+(?=[A-Z√Ä√Ç√â√à√ä√ã√é√è√î≈í√ô√õ√ú])", auto_tr)
            phrases = [p.strip() for p in phrases if p.strip()]

            # budget inter-phrases
            total_ms      = int((end_s - start_s)*1000)
            pre_ms, post_ms = 100, 100
            budget_inter  = max(0, total_ms - pre_ms - post_ms)

            # silences par d√©faut = 0
            inter_default = ",".join("0" for _ in range(len(phrases)-1))

            # on √©crit
            f.write(f"Segment {idx} (start: {start_s:.2f}s, end: {end_s:.2f}s):\n")
            f.write(f"**Original:** {original}\n")
            f.write(f"**Auto Translated:** {auto_tr}\n")
            f.write(f"**Final Translation:** {auto_tr}\n")
            f.write(f"**Voice Speed:** +0%\n")
            f.write(f"**Pre-Silence:** {pre_ms}\n")
            f.write(f"**Post-Silence:** {post_ms}\n")
            f.write(f"**Budget total Inter-Phrase-Silence (ms):** {budget_inter}\n")
            f.write(f"**Inter-Phrase-Silence:** {inter_default}\n")
            f.write("----------------------------------------------------------------\n\n")

    print(f"‚úÖ Review file created at: {review_file_path}  ({len(groups)} segments)")
    input("Type 'Y' when ready to continue: ")
    return groups


def adjust_review_file_based_on_debug_log(
    review_path: str,
    debug_log_path: str,
    output_review_path: str,
    threshold_ms: int = 50
):
    """
    Pour chaque segment i :
      - si decal_end < -threshold_ms ‚Üí audio trop court ‚Üí on ajoute |decal_end|ms en post-silence
      - si decal_start >  threshold_ms ‚Üí audio trop t√¥t  ‚Üí on ajoute decal_start ms en pre-silence
    R√©√©crit un nouveau review_file.
    """
    # 1) on lit tout
    review_txt = open(review_path, encoding="utf-8").read()
    debug_txt  = open(debug_log_path, encoding="utf-8").read()

    # 2) on r√©cup√®re les d√©calages par segment
    #    ex: "Segment 3 (‚Ä¶): ‚Ä¶ d√©calage_end=-1686ms"
    decal_ends = [int(m) for m in re.findall(r"Segment\s+(\d+).*?d√©calage_end=([\-0-9]+)ms", debug_txt)]
    decal_starts = [int(m) for m in re.findall(r"Segment\s+(\d+).*?d√©calage_start=([\-0-9]+)ms", debug_txt)]
    # on met dans dict {seg_index: (start,end)}
    shifts = {}
    for (i,end), (_,start) in zip(decal_ends, decal_starts):
        shifts[i] = {"start": start, "end": end}

    # 3) on rebuild le review file en ins√©rant les ajustements
    out_lines = []
    current_seg = 0
    for line in review_txt.splitlines(keepends=True):
        m = re.match(r"Segment\s+(\d+)\s+\(start:", line)
        if m:
            # nouveau segment
            current_seg = int(m.group(1))
            end = shifts.get(current_seg,{}).get("end", 0)
            start = shifts.get(current_seg,{}).get("start", 0)
            out_lines.append(line)
            continue

        # si on est sur la ligne **Pre-Silence:**
        if line.startswith("**Pre-Silence:**") and current_seg in shifts:
            start, end = shifts[current_seg]["start"], shifts[current_seg]["end"]
            add = start if start>threshold_ms else 0
            # on lit l'ancienne valeur
            old = float(re.search(r"\*\*Pre-Silence:\*\*\s*([\d.]+)", line).group(1))
            new = old + add
            line = re.sub(r"\*\*Pre-Silence:\*\*\s*[\d.]+", f"**Pre-Silence:** {new}", line)
            out_lines.append(line)
            continue

        # si on est sur la ligne **Post-Silence:**
        if line.startswith("**Post-Silence:**") and current_seg in shifts:
            start, end = shifts[current_seg]["start"], shifts[current_seg]["end"]
            add = abs(end) if end< -threshold_ms else 0
            old = float(re.search(r"\*\*Post-Silence:\*\*\s*([\d.]+)", line).group(1))
            new = old + add
            line = re.sub(r"\*\*Post-Silence:\*\*\s*[\d.]+", f"**Post-Silence:** {new}", line)
            out_lines.append(line)
            continue

        out_lines.append(line)

    # 4) on √©crit le nouveau review file
    with open(output_review_path, "w", encoding="utf-8") as f:
        f.write("".join(out_lines))

    print(f"‚úÖ Adjusted review written to {output_review_path}")


# ============== Main Entry Point ==============

async def main():
    print("Extracting audio...")
    extract_audio()

    print("Transcribing audio...")
    language, segments = transcribe(extracted_audio)

    print("Generating subtitle file...")
    generate_subtitle_file(segments, subtitle_file_en)

    print("Generating translated audio with sync...")
    await async_generate_translated_audio_with_sync_using_review(
        subtitle_file_en,
        translated_audio,
        debug_log_file,
        review_file
    )

    print("Done!")

if __name__ == "__main__":
    asyncio.run(main())


In [18]:
import time
from pydub import AudioSegment
from gtts import gTTS
import tempfile
import os

# Segment 3 metadata
segment_id = 3
start_sec = 41.80
end_sec = 101.72
video_duration = end_sec - start_sec  # ~59.92 seconds

final_translation = (
    "√Ä partir de la page d'accueil, cliquez sur le navigateur et acc√©dez aux dimensions. "
    "Maintenant, je vais s√©lectionner le cube, et pour la dimension du compte, je vais rechercher "
    "le membre pour lequel je souhaite mettre √† jour la formule du membre. Une fois que le membre est mis en √©vidence, "
    "nous pouvons modifier les propri√©t√©s de formule de membres. Nous pouvons s√©lectionner le cube pour voir la formule du membre. "
    "La formule des membres est utilis√©e n'importe o√π dans l'application. Ici, le mat√©riel OFS sera divis√© par le volume OFS. "
    "Une fois que nous avons la formule des membres, nous pouvons la valider et nous pouvons voir qu'elle est valid√©e avec succ√®s. "
    "Nous pouvons enregistrer, r√©initialiser ou annuler. Je vais annuler dans ce cas."
)

inter_phrase_silences = [1000, 1000, 1000, 500, 500, 300, 250, 150]  # in ms
pre_silence_ms = 0
post_silence_ms = 100
voice_speed = "-5%"  # speed not yet handled in gTTS, but can be slowed using audio stretch if needed
expected_offset_ms = 100  # declared "D√©calage"

# Step 1: Split text into phrases (by periods)
phrases = [phrase.strip() for phrase in final_translation.split('.') if phrase.strip()]

# Step 2: Generate and stitch audio
tts_segments = []

for idx, phrase in enumerate(phrases):
    print(f"[TTS] Synthesizing phrase {idx+1}/{len(phrases)}: {phrase}")
    tts = gTTS(text=phrase, lang='fr')
    
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tf:
        tmp_path = tf.name
        tts.save(tmp_path)

    # Load it after closing
    audio = AudioSegment.from_file(tmp_path, format="mp3")
    os.unlink(tmp_path)


    tts_segments.append(audio)

    # Add inter-phrase silence if not the last segment
    if idx < len(phrases) - 1:
        silence = AudioSegment.silent(duration=inter_phrase_silences[idx] if idx < len(inter_phrase_silences) else 0)
        tts_segments.append(silence)

# Step 3: Add pre- and post-silence
final_audio = AudioSegment.silent(duration=pre_silence_ms)
for seg in tts_segments:
    final_audio += seg
final_audio += AudioSegment.silent(duration=post_silence_ms)

# Step 4: Export and analyze duration
output_path = f"segment_{segment_id}_test_audio.mp3"
final_audio.export(output_path, format="mp3")

# Duration in seconds
audio_duration_sec = len(final_audio) / 1000.0
decalage_sec = audio_duration_sec - video_duration

print("\nüîç Sync Analysis for Segment 3")
print(f"üïò Video segment duration   : {video_duration:.2f} seconds")
print(f"üîä Generated audio duration : {audio_duration_sec:.2f} seconds")
print(f"üß≠ D√©calage (audio - video) : {decalage_sec:+.2f} seconds")
print(f"üìÅ Output audio saved at    : {output_path}")


[TTS] Synthesizing phrase 1/9: √Ä partir de la page d'accueil, cliquez sur le navigateur et acc√©dez aux dimensions
[TTS] Synthesizing phrase 2/9: Maintenant, je vais s√©lectionner le cube, et pour la dimension du compte, je vais rechercher le membre pour lequel je souhaite mettre √† jour la formule du membre
[TTS] Synthesizing phrase 3/9: Une fois que le membre est mis en √©vidence, nous pouvons modifier les propri√©t√©s de formule de membres
[TTS] Synthesizing phrase 4/9: Nous pouvons s√©lectionner le cube pour voir la formule du membre
[TTS] Synthesizing phrase 5/9: La formule des membres est utilis√©e n'importe o√π dans l'application
[TTS] Synthesizing phrase 6/9: Ici, le mat√©riel OFS sera divis√© par le volume OFS
[TTS] Synthesizing phrase 7/9: Une fois que nous avons la formule des membres, nous pouvons la valider et nous pouvons voir qu'elle est valid√©e avec succ√®s
[TTS] Synthesizing phrase 8/9: Nous pouvons enregistrer, r√©initialiser ou annuler
[TTS] Synthesizing phrase 9/9

In [1]:
import asyncio
from pydub import AudioSegment
from pydub.silence import detect_nonsilent
import edge_tts
import tempfile
import os

# Configuration - matches your review file example
SEGMENTS = [
    {   # Segment 1
        "text": "Nous allons voir les configurations de l'application EPM. Nous verrons comment cr√©er un r√¥le commercial ou une formule membre. Nous verrons comment la s√©curit√© fonctionne dans l'application EPM et nous couvrirons comment cr√©er et configurer des formulaires de donn√©es. La s√©curit√© dans l'EPM comprendra la s√©curit√© dimensionnelle, la s√©curit√© des artefacts,",
        "pre_silence": 0,
        "post_silence": 0,
        "speed": "+0%"
    },
    {   # Segment 2
        "text": "S√©curit√© par t√¢ches ou flux de travail, s√©curit√© pour les r√¥les commerciaux et s√©curit√© pour les donn√©es.",
        "pre_silence": 0,
        "post_silence": 100,
        "speed": "+0%"
    }
]

async def generate_segment(text: str, speed: str) -> AudioSegment:
    """Generate TTS audio with aggressive silence trimming"""
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
        communicate = edge_tts.Communicate(
            text=text,
            voice="fr-FR-DeniseNeural",
            rate=speed
        )
        await communicate.save(tmp.name)
        audio = AudioSegment.from_mp3(tmp.name)
        os.unlink(tmp.name)
        
        # Trim both edges aggressively
        non_silent = detect_nonsilent(audio, silence_thresh=-40, min_silence_len=50)
        if non_silent:
            start = non_silent[0][0]
            end = non_silent[-1][1]
            return audio[start:end]
        return audio

async def main():
    # Generate and process segments
    processed = []
    for idx, seg in enumerate(SEGMENTS):
        # Create silence pads
        pre = AudioSegment.silent(duration=seg["pre_silence"])
        post = AudioSegment.silent(duration=seg["post_silence"])
        
        # Generate TTS audio
        speech = await generate_segment(seg["text"], seg["speed"])
        
        # Combine elements
        full = pre + speech + post
        full.export(f"segment_{idx+1}.wav", format="wav")
        processed.append(full)
        print(f"Generated Segment {idx+1} ({len(full)}ms)")

    # Combine segments and analyze transition
    combined = processed[0] + processed[1]
    combined.export("combined_output.wav", format="wav")
    
    # Detect transition point
    transition_point = len(processed[0])
    samples = combined[transition_point-500:transition_point+500]  # 1s window
    
    # Find first speech after transition
    non_silent = detect_nonsilent(samples, silence_thresh=-40, min_silence_len=20)
    if non_silent:
        gap = non_silent[0][0] - 500  # Adjust for window offset
        print(f"\nTransition analysis:")
        print(f"Segment 1 ends at: {transition_point}ms")
        print(f"First speech in Segment 2 starts at: {transition_point + gap}ms")
        print(f"Actual gap between segments: {abs(gap)}ms")
    else:
        print("\nNo gap detected between segments")

if __name__ == "__main__":
    try:
        # Standard execution
        asyncio.run(main())
    except RuntimeError:
        # For Jupyter/notebook environments
        import nest_asyncio
        nest_asyncio.apply()
        asyncio.get_event_loop().run_until_complete(main())

PermissionError: [WinError 32] Le processus ne peut pas acc√©der au fichier car ce fichier est utilis√© par un autre processus: 'C:\\Users\\061181~1\\AppData\\Local\\Temp\\tmptkseujan.mp3'

In [None]:
import edge_tts

ssml_text = '''<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="fr-FR">
    <prosody rate="120%">Bonjour, comment allez-vous?</prosody>
    <break time="300ms"/>
    <prosody rate="100%">Je suis ravi de vous voir.</prosody>
    </speak>'''

# Use the synchronous version
edge_tts.Communicate(ssml_text, voice="fr-FR-DeniseNeural").save_sync("output.mp3")
print("‚úÖ Done!")

‚úÖ Done!


In [9]:
# First install if needed (remove if already installed)
!pip install edge-tts --upgrade

# Then run this cell:
import edge_tts

# PROPERLY FORMATTED SSML (no extra whitespace between tags)
ssml_text = (
    '<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="fr-FR">'
    '<prosody rate="120%">Bonjour, comment allez-vous?</prosody>'
    '<break time="300ms"/>'
    '<prosody rate="100%">Je suis ravi de vous voir.</prosody>'
    '</speak>'
)

# For Jupyter Notebooks - use the synchronous version
edge_tts.Communicate(ssml_text, voice="fr-FR-DeniseNeural").save_sync("output.mp3")

print("‚úÖ Synthesis complete! Playing result...")

# Play the audio directly in notebook
from IPython.display import Audio
Audio("output.mp3")

‚úÖ Synthesis complete! Playing result...


In [4]:
import re

def parse_segments(content):
    """
    Parse the content into segments using a regular expression that captures each segment.
    Returns a dictionary with segment numbers (as integers) as keys and the full segment text as values.
    """
    segments = {}
    # This pattern finds segments that start with "Segment <number>" and goes until the next segment or end of file.
    pattern = r"(Segment\s+(\d+).*?)(?=^Segment\s+\d+|\Z)"
    matches = re.findall(pattern, content, re.DOTALL | re.MULTILINE)
    for full_seg, seg_no in matches:
        segments[int(seg_no)] = full_seg
    return segments

def get_final_translation(segment_text):
    """
    Extracts the final translation text from a segment.
    It looks for the pattern **Final Translation:** followed by any text.
    """
    match = re.search(r"\*\*Final Translation:\*\*\s*(.*)", segment_text)
    if match:
        return match.group(1).strip()
    return None

def update_translation_review(review_content, updated_segments):
    """
    For each segment in the review file's content, replaces the **Final Translation:** text
    with the one from the updated segments.
    """
    # Parse the review content into segments using the same parser
    review_segments = {}
    pattern = r"(Segment\s+(\d+).*?)(?=^Segment\s+\d+|\Z)"
    matches = re.findall(pattern, review_content, re.DOTALL | re.MULTILINE)
    
    updated_content = review_content  # Work on a copy of the content

    for full_seg, seg_no in matches:
        seg_no_int = int(seg_no)
        if seg_no_int in updated_segments:
            updated_final = get_final_translation(updated_segments[seg_no_int])
            if updated_final is not None:
                # Replace the **Final Translation:** line in this segment with the new translation.
                # This regex pattern matches the '**Final Translation:**' line and captures the prefix.
                new_seg = re.sub(
                    r"(\*\*Final Translation:\*\*\s*).*",  # match the line starting with **Final Translation:**
                    r"\1" + updated_final,  # replace with the captured prefix plus the updated text
                    full_seg
                )
                # Replace the old segment in the content with the updated segment.
                updated_content = updated_content.replace(full_seg, new_seg)

    return updated_content

def main():
    # Define file paths (make sure these paths are correct)
    updated_file_path = "to translate/translation_review_latest_seg11_33_speed_moins10%.txt"
    review_file_path = "to translate/translation_review.txt"
    
    # Read updated file content
    with open(updated_file_path, encoding='utf-8') as f:
        updated_content = f.read()
    
    # Read review file content
    with open(review_file_path, encoding='utf-8') as f:
        review_content = f.read()
    
    # Parse segments from the updated content
    updated_segments = parse_segments(updated_content)
    
    # Replace the **Final Translation:** values in the review content
    new_review_content = update_translation_review(review_content, updated_segments)
    
    # Write the updated content to a new file (or overwrite the original file if desired)
    output_file_path = "to translate/translation_review_updated.txt"
    with open(output_file_path, "w", encoding='utf-8') as f:
        f.write(new_review_content)
    
    print(f"Updated translation review file has been saved as {output_file_path}")

if __name__ == "__main__":
    main()


Updated translation review file has been saved as to translate/translation_review_updated.txt


REPLACE ALL SECTIONS

In [1]:
import re

def parse_segments(content):
    """
    Parse the content into segments based on the segment header.
    Returns a dictionary with segment numbers (as integers) as keys and the full segment text as values.
    """
    segments = {}
    # The pattern finds segments that start with "Segment <number>" until the next segment or end of file.
    pattern = r"(Segment\s+(\d+).*?)(?=^Segment\s+\d+|\Z)"
    matches = re.findall(pattern, content, re.DOTALL | re.MULTILINE)
    for full_seg, seg_no in matches:
        segments[int(seg_no)] = full_seg
    return segments

def get_field_text(segment_text, field_name):
    """
    Extract the text following a given field marker (e.g., **Original:**, **Auto Translated:**, **Final Translation:**)
    from the segment text. Returns the text stripped of leading/trailing spaces.
    """
    # Using a regex pattern that stops at the end of the line.
    pattern = r"\*\*" + re.escape(field_name) + r":\*\*\s*(.*)"
    match = re.search(pattern, segment_text)
    if match:
        return match.group(1).strip()
    return None

def update_segment_fields(review_segment, updated_segment, fields):
    """
    Replace the specified fields in the review_segment with the ones extracted
    from the updated_segment.
    
    fields should be a list of field names such as ["Original", "Auto Translated", "Final Translation"].
    """
    updated_seg = review_segment
    for field in fields:
        new_text = get_field_text(updated_segment, field)
        if new_text is not None:
            # Replace the entire line for the field in the review segment.
            # The regex captures the marker and then replaces what follows.
            pattern = r"(\*\*" + re.escape(field) + r":\*\*\s*).*$"
            replacement = r"\1" + new_text
            updated_seg = re.sub(pattern, replacement, updated_seg, flags=re.MULTILINE)
    return updated_seg

def update_translation_review(review_content, updated_segments, fields):
    """
    For each segment in review_content, replace the specified fields with those from the updated segments.
    """
    # Find the segments in review content using the same segmentation parser.
    pattern = r"(Segment\s+(\d+).*?)(?=^Segment\s+\d+|\Z)"
    matches = re.findall(pattern, review_content, re.DOTALL | re.MULTILINE)

    updated_content = review_content  # Work on a copy to allow replacements.

    for full_seg, seg_no in matches:
        seg_no_int = int(seg_no)
        if seg_no_int in updated_segments:
            # Replace the fields (Original, Auto Translated, Final Translation) based on updated segments.
            new_seg = update_segment_fields(full_seg, updated_segments[seg_no_int],
                                            fields)
            # Replace the old segment in the overall content with the updated segment.
            updated_content = updated_content.replace(full_seg, new_seg)
    return updated_content

def main():
    # Define the file paths (adjust these paths as needed)
    updated_file_path = "to translate/translation_review_version-5%speed.txt"
    review_file_path = "to translate/translation_review.txt"
    
    # Read the updated file content
    with open(updated_file_path, encoding='utf-8') as f:
        updated_content = f.read()
    
    # Read the review file content
    with open(review_file_path, encoding='utf-8') as f:
        review_content = f.read()
    
    # Parse segments from the updated file content
    updated_segments = parse_segments(updated_content)
    
    # Define the list of fields to update.
    fields_to_update = ["Original", "Auto Translated", "Final Translation"]
    
    # Replace the fields in the review content using the updated segments
    new_review_content = update_translation_review(review_content, updated_segments,
                                                   fields_to_update)
    
    # Write the updated content to a new file (or overwrite the original file if desired)
    output_file_path = "to translate/translation_review_updated.txt"
    with open(output_file_path, "w", encoding='utf-8') as f:
        f.write(new_review_content)
    
    print(f"Updated translation review file has been saved as {output_file_path}")

if __name__ == "__main__":
    main()


Updated translation review file has been saved as to translate/translation_review_updated.txt


In [None]:
import whisper
from transformers import pipeline
from TTS.api import TTS
import moviepy.editor as mp
from pydub import AudioSegment
import numpy as np

def translate_video(input_path, output_path):
    # 1. Extract audio
    video = mp.VideoFileClip(input_path)
    audio_path = "temp_audio.wav"
    video.audio.write_audiofile(audio_path)

    # 2. Transcribe with Whisper (large model for accuracy)
    model = whisper.load_model("large-v3")
    result = model.transcribe(audio_path, language="en")
    segments = result["segments"]

    # 3. Translate with NLLB (state-of-the-art translation)
    translator = pipeline("translation", 
                        model="facebook/nllb-200-3.3B",
                        src_lang="eng_Latn", 
                        tgt_lang="fra_Latn")
    
    # Context-aware translation with sentence grouping
    translated_text = []
    current_group = ""
    for seg in segments:
        if len(current_group) + len(seg['text']) < 500:
            current_group += " " + seg['text']
        else:
            translated_group = translator(current_group)[0]['translation_text']
            translated_text.append(translated_group)
            current_group = seg['text']
    if current_group:
        translated_text.append(translator(current_group)[0]['translation_text'])

    # 4. Generate French speech with natural flow
    tts = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2",
             progress_bar=False, gpu=True)
    
    # Split translation into natural speech chunks
    full_audio = AudioSegment.silent(duration=0)
    for i, text in enumerate(translated_text):
        tts.tts_to_file(text=text, 
                       speaker_wav="fr_speaker_ref.wav",  # Provide reference audio
                       language="fr",
                       file_path=f"temp_{i}.wav")
        
        chunk = AudioSegment.from_wav(f"temp_{i}.wav")
        full_audio += chunk

    # 5. Synchronize with video
    new_audio = mp.AudioFileClip("final_audio.wav")
    final_video = video.set_audio(new_audio.set_duration(video.duration))
    final_video.write_videofile(output_path, codec="libx264", audio_codec="aac")

# Usage
translate_video("4.2.4_Configuration de la solution_Avr_10_Latest.mp4", "4.2.4_Configuration de la solution_Avr_10_Latest_fr.mp4")


In [None]:
import subprocess
import os

def translate_video_subdub(input_path, output_path, speaker_ref):
    """
    Full translation pipeline using Subdub (OSS)
    Requires Subdub installation and WhisperX/XTTS setup
    """
    # Clone Subdub repo if not exists
    if not os.path.exists("Subdub"):
        subprocess.run(["git", "clone", "https://github.com/lukaszliniewicz/Subdub.git"])
    
    # Install requirements (first time only)
    if not os.path.exists("Subdub/venv"):
        subprocess.run(["python", "-m", "venv", "Subdub/venv"])
        pip_cmd = "Subdub/venv/Scripts/pip" if os.name == "nt" else "Subdub/venv/bin/pip"
        subprocess.run([pip_cmd, "install", "-r", "Subdub/requirements.txt"])

    # Build Subdub command
    python_cmd = "Subdub/venv/Scripts/python" if os.name == "nt" else "Subdub/venv/bin/python"
    cmd = [
        python_cmd, "Subdub/Subdub.py",
        "-i", input_path,
        "-sl", "en", "-tl", "fr",
        "-task", "full",
        "-tts_voice", speaker_ref,
        "-whisper_model", "large-v3"
    ]
    
    # Run translation pipeline
    subprocess.run(cmd)
    
    # Move output file
    base_name = os.path.splitext(input_path)[0]
    subdub_output = f"{base_name}_dubbed.mp4"
    os.replace(subdub_output, output_path)

# Usage
translate_video_subdub(
    "4.2.4_Configuration de la solution_Avr_10_Latest.mp4",
    "4.2.4_Configuration de la solution_Avr_10_Latest_fr.mp4",
    "fr_speaker_ref.wav"
)


In [10]:
# -*- coding: utf-8 -*-
import os, re
import time
import math
import ffmpeg
from faster_whisper import WhisperModel
import pysrt
from translate import Translator
from gtts import gTTS
from pydub import AudioSegment
from moviepy.editor import VideoFileClip, AudioFileClip
import whisper
from shutil import which

# --- Configuration ---
ffmpeg_path = which("ffmpeg")
if not ffmpeg_path:
    raise RuntimeError("ffmpeg not found. Please install ffmpeg first.")
print(f"‚úÖ ffmpeg found at: {ffmpeg_path}")

# Configuration
input_video = "4.2.4_Configuration de la solution_Avr_10_Latest.mp4"  # Path to your input video
output_dir = "output"       # Output directory
model_size = "small"        # Whisper model size (tiny, base, small, medium, large)

# Create output directory if not exists
os.makedirs(output_dir, exist_ok=True)

# Derived paths
input_video_name = os.path.splitext(os.path.basename(input_video))[0]
extracted_audio = os.path.join(output_dir, f"{input_video_name}-extracted-audio.wav")
subtitle_file_en = os.path.join(output_dir, f"{input_video_name}-english.srt")
subtitle_file_fr = os.path.join(output_dir, f"{input_video_name}-french.srt")
translated_audio = os.path.join(output_dir, f"{input_video_name}-french.wav")
output_video = os.path.join(output_dir, f"{input_video_name}-french.mp4")

def extract_audio():
    """Extract audio from video using ffmpeg"""
    try:
        (
            ffmpeg
            .input(input_video)
            .output(extracted_audio, ac=1, ar=16000)  # Convert to mono 16kHz
            .overwrite_output()
            .run(capture_stdout=True, capture_stderr=True)
        )
        return extracted_audio
    except ffmpeg.Error as e:
        print('STDOUT:', e.stdout.decode('utf8'))
        print('STDERR:', e.stderr.decode('utf8'))
        raise
def transcribe(audio_path):
    """Transcribe audio using faster-whisper."""
    model = WhisperModel(model_size, device="cpu", compute_type="int8")
    segments, info = model.transcribe(audio_path, beam_size=5)

    # Access the language attribute directly from the TranscriptionInfo object
    language = info.language
    print(f"Detected language: {language}")

    # Process the segments into a list
    transcript_segments = []
    for segment in segments:
        transcript_segments.append({
            "start": segment.start,
            "end": segment.end,
            "text": segment.text.strip()
        })

    return language, transcript_segments

def generate_subtitle_file(segments, output_path):
    """Generate subtitle file from segments."""
    subs = pysrt.SubRipFile()
    
    for i, segment in enumerate(segments, 1):
        sub = pysrt.SubRipItem(
            index=i,
            start=time_to_subrip(segment["start"]),  # Access dictionary keys
            end=time_to_subrip(segment["end"]),      # Access dictionary keys
            text=segment["text"].strip()             # Access dictionary keys
        )
        subs.append(sub)
    
    subs.save(output_path, encoding='utf-8')
    return output_path

def time_to_subrip(seconds: float) -> pysrt.SubRipTime:
    """Convert seconds to SubRipTime format"""
    hours = int(seconds // 3600)
    seconds %= 3600
    minutes = int(seconds // 60)
    seconds %= 60
    milliseconds = int((seconds - int(seconds)) * 1000)
    return pysrt.SubRipTime(
        hours=hours,
        minutes=minutes,
        seconds=int(seconds),
        milliseconds=milliseconds
    )

def translate_subtitlesOLD(source_path, target_path, from_lang="en", to_lang="fr"):
    """Translate subtitles using translate library"""
    translator = Translator(to_lang=to_lang, from_lang=from_lang)
    subs = pysrt.open(source_path)
    
    for sub in subs:
        try:
            translated = translator.translate(sub.text)
            sub.text = translated
        except Exception as e:
            print(f"Translation error: {str(e)}")
            continue
    
    subs.save(target_path, encoding='utf-8')
    return target_path



from deep_translator import GoogleTranslator
import pysrt
# --- Configuration ---
update_existing = True  # Set to True to allow user updates of the transcript/translation

def translate_subtitles(source_path, target_path, from_lang="en", to_lang="fr"):
    """Translate subtitles using GoogleTranslator from deep_translator."""
    translator = GoogleTranslator(source=from_lang, target=to_lang)
    subs = pysrt.open(source_path)
    
    for sub in subs:
        try:
            # Translate using deep_translator's GoogleTranslator
            translated = translator.translate(text=sub.text)
            
            # Allow user to update translation manually if desired.
            if update_existing:
                print(f"\nSegment {sub.index}:")
                print(f"**Original:** {sub.text}")
                print(f"**Auto Translated:** {translated}")
                user_input = input("Press ENTER to accept or type your corrected translation: ").strip()
                if user_input:
                    translated = user_input
            
            sub.text = translated
        except Exception as e:
            print(f"Translation error in segment {sub.index}: {str(e)}")
            continue
    
    subs.save(target_path, encoding='utf-8')
    return target_path





def generate_translated_audio(subtitle_path, output_audio_path):
    """Generate translated audio track with timing"""
    subs = pysrt.open(subtitle_path)
    combined = AudioSegment.silent(duration=0)

    for sub in subs:
        start_time = sub.start.ordinal / 1000  # Convert to seconds
        text = sub.text

        # Generate TTS
        tts = gTTS(text, lang='fr')
        temp_path = os.path.join(output_dir, "temp.mp3")
        tts.save(temp_path)

        # Load and process audio
        audio = AudioSegment.from_mp3(temp_path)
        current_duration = len(combined)
        required_start = start_time * 1000  # Convert to milliseconds

        if required_start > current_duration:
            silence = AudioSegment.silent(duration=required_start - current_duration)
            combined += silence

        combined += audio
        os.remove(temp_path)  # Clean up temporary file

    combined.export(output_audio_path, format="wav")
    return output_audio_path



def merge_audio_video():
    """Merge translated audio with original video."""
    try:
        print("Merging audio with video...")
        video = VideoFileClip(input_video)
        audio = AudioSegment.from_file(translated_audio)  # Use pydub to load the audio

        # Set audio duration to match video duration
        video_duration_ms = int(video.duration * 1000)  # Convert video duration to milliseconds
        audio_duration_ms = len(audio)  # Get audio duration in milliseconds

        if audio_duration_ms < video_duration_ms:
            # Add silence to match the video duration
            silence = AudioSegment.silent(duration=(video_duration_ms - audio_duration_ms))
            audio += silence

        # Export the adjusted audio to a temporary file
        adjusted_audio_path = "adjusted_audio.wav"
        audio.export(adjusted_audio_path, format="wav")

        # Replace the video's audio with the adjusted audio
        adjusted_audio = AudioFileClip(adjusted_audio_path)
        video = video.set_audio(adjusted_audio)

        # Write the final video file
        video.write_videofile(
            output_video,
            codec="libx264",
            audio_codec="aac",
            temp_audiofile="temp-audio.m4a",
            remove_temp=True,
            threads=4
        )

        # Clean up temporary audio file
        os.remove(adjusted_audio_path)

        print(f"Process completed! Output video: {output_video}")
        return output_video

    except Exception as e:
        print(f"Failed to merge audio with video: {e}")
        raise


def group_subtitles_by_sentence(subs):
    """
    Group subtitles so that consecutive segments without sentence ending punctuation
    are combined. This returns a list of groups, each group is a list of subtitles.
    """
    groups = []
    current_group = []

    # A simple pattern to check for sentence-ending punctuation.
    sentence_end_pattern = re.compile(r'[.!?]\s*$')

    for sub in subs:
        current_group.append(sub)
        # If text ends with punctuation, consider the group complete.
        if sentence_end_pattern.search(sub.text):
            groups.append(current_group)
            current_group = []
    # Add any remaining subtitles as a group.
    if current_group:
        groups.append(current_group)
    return groups

def generate_translated_audio_grouped(subtitle_path, output_audio_path):
    """Generate a translated audio track by grouping subtitles by sentence."""
    subs = pysrt.open(subtitle_path)
    grouped_subs = group_subtitles_by_sentence(subs)
    combined_audio = AudioSegment.silent(duration=0)

    for group in grouped_subs:
        # Determine the start time based on the first subtitle in the group.
        group_start_time = group[0].start.ordinal / 1000  # in seconds
        # Merge texts from all subtitles in the group.
        group_text = " ".join([sub.text for sub in group])
        
        # Generate TTS for the grouped text
        tts = gTTS(group_text, lang='fr')
        temp_path = os.path.join(output_dir, "temp.mp3")
        tts.save(temp_path)
        audio = AudioSegment.from_mp3(temp_path)
        os.remove(temp_path)  # Cleanup temporary file

        # Calculate required start time (in milliseconds)
        required_start = int(group_start_time * 1000)
        current_duration = len(combined_audio)
        
        # Insert silence if the current combined audio is shorter than the required start time.
        if required_start > current_duration:
            silence = AudioSegment.silent(duration=required_start - current_duration)
            combined_audio += silence
        
        # Append the generated audio
        combined_audio += audio

    combined_audio.export(output_audio_path, format="wav")
    return output_audio_path













if __name__ == "__main__":
    # Step 1: Extract audio
    print("Extracting audio...")
    audio_path = extract_audio()
    
    # Step 2: Transcribe audio
    print("Transcribing audio...")
    language, segments = transcribe(audio_path)
    
    # Step 3: Generate English subtitles
    print("Generating English subtitles...")
    generate_subtitle_file(segments, subtitle_file_en)
    
    # Step 4: Translate to French
    print("Translating subtitles...")
    translate_subtitles(subtitle_file_en, subtitle_file_fr)
    
   
        # Step 5: Generate French audio with improved grouping
    print("Generating French audio (grouped by sentences)...")
    generate_translated_audio_grouped(subtitle_file_fr, translated_audio)
    
    # Step 6: Merge audio and video
    print("Merging audio and video...")
    merge_audio_video()
    
    print(f"Process completed! Output video: {output_video}")

‚úÖ ffmpeg found at: C:\ffmpeg\bin\ffmpeg.EXE
Extracting audio...
Transcribing audio...
Detected language: en
Generating English subtitles...
Translating subtitles...

Segment 1:
**Original:** will take a look at the configurations in EPM application. We will see how to
**Auto Translated:** Je vais jeter un ≈ìil aux configurations de l'application EPM. Nous verrons comment

Segment 2:
**Original:** create a business role or a member formula. We will see how the security
**Auto Translated:** cr√©er un r√¥le commercial ou une formule membre. Nous verrons comment la s√©curit√©

Segment 3:
**Original:** works in the EPM application and we will cover how to create and configure
**Auto Translated:** Fonctionne dans l'application EPM et nous couvrirons comment cr√©er et configurer

Segment 4:
**Original:** data forms. Security in EPM will include dimensional security, artifact security,
**Auto Translated:** formulaires de donn√©es. La s√©curit√© dans l'EPM comprendra la s√©curit√© dimensionne

                                                                        

MoviePy - Done.
Moviepy - Writing video output\4.2.4_Configuration de la solution_Avr_10_Latest-french.mp4



                                                                   

Moviepy - Done !
Moviepy - video ready output\4.2.4_Configuration de la solution_Avr_10_Latest-french.mp4
Process completed! Output video: output\4.2.4_Configuration de la solution_Avr_10_Latest-french.mp4
Process completed! Output video: output\4.2.4_Configuration de la solution_Avr_10_Latest-french.mp4
