In [None]:
import re

def remove_bold_phrases_until_separator(input_file, output_file):
    """
    Reads a translation review file, removes the "**Phrases:**" section (including the bold formatting)
    and its content until the next separator line, and writes the modified content to a new file.

    Args:
        input_file (str): Path to the input translation review file.
        output_file (str): Path to the output file without "**Phrases:**" sections.
    """
    try:
        with open(input_file, 'r', encoding='utf-8') as infile:
            content = infile.read()

        # Use regex to find and remove the "**Phrases:**" block until the separator
        modified_content = re.sub(r"\*\*Phrases:\*\*\n(?:- .*\n)*?(?=\n-{80,})", "", content, flags=re.DOTALL)

        with open(output_file, 'w', encoding='utf-8') as outfile:
            outfile.write(modified_content)

        print(f"Successfully processed '{input_file}'. The output has been written to '{output_file}'.")

    except FileNotFoundError:
        print(f"Error: Input file '{input_file}' not found.")
    except Exception as e:
        print(f"An error occurred: {e}")

if __name__ == "__main__":
    input_filename = "translation_review.txt"
    output_filename = "translation_review_modified.txt"
    remove_bold_phrases_until_separator(input_filename, output_filename)

In [None]:
import os
import re
import asyncio
import nest_asyncio
import edge_tts
import whisper
from shutil import which
from pydub import AudioSegment
from moviepy.video.io.VideoFileClip import VideoFileClip
from moviepy.audio.io.AudioFileClip import AudioFileClip
from tempfile import NamedTemporaryFile
from deep_translator import GoogleTranslator
import ollama
from datetime import datetime
import glob
import shutil
import gc
import openai
import time # Import time
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# --- Configuration ---
ffmpeg_path = which("ffmpeg")
if not ffmpeg_path:
    raise RuntimeError("ffmpeg not found. Please install ffmpeg first.")
print(f"✅ ffmpeg found at: {ffmpeg_path}")

SEGMENTS_DIR = "segments_temp"
os.makedirs(SEGMENTS_DIR, exist_ok=True)
video_file_path = "4.2.2_Flux de navigation_Avr_08_Latest.mp4"
VOICE_CHOICES = ["fr-CA-SylvieNeural", "fr-FR-DeniseNeural", "fr-CA-CHantalNeural"]
DEFAULT_VOICE = VOICE_CHOICES[0]
DEFAULT_RATE = "-15%"
OUTPUT_VIDEO = video_file_path+"_translated_video.mp4"
FINAL_AUDIO_FILE = video_file_path+"_final_voice.mp3"



✅ ffmpeg found at: C:\ffmpeg\bin\ffmpeg.EXE


In [2]:
DEFAULT_RATE = "-15%"

In [3]:
video_file_path = "4.2.4_Configuration de la solution_Avr_10_Latest.mp4"

In [2]:
# Cell 2: FFmpeg Configuration (Adjust path if necessary)
os.environ["PATH"] = r"C:\ffmpeg\bin" + ";" + os.environ["PATH"]
ffmpeg_path = which("ffmpeg")
print(f"FFmpeg path: {ffmpeg_path}")

FFmpeg path: C:\ffmpeg\bin\ffmpeg.EXE


In [3]:
# Cell 3: Apply nest_asyncio for asynchronous operations
nest_asyncio.apply()
print("nest_asyncio applied.")

nest_asyncio applied.


In [4]:
# --- Debugging Functions ---
def create_translation_log(debug_entries: list) -> str:
    try:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        log_file_path = f"translation_debug_{timestamp}.md"
        with open(log_file_path, "w", encoding="utf-8") as f:
            f.write("# Translation Debug Log\n\n")
            for entry in debug_entries:
                f.write(entry + "\n---\n")
        return log_file_path
    except Exception as e:
        print(f"Failed to create debug log: {e}")
        return None


In [5]:
# --- Core Functions ---
def chunk_text(text: str, max_length: int = 1000) -> list:
    sentences = re.split(r'(?<=[.!?])\s+', text)
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        if len(current_chunk) + len(sentence) > max_length:
            chunks.append(current_chunk.strip())
            current_chunk = sentence
        else:
            current_chunk += " " + sentence
    if current_chunk.strip():
        chunks.append(current_chunk.strip())
    return chunks

def translate_text(text: str, openai_api_key: str = None) -> str: # Added openai_api_key
    try:
        if openai_api_key:
            print("Using OpenAI for translation...")
            return translate_with_openai(text, openai_api_key, target_language="fr")
        else:
            print("Using Google Translator for translation...")
            chunks = chunk_text(text, max_length=512)
            translated_chunks = []
            for chunk in chunks:
                clean_chunk = chunk.strip()
                if not clean_chunk:
                    continue
                try:
                    translated = GoogleTranslator(source='auto', target='fr').translate(clean_chunk)
                    if not translated.strip():
                        raise ValueError("Empty translation")
                    translated_chunks.append(translated)
                except Exception as e:
                    print(f"Translation failed for chunk: {clean_chunk}. Using original text. Error: {e}")
                    translated_chunks.append(clean_chunk)
            return "\n".join(translated_chunks)
    except Exception as e:
        print(f"Translation process failed: {e}")
        return text

import re

def clean_translation(text: str) -> str:
    text = text.strip()

    # 1. If « ... » is present, extract only the content inside
    match = re.search(r'«\s*(.*?)\s*»', text, flags=re.DOTALL)
    if match:
        return match.group(1).strip()

    # 2. If not, remove known intro phrases (case-insensitive)
    intro_phrases = [
        r"^voici\s+(une\s+)?traduction(\s+possible)?\s*[:\-–]*",
        r"^traduction\s*[:\-–]*",
        r"^la\s+phrase\s+traduite\s+est\s*[:\-–]*",
        r"^version\s+traduite\s*[:\-–]*",
        r"^on\s+peut\s+traduire\s+cela\s+par\s*[:\-–]*"
    ]
    for pattern in intro_phrases:
        text = re.sub(pattern, "", text, flags=re.IGNORECASE).strip()

    # 3. Remove any remaining quotation marks (French, English, etc.)
    text = re.sub(r'^[«“"\']+\s*', '', text)
    text = re.sub(r'\s*[»”"\']+$', '', text)

    # 4. Remove redundant line breaks
    text = text.replace("\n", " ").replace("\r", "").strip()

    return text


In [6]:
def parse_transcript(transcript: str):
    sentence_groups = []
    current_group = []
    sentence_end_pattern = r'[.!?](?:\s|$)'
    base_segments = []
    for line in transcript.splitlines():
        line = re.sub(r'^Texte\s*:\s*', '', line)
        match = re.search(r'(\d+:\d+)\s*-\s*(\d+:\d+):\s*(.+)$', line)
        if match:
            start = convert_time(match.group(1))
            end = convert_time(match.group(2))
            text = match.group(3).strip()
            base_segments.append((start, end, text))
        else:
            print(f"Line skipped due to incorrect format: {line}")
    if not base_segments:
        raise ValueError("No valid timestamped segments found in the transcript.")
    for seg_start, seg_end, text in base_segments:
        current_group.append((seg_start, seg_end, text))
        if re.search(sentence_end_pattern, text):
            full_text = ' '.join(t for _, _, t in current_group)
            group_start = current_group[0][0]
            group_end = current_group[-1][1]
            sentence_groups.append((group_start, group_end, full_text))
            current_group = []
    if current_group:
        full_text = ' '.join(t for _, _, t in current_group)
        group_start = current_group[0][0]
        group_end = current_group[-1][1]
        sentence_groups.append((group_start, group_end, full_text))
    if not sentence_groups:
        raise ValueError("No valid sentence groups found in the transcript.")
    return sentence_groups

def convert_time(time_str: str) -> int:
    m, s = map(int, time_str.split(':'))
    return m * 60 + s

def convert_seconds_to_time(seconds: int) -> str:
    m, s = divmod(seconds, 60)
    return f"{m:02}:{s:02}"

async def generate_segment_audio(text: str, output_file: str, voice: str, rate: str):
    if not re.match(r"^[+-]?\d+(\.\d+)?%$", rate): # corrected regex
        rate = "-10%"
        print(f"Invalid rate format. Using default: {rate}")
    communicator = edge_tts.Communicate(text, voice, rate=rate)
    try: # added try and except
        await communicator.save(output_file)
    except Exception as e:
        print(f"Error in generate_segment_audio: {e}") # Log the error
        raise

def run_generate_audio_for_segment(text: str, output_file: str, voice: str, rate: str):
    nest_asyncio.apply()
    loop = asyncio.get_event_loop()
    try:
        loop.run_until_complete(generate_segment_audio(text, output_file, voice, rate))
    except Exception as e:
        print(f"Failed to generate audio for segment: {e}")

def generate_transcript(video_path: str) -> str:
    print("Generating transcript using Whisper...")
    model = whisper.load_model("base")
    result = model.transcribe(video_path)
    transcript_lines = []
    for segment in result["segments"]:
        start_min = int(segment["start"] // 60)
        start_sec = int(segment["start"] % 60)
        end_min = int(segment["end"] // 60)
        end_sec = int(segment["end"] % 60)
        text = segment["text"].strip().replace("\n", " ")
        transcript_lines.append(f"{start_min:01d}:{start_sec:02d} - {end_min:01d}:{end_sec:02d}: {text}")
    return "\n".join(transcript_lines)


In [7]:
def create_synchronized_audio(sentence_groups, voice: str, rate: str, progress_callback=None, translation_model=None, openai_api_key=None): # Added translation model and openai key
    from pydub import AudioSegment
    import shutil
    audio_segments = []
    debug_entries = []
    total_sentences = len(sentence_groups)
    if os.path.exists(SEGMENTS_DIR):
        shutil.rmtree(SEGMENTS_DIR)
    os.makedirs(SEGMENTS_DIR, exist_ok=True)
    total_video_duration = sentence_groups[-1][1] * 1000
    cumulative_excess = 0
    translated_segments = []

    for idx, (start, end, sentence) in enumerate(sentence_groups):
        segment_file = os.path.join(SEGMENTS_DIR, f"sentence_{idx}.mp3")
        if not sentence.strip():
            raise ValueError(f"Empty sentence in group {idx+1}")

        translated = sentence # Default
        # Translate each segment based on the user's selection:
        if translation_model == "OpenAI (Cloud)":
            if not openai_api_key:
                raise ValueError("OpenAI API key is required for translation with OpenAI.")
            translated = translate_with_openai(sentence, openai_api_key, target_language="fr")
        elif translation_model == "Ollama (Local)":
            translated = translate_with_ollama(sentence)
        else:
            translated = sentence

        if not translated.strip():
            print(f"Translation failed for sentence {idx+1}. Using original text.")
            translated = sentence
        translated_segments.append(translated)

        # Adjust the speaking rate if the translated text is significantly longer than the original.
        if len(translated) > len(sentence) * 1.2:
            adjusted_rate = f"{int(rate[:-1]) - 5}%"
            print(f"Adjusting speaking rate to {adjusted_rate} for segment {idx+1}.")
            run_generate_audio_for_segment(translated, segment_file, voice, adjusted_rate)
        else:
            run_generate_audio_for_segment(translated, segment_file, voice, rate)

        if not os.path.exists(segment_file) or os.path.getsize(segment_file) == 0:
            raise FileNotFoundError(f"Audio generation failed for sentence {idx+1}")
        try:
            segment_audio = AudioSegment.from_file(segment_file)
        except Exception as e:
            print(f"Pydub failed to load segment audio: {e}. File: {segment_file}")
            raise

        target_duration_ms = (end - start) * 1000
        current_duration = len(segment_audio)
        tolerance_ms = 200
        if abs(current_duration - target_duration_ms) > tolerance_ms:
            if current_duration < target_duration_ms:
                silence_duration = target_duration_ms - current_duration - cumulative_excess
                silence_duration = max(0, silence_duration)
                silence = AudioSegment.silent(duration=silence_duration)
                segment_audio += silence
                cumulative_excess = 0
            elif current_duration > target_duration_ms:
                segment_audio = segment_audio[:target_duration_ms]
                cumulative_excess += current_duration - target_duration_ms
        audio_segments.append(segment_audio)
        debug_entries.append(
            f"Segment {idx+1} (start: {start}s, end: {end}s):\n"
            f"**Original:** {sentence}\n"
            f"**Translated:** {translated}\n"
            f"**Target duration:** {target_duration_ms/1000:.2f}s, "
            f"**Audio duration:** {current_duration/1000:.2f}s, "
            f"**Cumulative excess:** {cumulative_excess/1000:.2f}s"
        )
        if progress_callback:
            progress = (idx + 1) / total_sentences * 80
            progress_callback(progress)

    final_audio = sum(audio_segments)
    final_duration = len(final_audio)
    tolerance_ms = 500 if total_video_duration <= 600000 else 100
    if final_duration < total_video_duration - tolerance_ms:
        silence = AudioSegment.silent(duration=total_video_duration - final_duration)
        final_audio += silence
    elif final_duration > total_video_duration + tolerance_ms:
        excess_duration = final_duration - total_video_duration
        print(f"Final audio exceeds total video duration by {excess_duration / 1000:.2f}s. Redistributing excess duration.")
        adjustment_ratio = excess_duration / len(audio_segments)
        adjusted_segments = []
        for segment in audio_segments:
            adjusted_duration = len(segment) - adjustment_ratio
            adjusted_segments.append(segment[:max(0, int(adjusted_duration))])
        final_audio = sum(adjusted_segments)
    final_audio_duration = len(final_audio)
    if abs(final_audio_duration - total_video_duration) > 100:
        if final_audio_duration < total_video_duration:
            silence = AudioSegment.silent(duration=total_video_duration - final_audio_duration)
            final_audio += silence
        elif final_audio_duration > total_video_duration:
            final_audio = final_audio[:total_video_duration]
    final_audio.export(FINAL_AUDIO_FILE, format="mp3")
    if not os.path.exists(FINAL_AUDIO_FILE):
        raise RuntimeError("Final audio file creation failed")
    if abs(len(AudioSegment.from_file(FINAL_AUDIO_FILE)) - total_video_duration) > 100:
        raise ValueError(f"Final audio duration mismatch: {len(final_audio)/1000:.1f}s vs video {total_video_duration/1000:.1f}s")
    debug_log_path = create_translation_log(debug_entries)
    if not debug_log_path:
        print("Debug log file could not be created.")
    print("Final synchronized audio generated!")
    return FINAL_AUDIO_FILE, debug_log_path


In [8]:
def save_transcript(transcript_text: str, filename: str = "transcript.txt"):
    with open(filename, "w", encoding="utf-8") as f:
        f.write(transcript_text)
    print(f"Transcript saved to {filename}")

def merge_audio_with_video(video_path: str, audio_path: str):
    try:
        print("Merging audio with video...")
        video = VideoFileClip(video_path)
        audio = AudioFileClip(audio_path)
        output_video_path = OUTPUT_VIDEO
        video.set_audio(audio).write_videofile(output_video_path, codec="libx264", audio_codec="aac")
        if not os.path.exists(output_video_path) or os.path.getsize(output_video_path) == 0:
            raise RuntimeError("Merged video file is missing or invalid.")
        return output_video_path
    except Exception as e:
        print(f"Failed to merge audio with video: {e}")
        raise

def translate_with_openai(text: str, api_key: str, target_language: str = "fr") -> str:
    try:
        openai.api_key = api_key
        prompt = f"""You are a professional translator specializing in ERP Cloud Fusion systems.
        Translate the following text into {target_language}, ensuring that technical terms 
        and user interface elements are accurately translated in the context of ERP Cloud Fusion.

        Only return the translated sentence without introductory phrases. 
        Do not add anything beyond the translation itself.

        Text: {text}
        """

        response = openai.ChatCompletion.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": "You are a professional translator."},
                {"role": "user", "content": prompt}
            ],
            max_tokens=2048,
            temperature=0.3
        )
        return response["choices"][0]["message"]["content"].strip()
    except Exception as e:
        print(f"OpenAI Translation failed: {e}")
        return text

def validate_transcript_format(transcript: str):
    for line in transcript.splitlines():
        line = re.sub(r'^Texte\s*:\s*', '', line)
        if not re.match(r'^\d+:\d+\s*-\s*\d+:\d+:\s*.+$', line):
            print(f"Invalid transcript line format: {line}")

#mistral:instruct
#aya

def translate_with_ollama(text: str, model: str = "7shi/llama-translate:8b-q4_K_M", output_file: str = "ollama_response.txt") -> str:
    try:
        response = ollama.generate(
            model=model,
            prompt = (
                f"Translate the following text into French, ensuring technical terms and UI elements "
                f"are accurately translated in the context of ERP Cloud Fusion.\n\n"
                f"Only return the translated sentence with no extra formatting or commentary.\n\n"
                f"Text: {text}"
            )
        )
        with open(output_file, "w", encoding="utf-8") as f:
            f.write("Ollama Response:\n")
            f.write(str(response))
        if "response" not in response:
            print("Unexpected Ollama response format.")
            return text
        translated_text = response["response"].strip()
        if not translated_text:
            print("Ollama returned an empty or invalid response.")
            return text
        return translated_text
    except Exception as e:
        print(f"Ollama Translation failed: {e}")
        return text


In [14]:
import re
import json
from typing import List, Tuple, Optional
from pydub import AudioSegment
from moviepy.editor import VideoFileClip, AudioFileClip
import shutil
import os
import openai
import ollama

# Constants (adjust as needed)
SEGMENTS_DIR = "audio_segments"
FINAL_AUDIO_FILE = "final_audio.mp3"
OUTPUT_VIDEO = "output_video.mp4"
DEFAULT_VOICE = "en-US-JennyNeural"
DEFAULT_RATE = "-10%"


def parse_manual_transcript_line(line: str) -> Optional[Tuple[float, float, str]]:
    """
    Parses a line from the manual transcript file, extracting start time,
    end time, and text. Handles potential errors in the time format.
    """
    time_pattern = r"(\d+):(\d+)(?::(\d+))?"  # Allow HH:MM or MM:SS or HH:MM:SS
    match = re.match(
        rf"^{time_pattern}\s*-\s*{time_pattern}:\s*(.+)$", line
    )
    if not match:
        print(f"Skipping invalid transcript line format: {line}")
        return None

    try:
        start_hours, start_minutes, start_seconds, end_hours, end_minutes, end_seconds, text = match.groups()
        start_seconds = int(start_seconds) if start_seconds else 0
        end_seconds = int(end_seconds) if end_seconds else 0
        start_time = (int(start_hours) * 3600 + int(start_minutes) * 60 + start_seconds)
        end_time = (int(end_hours) * 3600 + int(end_minutes) * 60 + end_seconds)
        return start_time, end_time, text.strip()
    except ValueError:
        print(f"Error parsing time values in line: {line}")
        return None
    except Exception as e:
        print(f"Unexpected error parsing transcript line: {e}, line: {line}")
        return None



async def generate_segment_audio(text: str, output_file: str, voice: str, rate: str):
    """Generates audio for a text segment using edge-tts."""
    if not re.match(r"^[+-]?\d+(\.\d+)?%$", rate):
        rate = "-10%"
        print(f"Invalid rate format. Using default: {rate}")
    communicator = edge_tts.Communicate(text, voice, rate=rate)
    try:
        await communicator.save(output_file)
        if os.path.getsize(output_file) == 0:  # Check for empty file
            raise Exception("Empty audio file generated")
    except Exception as e:
        print(f"Error generating audio for segment: {e} (Text: '{text}')")
        print(f"  Exception type: {type(e)}")
        print(f"  Exception args: {e.args}")
        raise  # Re-raise the exception to stop the main process


def get_ollama_response(text: str) -> str:
    """Extracts the translated text from an Ollama response."""
    try:
        response_dict = json.loads(text)
        if "response" in response_dict:
            return response_dict["response"].strip()
    except json.JSONDecodeError:
        pass
    match = re.search(r"Traduction:\s*(.+)", text, re.IGNORECASE)
    if match:
        return match.group(1).strip()
    match = re.search(r"response=\"(.+?)\"", text, re.IGNORECASE)
    if match:
        return match.group(1).strip()
    return text


def clean_translation(text: str) -> str:
    """Cleans up the translated text by removing extra phrases and whitespace."""
    text = text.strip()
    intro_phrases = [
        r"^voici\s+(une\s+)?traduction(\s+possible)?\s*[:\-–]*",
        r"^traduction\s*[:\-–]*",
        r"^la\s+phrase\s+traduite\s+est\s*[:\-–]*",
        r"^version\s+traduite\s*[:\-–]*",
        r"^on\s+peut\s+traduire\s+cela\s+par\s*[:\-–]*",
        r"^translate\s+the\s+following\s+text.*?:",
        r"^text\s*[:\-–]*",
        r"^ollama response:\s*",  # Remove "Ollama Response:"
    ]
    for pattern in intro_phrases:
        text = re.sub(pattern, "", text, flags=re.IGNORECASE).strip()
    text = text.replace("\n", " ").replace("\r", "")
    text = re.sub(r"\s+", " ", text).strip()
    return text



def create_synchronized_audio(
    sentence_groups: List[Tuple[float, float, str]],
    voice: str,
    rate: str,
    progress_callback: Optional[callable] = None,
    translation_model: str = "Google Translate",
    openai_api_key: Optional[str] = None,
    manual_transcript_path: Optional[str] = None,
    manual_translations_path: Optional[str] = None,
) -> Tuple[Optional[str], Optional[str]]:
    """
    Generates synchronized audio for a video, with optional manual transcript
    and translations, handling timing from the manual transcript.
    """

    audio_segments = []
    debug_entries = []
    total_sentences = len(sentence_groups)
    if os.path.exists(SEGMENTS_DIR):
        shutil.rmtree(SEGMENTS_DIR)
    os.makedirs(SEGMENTS_DIR, exist_ok=True)
    total_video_duration = sentence_groups[-1][1] * 1000 if sentence_groups else 0
    cumulative_excess = 0
    translated_segments = []

    # Load manual transcript with timing, if provided
    manual_transcript_with_timing: Optional[List[Tuple[float, float, str]]] = None
    if manual_transcript_path:
        try:
            manual_transcript_with_timing = []
            with open(manual_transcript_path, "r", encoding="utf-8") as f:
                for line in f:
                    parsed_line = parse_manual_transcript_line(line)
                    if parsed_line:
                        manual_transcript_with_timing.append(parsed_line)
            if not manual_transcript_with_timing:
                manual_transcript_with_timing = None

        except Exception as e:
            print(f"Error reading manual transcript: {e}. Translations will be auto-generated.")
            manual_transcript_with_timing = None

    # Load manual translations if provided
    manual_translations: Optional[List[str]] = None
    if manual_translations_path:
        try:
            with open(manual_translations_path, "r", encoding="utf-8") as f:
                manual_translations = [line.strip() for line in f.readlines()]
            if len(manual_translations) != total_sentences:
                print(
                    "WARNING: Number of translations in manual translations file does"
                    " not match the number of video segments. Translations will be "
                    "auto-generated."
                )
                manual_translations = None
        except Exception as e:
            print(
                f"Error reading manual translations: {e}. Translations will be "
                "auto-generated."
            )
            manual_translations = None

    # Main processing loop
    for idx, (video_start, video_end, video_sentence) in enumerate(sentence_groups):
        segment_file = os.path.join(SEGMENTS_DIR, f"sentence_{idx}.mp3")
        original_sentence = video_sentence  # Default to video transcript sentence
        translated = ""
        manual_entry = None

        # Find the closest matching manual transcript entry based on time
        if manual_transcript_with_timing:
            best_match_index = -1
            min_time_diff = float('inf')
            for i, (manual_start, manual_end, _) in enumerate(manual_transcript_with_timing):
                # Calculate the overlap between video segment and manual transcript entry
                overlap_start = max(video_start, manual_start)
                overlap_end = min(video_end, manual_end)
                overlap_duration = max(0, overlap_end - overlap_start)
                #if overlap_duration > 0: # Require some overlap
                time_diff_start = abs(video_start - manual_start)
                time_diff_end = abs(video_end - manual_end)
                time_diff = time_diff_start + time_diff_end

                if time_diff < min_time_diff:
                    min_time_diff = time_diff
                    best_match_index = i

            if best_match_index != -1:
                manual_entry = manual_transcript_with_timing[best_match_index]

        if manual_entry:
            manual_start, manual_end, manual_text = manual_entry
            original_sentence = manual_text
            # Use manual translation if available
            if manual_translations and best_match_index < len(manual_translations):
                translated = manual_translations[best_match_index]
                print(f"Using manual translation for segment {idx + 1}")
            else:
                translated = translate_text(original_sentence, translation_model, openai_api_key)
                if translation_model == "Ollama (Local)":
                    translated = get_ollama_response(translated)
                translated = clean_translation(translated)
            translated_segments.append(translated)
            target_duration_ms = (manual_end - manual_start) * 1000
            print(f"  Manual timing: {manual_start}, {manual_end}, duration: {target_duration_ms}")
        else:
            translated = translate_text(video_sentence, translation_model, openai_api_key)
            if translation_model == "Ollama (Local)":
                translated = get_ollama_response(translated)
            translated = clean_translation(translated)
            translated_segments.append(translated)
            target_duration_ms = (video_end - video_start) * 1000

        # Generate audio
        if len(translated) > len(original_sentence) * 1.2:
            adjusted_rate = f"{int(rate[:-1]) - 5}%"
            print(f"Adjusting speaking rate to {adjusted_rate} for segment {idx+1}.")
            run_generate_audio_for_segment(translated, segment_file, voice, adjusted_rate)
        else:
            run_generate_audio_for_segment(translated, segment_file, voice, rate)

        if not os.path.exists(segment_file) or os.path.getsize(segment_file) == 0:
            raise FileNotFoundError(f"Audio generation failed for sentence {idx+1}")
        try:
            segment_audio = AudioSegment.from_file(segment_file)
        except Exception as e:
            print(f"Pydub failed to load segment audio: {e}. File: {segment_file}")
            raise

        current_duration = len(segment_audio)
        tolerance_ms = 200

        if abs(current_duration - target_duration_ms) > tolerance_ms:
            if current_duration < target_duration_ms:
                silence_duration = target_duration_ms - current_duration - cumulative_excess
                silence_duration = max(0, silence_duration)
                silence = AudioSegment.silent(duration=silence_duration)
                segment_audio += silence
                cumulative_excess = 0
            elif current_duration > target_duration_ms:
                segment_audio = segment_audio[:target_duration_ms]
                cumulative_excess += current_duration - target_duration_ms

        audio_segments.append(segment_audio)
        debug_entries.append(
            f"Segment {idx+1} (video start: {video_start}s, video end: {video_end}s):\n"
            f"  Manual start/end/text: {manual_start:.2f} , {manual_end:.2f}, {original_sentence}\n"
            f"**Original:** {original_sentence}\n"
            f"**Translated:** {translated_segments[idx]}\n"
            f"**Target duration:** {target_duration_ms / 1000:.2f}s, "
            f"**Audio duration:** {current_duration / 1000:.2f}s, "
            f"**Cumulative excess:** {cumulative_excess / 1000:.2f}"
        )
        if progress_callback:
            progress = (idx + 1) / total_sentences * 80
            progress_callback(progress)

    if audio_segments:
        final_audio = sum(audio_segments)
        final_duration = len(final_audio)
        tolerance_ms = 500 if total_video_duration <= 600000 else 100
        if final_duration < total_video_duration - tolerance_ms:
            silence = AudioSegment.silent(duration=total_video_duration - final_duration)
            final_audio += silence
        elif final_duration > total_video_duration + tolerance_ms:
            excess_duration = final_duration - total_video_duration
            print(
                "Final audio exceeds total video duration by "
                f"{excess_duration / 1000:.2f}s. Redistributing excess duration."
            )
            adjustment_ratio = excess_duration / len(audio_segments)
            adjusted_segments = []
            for segment in audio_segments:
                adjusted_duration = len(segment) - adjustment_ratio
                adjusted_segments.append(segment[: max(0, int(adjusted_duration))])
            final_audio = sum(adjusted_segments)
        final_audio_duration = len(final_audio)
        if abs(final_audio_duration - total_video_duration) > 100:
            if final_audio_duration < total_video_duration:
                silence = AudioSegment.silent(
                    duration=total_video_duration - final_audio_duration
                )
                final_audio += silence
            elif final_audio_duration > total_video_duration:
                final_audio = final_audio[:total_video_duration]
        final_audio.export(FINAL_AUDIO_FILE, format="mp3")
        if not os.path.exists(FINAL_AUDIO_FILE):
            raise RuntimeError("Final audio file creation failed")
        if abs(len(AudioSegment.from_file(FINAL_AUDIO_FILE)) - total_video_duration) > 100:
            raise ValueError(
                f"Final audio duration mismatch: {len(final_audio)/1000:.1f}s vs video "
                f"{total_video_duration/1000:.1f}s"
            )
        debug_log_path = create_translation_log(debug_entries)
        if not debug_log_path:
            print("Debug log file could not be created.")
        print("Final synchronized audio generated!")
        return FINAL_AUDIO_FILE, debug_log_path
    else:
        print("No audio segments were generated.")
        return None, None



def save_transcript(transcript_text: str, filename: str = "transcript.txt"):
    """Saves the transcript text to a file."""
    with open(filename, "w", encoding="utf-8") as f:
        f.write(transcript_text)
    print(f"Transcript saved to {filename}")


def merge_audio_with_video(video_path: str, audio_path: str):
    """Merges the generated audio with the video file."""
    try:
        print("Merging audio with video...")
        video = VideoFileClip(video_path)
        audio = AudioFileClip(audio_path)
        output_video_path = OUTPUT_VIDEO
        video = video.set_audio(audio)
        video.write_videofile(output_video_path, codec="libx264", audio_codec="aac")
        video.close()
        audio.close()
        if not os.path.exists(output_video_path) or os.path.getsize(output_video_path) == 0:
            raise RuntimeError("Merged video file is missing or invalid.")
        return output_video_path
    except Exception as e:
        print(f"Failed to merge audio with video: {e}")
        raise



def translate_with_openai(text: str, api_key: str, target_language: str = "fr") -> str:
    """Translates text using OpenAI's GPT-4."""
    try:
        openai.api_key = api_key
        prompt = f"""You are a professional translator specializing in ERP Cloud Fusion systems.
        Translate the following text into {target_language}, ensuring that technical terms
        and user interface elements are accurately translated in the context of ERP Cloud Fusion.

        Only return the translated sentence without introductory phrases.
        Do not add anything beyond the translation itself.

        Text: {text}
        """
        response = openai.ChatCompletion.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": "You are a professional translator."},
                {"role": "user", "content": prompt},
            ],
            max_tokens=2048,
            temperature=0.3,
        )
        return response["choices"][0]["message"]["content"].strip()
    except Exception as e:
        print(f"OpenAI Translation failed: {e}")
        return text



def validate_transcript_format(transcript: str):
    """Validates the format of the input transcript."""
    for line in transcript.splitlines():
        line = re.sub(r"^Texte\s*:\s*", "", line)
        if not re.match(r"^\d+:\d+\s*-\s*\d+:\d+:\s*.+$", line):
            print(f"Invalid transcript line format: {line}")



def translate_with_ollamaOLD(
    text: str, model: str = "7shi/llama-translate:8b-q4_K_M", output_file: str = "ollama_response.txt"
) -> str:
    """Translates text using a local Ollama model."""
    try:
        response = ollama.generate(
            model=model,
            prompt=(
                "Translate the following text into French, ensuring technical terms and UI elements "
                "are accurately translated in the context of ERP Cloud Fusion.\n\n"
                "Only return the translated sentence with no extra formatting or commentary.\n\n"
                "Text: {text}"
            ),
        )
        with open(output_file, "w", encoding="utf-8") as f:
            f.write("Ollama Response:\n")
            f.write(str(response))
        if "response" not in response:
            print("Unexpected Ollama response format.")
            return text
        translated_text = response["response"].strip()
        if not translated_text:
            print("Ollama returned an empty or invalid response.")
            return text
        return translated_text
    except Exception as e:
        print(f"Ollama Translation failed: {e}")
        return text




def translate_with_ollama(
    text: str, model: str = "7shi/llama-translate:8b-q4_K_M"
) -> str:
    """Translates text using a local Ollama model and extracts the translated text."""
    try:
        response = ollama.generate(
            model=model,
            prompt=(
                "Translate the following text into French, ensuring technical terms and UI elements "
                "are accurately translated in the context of ERP Cloud Fusion.\n\n"
                "Only return the translated sentence with no extra formatting or commentary.\n\n"
                "Text: {text}"
            ),
        )

        # Extract translated text from Ollama response
        try:
            response_dict = json.loads(response)
            if "response" in response_dict:
                translated_text = response_dict["response"].strip()
                return translated_text
        except json.JSONDecodeError:
            pass  # If it's not JSON, proceed with regex

        patterns = [
            r"Texte\s*:\s*(.+)",  # "Texte : "
            r"Traduction\s*:\s*(.+)",  # "Traduction: "
            r"response=\"(.+?)\"",  # response="text"
            r"^(.+)$",             # If no specific prefix, take the whole string
        ]
        for pattern in patterns:
            match = re.search(pattern, response, re.IGNORECASE)
            if match:
                translated_text = match.group(1).strip()
                return translated_text

        return response.strip() # Fallback: return the whole response

    except Exception as e:
        print(f"Ollama Translation failed: {e}")
        return text
    
def generate_transcript(video_path: str) -> str:
    print("Extracting audio from video...")
    try:
        video_clip = VideoFileClip(video_path)
        audio_clip = video_clip.audio
        audio_temp_file = "temp_audio.wav"
        audio_clip.write_audiofile(audio_temp_file)
        audio_clip.close()
        video_clip.close()
        print("Transcribing audio content...")
        model = whisper.load_model("base")
        result = model.transcribe(audio_temp_file)
        os.remove(audio_temp_file)
        transcript_lines = []
        for segment in result["segments"]:
            start_min = int(segment["start"] // 60)
            start_sec = int(segment["start"] % 60)
            end_min = int(segment["end"] // 60)
            end_sec = int(segment["end"] % 60)
            text = segment["text"].strip().replace("\n", " ")
            transcript_lines.append(f"{start_min:01d}:{start_sec:02d} - {end_min:01d}:{end_sec:02d}: {text}")
        return "\n".join(transcript_lines)
    except Exception as e:
        print(f"Error during transcript generation: {e}")
        return ""


In [None]:
# 1. Load a video file (replace with your video file)
video_file =  "4.2.4_Configuration de la solution_Avr_10_Latest.mp4"  # CHANGE THIS

# 2. Generate the transcript
transcript_text = generate_transcript(video_file)
print("Original Transcript:\n", transcript_text)

# 3. Parse the transcript
sentence_groups = parse_transcript(transcript_text)

# 4. Choose translation model and set API key if needed
translation_model = "Ollama (Local)"  # Or "OpenAI (Cloud)" or "Google Translate" or "Ollama (Local)"
openai_api_key = os.getenv("OPENAI_API_KEY")  # Or your OpenAI API key

# 5.  Specify paths for manual transcript and translations (optional)
manual_transcript_path = "4.2.4 _ Configuration de la solutio-ORIGINAL.txt"  # Optional: Path to manual transcript
manual_translations_path = "4.2.4 _ Configuration de la solutio-TRANSLATED_latest.txt"  # Optional: Path to manual translations

# 6. Generate the translated audio and merge with video
audio_file, debug_log_path = create_synchronized_audio(
    sentence_groups,
    DEFAULT_VOICE,
    DEFAULT_RATE,
    translation_model=translation_model,
    openai_api_key=openai_api_key,
    manual_transcript_path=manual_transcript_path,  # Pass the manual transcript path
    manual_translations_path=manual_translations_path,  # Pass the manual translations path
)
print("Audio file:", audio_file)
print("Debug log:", debug_log_path)
output_video_file = merge_audio_with_video(video_file, audio_file)
print("Translated video file:", output_video_file)

Extracting audio from video...


chunk:  33%|███▎      | 7890/23563 [14:28<00:02, 5756.76it/s, now=None]

MoviePy - Writing audio in temp_audio.wav


chunk:  33%|███▎      | 7890/23563 [14:31<00:02, 5756.76it/s, now=None]

MoviePy - Done.
Transcribing audio content...
Error during transcript generation: [WinError 32] Le processus ne peut pas accéder au fichier car ce fichier est utilisé par un autre processus: 'temp_audio.wav'
Original Transcript:
 


ValueError: No valid timestamped segments found in the transcript.

In [6]:
import re
import json
from typing import List, Tuple, Optional
from pydub import AudioSegment
from moviepy.editor import VideoFileClip, AudioFileClip
import shutil
import os
import openai
import ollama

# Constants (adjust as needed)
SEGMENTS_DIR = "audio_segments"
FINAL_AUDIO_FILE = "final_audio.mp3"
OUTPUT_VIDEO = "output_video.mp4"
DEFAULT_VOICE = "fr-CA-SylvieNeural"
DEFAULT_RATE = "-10%"


def parse_manual_transcript_line(line: str) -> Optional[Tuple[float, float, str]]:
    """
    Parses a line from the manual transcript file, extracting start time,
    end time, and text.  Handles potential errors in the time format.
    """
    time_pattern = r"(\d+):(\d+)(?::(\d+))?"  # Allow HH:MM or MM:SS or HH:MM:SS
    match = re.match(
        rf"^{time_pattern}\s*-\s*{time_pattern}:\s*(.+)$", line
    )
    if not match:
        print(f"Skipping invalid transcript line: {line}")
        return None

    try:
        start_hours, start_minutes, start_seconds, end_hours, end_minutes, end_seconds, text = match.groups()
        start_seconds = int(start_seconds) if start_seconds else 0
        end_seconds = int(end_seconds) if end_seconds else 0
        start_time = (int(start_hours) * 3600 + int(start_minutes) * 60 + start_seconds)
        end_time = (int(end_hours) * 3600 + int(end_minutes) * 60 + end_seconds)
        return start_time, end_time, text.strip()
    except ValueError:
        print(f"Error parsing time values in line: {line}")
        return None
    except Exception as e:
        print(f"Unexpected error parsing transcript line: {e}, line: {line}")
        return None



async def generate_segment_audio(text: str, output_file: str, voice: str, rate: str):
    """Generates audio for a text segment using edge-tts."""
    if not re.match(r"^[+-]?\d+(\.\d+)?%$", rate):
        rate = "-10%"
        print(f"Invalid rate format. Using default: {rate}")
    communicator = edge_tts.Communicate(text, voice, rate=rate)
    try:
        await communicator.save(output_file)
        if os.path.getsize(output_file) == 0:  # Check for empty file
            raise Exception("Empty audio file generated")
    except Exception as e:
        print(f"Error generating audio for segment: {e} (Text: '{text}')")
        print(f"  Exception type: {type(e)}")
        print(f"  Exception args: {e.args}")
        raise  # Re-raise the exception to stop the main process


def get_ollama_response(text: str) -> str:
    """Extracts the translated text from an Ollama response."""
    try:
        response_dict = json.loads(text)
        if "response" in response_dict:
            return response_dict["response"].strip()
    except json.JSONDecodeError:
        pass
    match = re.search(r"Traduction:\s*(.+)", text, re.IGNORECASE)
    if match:
        return match.group(1).strip()
    match = re.search(r"response=\"(.+?)\"", text, re.IGNORECASE)
    if match:
        return match.group(1).strip()
    return text


def clean_translation(text: str) -> str:
    """Cleans up the translated text by removing extra phrases and whitespace."""
    text = text.strip()
    intro_phrases = [
        r"^voici\s+(une\s+)?traduction(\s+possible)?\s*[:\-–]*",
        r"^traduction\s*[:\-–]*",
        r"^la\s+phrase\s+traduite\s+est\s*[:\-–]*",
        r"^version\s+traduite\s*[:\-–]*",
        r"^on\s+peut\s+traduire\s+cela\s+par\s*[:\-–]*",
        r"^translate\s+the\s+following\s+text.*?:",
        r"^text\s*[:\-–]*",
        r"^ollama response:\s*",  # Remove "Ollama Response:"
    ]
    for pattern in intro_phrases:
        text = re.sub(pattern, "", text, flags=re.IGNORECASE).strip()
    text = text.replace("\n", " ").replace("\r", "")
    text = re.sub(r"\s+", " ", text).strip()
    return text



def create_synchronized_audio(
    sentence_groups: List[Tuple[float, float, str]],
    voice: str,
    rate: str,
    progress_callback: Optional[callable] = None,
    translation_model: str = "Google Translate",
    openai_api_key: Optional[str] = None,
    manual_transcript_path: Optional[str] = None,
    manual_translations_path: Optional[str] = None,
) -> Tuple[Optional[str], Optional[str]]:
    """
    Generates synchronized audio for a video, with optional manual transcript
    and translations, handling timing from the manual transcript.
    """

    audio_segments = []
    debug_entries = []
    total_sentences = len(sentence_groups)
    if os.path.exists(SEGMENTS_DIR):
        shutil.rmtree(SEGMENTS_DIR)
    os.makedirs(SEGMENTS_DIR, exist_ok=True)
    total_video_duration = sentence_groups[-1][1] * 1000 if sentence_groups else 0
    cumulative_excess = 0
    translated_segments = []

    # Load manual transcript with timing, if provided
    manual_transcript_with_timing: Optional[List[Tuple[float, float, str]]] = None
    if manual_transcript_path:
        try:
            manual_transcript_with_timing = []
            with open(manual_transcript_path, "r", encoding="utf-8") as f:
                for line in f:
                    parsed_line = parse_manual_transcript_line(line)
                    if parsed_line:
                        manual_transcript_with_timing.append(parsed_line)
            if not manual_transcript_with_timing:
                manual_transcript_with_timing = None

        except Exception as e:
            print(f"Error reading manual transcript: {e}. Translations will be auto-generated.")
            manual_transcript_with_timing = None

    # Load manual translations if provided
    manual_translations: Optional[List[str]] = None
    if manual_translations_path:
        try:
            with open(manual_translations_path, "r", encoding="utf-8") as f:
                manual_translations = [line.strip() for line in f.readlines()]
            if len(manual_translations) != total_sentences:
                print(
                    "WARNING: Number of translations in manual translations file does"
                    " not match the number of video segments. Translations will be "
                    "auto-generated."
                )
                manual_translations = None
        except Exception as e:
            print(
                f"Error reading manual translations: {e}. Translations will be "
                "auto-generated."
            )
            manual_translations = None

    # Main processing loop
    for idx, (video_start, video_end, video_sentence) in enumerate(sentence_groups):
        segment_file = os.path.join(SEGMENTS_DIR, f"sentence_{idx}.mp3")
        original_sentence = video_sentence  # Default to video transcript sentence
        translated = ""

        if manual_transcript_with_timing and idx < len(manual_transcript_with_timing):
            manual_start, manual_end, manual_text = manual_transcript_with_timing[idx]
            original_sentence = manual_text

            # Use manual translation if available
            if manual_translations and idx < len(manual_translations):
                translated = manual_translations[idx]
                print(f"Using manual translation for segment {idx + 1}")
            else:
                translated = translate_text(original_sentence, translation_model, openai_api_key)
                if translation_model == "Ollama (Local)":
                    translated = get_ollama_response(translated)
                translated = clean_translation(translated)
            translated_segments.append(translated)

            # Adjust audio timing to match manual transcript timing
            target_duration_ms = (manual_end - manual_start) * 1000
            print(f"  Manual timing: {manual_start}, {manual_end}, duration: {target_duration_ms}")

        else:  # If no manual transcript entry, use original video timing and translation
            translated = translate_text(video_sentence, translation_model, openai_api_key)
            if translation_model == "Ollama (Local)":
                translated = get_ollama_response(translated)
            translated = clean_translation(translated)
            translated_segments.append(translated)
            target_duration_ms = (video_end - video_start) * 1000

        # Generate audio
        if len(translated) > len(original_sentence) * 1.2:
            adjusted_rate = f"{int(rate[:-1]) - 5}%"
            print(f"Adjusting speaking rate to {adjusted_rate} for segment {idx+1}.")
            run_generate_audio_for_segment(translated, segment_file, voice, adjusted_rate)
        else:
            run_generate_audio_for_segment(translated, segment_file, voice, rate)

        if not os.path.exists(segment_file) or os.path.getsize(segment_file) == 0:
            raise FileNotFoundError(f"Audio generation failed for sentence {idx+1}")
        try:
            segment_audio = AudioSegment.from_file(segment_file)
        except Exception as e:
            print(f"Pydub failed to load segment audio: {e}. File: {segment_file}")
            raise

        current_duration = len(segment_audio)
        tolerance_ms = 200

        if abs(current_duration - target_duration_ms) > tolerance_ms:
            if current_duration < target_duration_ms:
                silence_duration = target_duration_ms - current_duration - cumulative_excess
                silence_duration = max(0, silence_duration)
                silence = AudioSegment.silent(duration=silence_duration)
                segment_audio += silence
                cumulative_excess = 0
            elif current_duration > target_duration_ms:
                segment_audio = segment_audio[:target_duration_ms]
                cumulative_excess += current_duration - target_duration_ms

        audio_segments.append(segment_audio)
        debug_entries.append(
            f"Segment {idx+1} (video start: {video_start}s, video end: {video_end}s):\n"
            f"  Manual start/end/text: {manual_start:.2f} , {manual_end:.2f}, {original_sentence}\n"
            f"**Original:** {original_sentence}\n"
            f"**Translated:** {translated_segments[idx]}\n"
            f"**Target duration:** {target_duration_ms / 1000:.2f}s, "
            f"**Audio duration:** {current_duration / 1000:.2f}s, "
            f"**Cumulative excess:** {cumulative_excess / 1000:.2f}"
        )
        if progress_callback:
            progress = (idx + 1) / total_sentences * 80
            progress_callback(progress)

    if audio_segments:
        final_audio = sum(audio_segments)
        final_duration = len(final_audio)
        tolerance_ms = 500 if total_video_duration <= 600000 else 100
        if final_duration < total_video_duration - tolerance_ms:
            silence = AudioSegment.silent(duration=total_video_duration - final_duration)
            final_audio += silence
        elif final_duration > total_video_duration + tolerance_ms:
            excess_duration = final_duration - total_video_duration
            print(
                "Final audio exceeds total video duration by "
                f"{excess_duration / 1000:.2f}s. Redistributing excess duration."
            )
            adjustment_ratio = excess_duration / len(audio_segments)
            adjusted_segments = []
            for segment in audio_segments:
                adjusted_duration = len(segment) - adjustment_ratio
                adjusted_segments.append(segment[: max(0, int(adjusted_duration))])
            final_audio = sum(adjusted_segments)
        final_audio_duration = len(final_audio)
        if abs(final_audio_duration - total_video_duration) > 100:
            if final_audio_duration < total_video_duration:
                silence = AudioSegment.silent(
                    duration=total_video_duration - final_audio_duration
                )
                final_audio += silence
            elif final_audio_duration > total_video_duration:
                final_audio = final_audio[:total_video_duration]
        final_audio.export(FINAL_AUDIO_FILE, format="mp3")
        if not os.path.exists(FINAL_AUDIO_FILE):
            raise RuntimeError("Final audio file creation failed")
        if abs(len(AudioSegment.from_file(FINAL_AUDIO_FILE)) - total_video_duration) > 100:
            raise ValueError(
                f"Final audio duration mismatch: {len(final_audio)/1000:.1f}s vs video "
                f"{total_video_duration/1000:.1f}s"
            )
        debug_log_path = create_translation_log(debug_entries)
        if not debug_log_path:
            print("Debug log file could not be created.")
        print("Final synchronized audio generated!")
        return FINAL_AUDIO_FILE, debug_log_path
    else:
        print("No audio segments were generated.")
        return None, None



def save_transcript(transcript_text: str, filename: str = "transcript.txt"):
    """Saves the transcript text to a file."""
    with open(filename, "w", encoding="utf-8") as f:
        f.write(transcript_text)
    print(f"Transcript saved to {filename}")


def merge_audio_with_video(video_path: str, audio_path: str):
    """Merges the generated audio with the video file."""
    try:
        print("Merging audio with video...")
        video = VideoFileClip(video_path)
        audio = AudioFileClip(audio_path)
        output_video_path = OUTPUT_VIDEO
        video = video.set_audio(audio)
        video.write_videofile(output_video_path, codec="libx264", audio_codec="aac")
        video.close()
        audio.close()
        if not os.path.exists(output_video_path) or os.path.getsize(output_video_path) == 0:
            raise RuntimeError("Merged video file is missing or invalid.")
        return output_video_path
    except Exception as e:
        print(f"Failed to merge audio with video: {e}")
        raise



def translate_with_openai(text: str, api_key: str, target_language: str = "fr") -> str:
    """Translates text using OpenAI's GPT-4."""
    try:
        openai.api_key = api_key
        prompt = f"""You are a professional translator specializing in ERP Cloud Fusion systems.
        Translate the following text into {target_language}, ensuring that technical terms
        and user interface elements are accurately translated in the context of ERP Cloud Fusion.

        Only return the translated sentence without introductory phrases.
        Do not add anything beyond the translation itself.

        Text: {text}
        """
        response = openai.ChatCompletion.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": "You are a professional translator."},
                {"role": "user", "content": prompt},
            ],
            max_tokens=2048,
            temperature=0.3,
        )
        return response["choices"][0]["message"]["content"].strip()
    except Exception as e:
        print(f"OpenAI Translation failed: {e}")
        return text



def validate_transcript_format(transcript: str):
    """Validates the format of the input transcript."""
    for line in transcript.splitlines():
        line = re.sub(r"^Texte\s*:\s*", "", line)
        if not re.match(r"^\d+:\d+\s*-\s*\d+:\d+:\s*.+$", line):
            print(f"Invalid transcript line format: {line}")



def translate_with_ollama(
    text: str, model: str = "7shi/llama-translate:8b-q4_K_M", output_file: str = "ollama_response.txt"
) -> str:
    """Translates text using a local Ollama model."""
    try:
        response = ollama.generate(
            model=model,
            prompt=(
                "Translate the following text into French, ensuring technical terms and UI elements "
                "are accurately translated in the context of ERP Cloud Fusion.\n\n"
                "Only return the translated sentence with no extra formatting or commentary.\n\n"
                "Text: {text}"
            ),
        )
        with open(output_file, "w", encoding="utf-8") as f:
            f.write("Ollama Response:\n")
            f.write(str(response))
        if "response" not in response:
            print("Unexpected Ollama response format.")
            return text
        translated_text = response["response"].strip()
        if not translated_text:
            print("Ollama returned an empty or invalid response.")
            return text
        return translated_text
    except Exception as e:
        print(f"Ollama Translation failed: {e}")
        return text

FINAL HELPERS

In [4]:
import os
import re
import asyncio
import nest_asyncio
import edge_tts
import whisper
from shutil import which
from pydub import AudioSegment
from moviepy.video.io.VideoFileClip import VideoFileClip
from moviepy.audio.io.AudioFileClip import AudioFileClip
from tempfile import NamedTemporaryFile
from deep_translator import GoogleTranslator
import ollama
from datetime import datetime
import glob
import shutil
import gc
import openai
import time # Import time

# --- Configuration ---
ffmpeg_path = which("ffmpeg")
if not ffmpeg_path:
    raise RuntimeError("ffmpeg not found. Please install ffmpeg first.")
print(f"✅ ffmpeg found at: {ffmpeg_path}")

SEGMENTS_DIR = "segments_temp"
os.makedirs(SEGMENTS_DIR, exist_ok=True)

VOICE_CHOICES = ["fr-CA-SylvieNeural", "fr-FR-DeniseNeural", "fr-CA-CHantalNeural"]
DEFAULT_VOICE = VOICE_CHOICES[0]
DEFAULT_RATE = "-10%"
OUTPUT_VIDEO = "translated_video.mp4"
FINAL_AUDIO_FILE = "final_voice.mp3"





import re
import json

async def generate_segment_audio(text: str, output_file: str, voice: str, rate: str):
    if not re.match(r"^[+-]?\d+(\.\d+)?%$", rate):
        rate = "-10%"
        print(f"Invalid rate format. Using default: {rate}")
    communicator = edge_tts.Communicate(text, voice, rate=rate)
    try:
        await communicator.save(output_file)
        if os.path.getsize(output_file) == 0:  # Check for empty file
            raise Exception("Empty audio file generated")
    except Exception as e:
        print(f"Error generating audio for segment: {e} (Text: '{text}')")
        print(f"  Exception type: {type(e)}")  # Print exception type
        print(f"  Exception args: {e.args}")    # Print exception arguments
        raise  # Re-raise the exception to stop the main process

def get_ollama_response(text: str) -> str:
    """
    Extracts the translated text from an Ollama response.  Handles variations
    in the response format.
    """
    # Try to load the text as a JSON object
    try:
        response_dict = json.loads(text)
        if "response" in response_dict:
            return response_dict["response"].strip()
    except json.JSONDecodeError:
        pass  # If it's not valid JSON, continue to regex parsing

    # Fallback to regex parsing (more robust)
    match = re.search(r"Traduction:\s*(.+)", text, re.IGNORECASE)
    if match:
        return match.group(1).strip()

    match = re.search(r"response=\"(.+?)\"", text, re.IGNORECASE)
    if match:
        return match.group(1).strip()

    return text  # Return original if no pattern found

def clean_translation(text: str) -> str:
    text = text.strip()
    intro_phrases = [
        r"^voici\s+(une\s+)?traduction(\s+possible)?\s*[:\-–]*",
        r"^traduction\s*[:\-–]*",
        r"^la\s+phrase\s+traduite\s+est\s*[:\-–]*",
        r"^version\s+traduite\s*[:\-–]*",
        r"^on\s+peut\s+traduire\s+cela\s+par\s*[:\-–]*",
        r"^translate\s+the\s+following\s+text.*?:",
        r"^text\s*[:\-–]*",
        r"^ollama response:\s*",  # Remove "Ollama Response:"
    ]
    for pattern in intro_phrases:
        text = re.sub(pattern, "", text, flags=re.IGNORECASE).strip()
    text = text.replace("\n", " ").replace("\r", "")
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def create_synchronized_audioOld(sentence_groups, voice, rate, progress_callback=None, translation_model="Google Translate", openai_api_key=None):
    from pydub import AudioSegment
    import shutil
    audio_segments = []
    debug_entries = []
    total_sentences = len(sentence_groups)
    if os.path.exists(SEGMENTS_DIR):
        shutil.rmtree(SEGMENTS_DIR)
    os.makedirs(SEGMENTS_DIR, exist_ok=True)
    total_video_duration = sentence_groups[-1][1] * 1000 if sentence_groups else 0
    cumulative_excess = 0
    translated_segments = []

    for idx, (start, end, sentence) in enumerate(sentence_groups):
        segment_file = os.path.join(SEGMENTS_DIR, f"sentence_{idx}.mp3")
        if not sentence.strip():
            raise ValueError(f"Empty sentence in group {idx+1}")

        translated = translate_text(sentence, translation_model, openai_api_key)
        if translation_model == "Ollama (Local)":
            translated = get_ollama_response(translated)
        translated = clean_translation(translated)
        translated_segments.append(translated) # Keep track of the cleaned translations

        # Adjust the speaking rate if the translated text is significantly longer than the original.
        if len(translated) > len(sentence) * 1.2:
            adjusted_rate = f"{int(rate[:-1]) - 5}%"
            print(f"Adjusting speaking rate to {adjusted_rate} for segment {idx+1}.")
            run_generate_audio_for_segment(translated, segment_file, voice, adjusted_rate)
        else:
            run_generate_audio_for_segment(translated, segment_file, voice, rate)

        if not os.path.exists(segment_file) or os.path.getsize(segment_file) == 0:
            raise FileNotFoundError(f"Audio generation failed for sentence {idx+1}")
        try:
            segment_audio = AudioSegment.from_file(segment_file)
        except Exception as e:
            print(f"Pydub failed to load segment audio: {e}. File: {segment_file}")
            raise

        target_duration_ms = (end - start) * 1000
        current_duration = len(segment_audio)
        tolerance_ms = 200
        if abs(current_duration - target_duration_ms) > tolerance_ms:
            if current_duration < target_duration_ms:
                silence_duration = target_duration_ms - current_duration - cumulative_excess
                silence_duration = max(0, silence_duration)
                silence = AudioSegment.silent(duration=silence_duration)
                segment_audio += silence
                cumulative_excess = 0
            elif current_duration > target_duration_ms:
                segment_audio = segment_audio[:target_duration_ms]
                cumulative_excess += current_duration - target_duration_ms
        audio_segments.append(segment_audio)
        debug_entries.append(
            f"Segment {idx+1} (start: {start}s, end: {end}s):\n"
            f"**Original:** {sentence}\n"
            f"**Translated:** {translated_segments[idx]}\n" # Use the cleaned translation
            f"**Target duration:** {target_duration_ms/1000:.2f}s, "
            f"**Audio duration:** {current_duration/1000:.2f}s, "
            f"**Cumulative excess:** {cumulative_excess/1000:.2f}"
        )
        if progress_callback:
            progress = (idx + 1) / total_sentences * 80
            progress_callback(progress)

    if audio_segments:
        final_audio = sum(audio_segments)
        final_duration = len(final_audio)
        tolerance_ms = 500 if total_video_duration <= 600000 else 100
        if final_duration < total_video_duration - tolerance_ms:
            silence = AudioSegment.silent(duration=total_video_duration - final_duration)
            final_audio += silence
        elif final_duration > total_video_duration + tolerance_ms:
            excess_duration = final_duration - total_video_duration
            print(f"Final audio exceeds total video duration by {excess_duration / 1000:.2f}s. Redistributing excess duration.")
            adjustment_ratio = excess_duration / len(audio_segments)
            adjusted_segments = []
            for segment in audio_segments:
                adjusted_duration = len(segment) - adjustment_ratio
                adjusted_segments.append(segment[:max(0, int(adjusted_duration))])
            final_audio = sum(adjusted_segments)
        final_audio_duration = len(final_audio)
        if abs(final_audio_duration - total_video_duration) > 100:
            if final_audio_duration < total_video_duration:
                silence = AudioSegment.silent(duration=total_video_duration - final_audio_duration)
                final_audio += silence
            elif final_audio_duration > total_video_duration:
                final_audio = final_audio[:total_video_duration]
        final_audio.export(FINAL_AUDIO_FILE, format="mp3")
        if not os.path.exists(FINAL_AUDIO_FILE):
            raise RuntimeError("Final audio file creation failed")
        if abs(len(AudioSegment.from_file(FINAL_AUDIO_FILE)) - total_video_duration) > 100:
            raise ValueError(f"Final audio duration mismatch: {len(final_audio)/1000:.1f}s vs video {total_video_duration/1000:.1f}s")
        debug_log_path = create_translation_log(debug_entries)
        if not debug_log_path:
            print("Debug log file could not be created.")
        print("Final synchronized audio generated!")
        return FINAL_AUDIO_FILE, debug_log_path
    else:
        print("No audio segments were generated.")
        return None, None

from typing import List, Tuple, Optional


def create_synchronized_audio(
    sentence_groups: List[Tuple[float, float, str]],
    voice: str,
    rate: str,
    progress_callback: Optional[callable] = None,
    translation_model: str = "Google Translate",
    openai_api_key: Optional[str] = None,
    manual_transcript_path: Optional[str] = None,  # Added for manual transcript
    manual_translations_path: Optional[str] = None,  # Added for manual translations
) -> Tuple[Optional[str], Optional[str]]:
    """
    Generates synchronized audio for a video, with optional manual transcript and translations.

    Args:
        sentence_groups: A list of tuples, where each tuple contains the start time,
                         end time, and sentence text for a segment.
        voice: The voice to use for speech synthesis.
        rate: The speaking rate.
        progress_callback: An optional callback function to track progress.
        translation_model: The translation model to use ("Google Translate" or "Ollama (Local)").
        openai_api_key: The API key for OpenAI (if using OpenAI).
        manual_transcript_path: Path to a text file containing the manual transcript.
        manual_translations_path: Path to a text file containing manual translations
                                  (one translation per line, corresponding to sentences).

    Returns:
        A tuple containing the path to the final audio file and the path to the debug log.
    """
    from pydub import AudioSegment
    import shutil

    audio_segments = []
    debug_entries = []
    total_sentences = len(sentence_groups)
    if os.path.exists(SEGMENTS_DIR):
        shutil.rmtree(SEGMENTS_DIR)
    os.makedirs(SEGMENTS_DIR, exist_ok=True)
    total_video_duration = sentence_groups[-1][1] * 1000 if sentence_groups else 0
    cumulative_excess = 0
    translated_segments = []

    # Load manual transcript if provided
    manual_transcript: Optional[List[str]] = None
    if manual_transcript_path:
        try:
            with open(manual_transcript_path, "r", encoding="utf-8") as f:
                manual_transcript = [line.strip() for line in f.readlines()]
            if len(manual_transcript) != total_sentences:
                print(
                    "WARNING: Number of sentences in manual transcript does not match"
                    " the number of video segments. Translations will be auto-generated."
                )
                manual_transcript = None  # Ignore the manual transcript
        except Exception as e:
            print(f"Error reading manual transcript: {e}. Translations will be auto-generated.")
            manual_transcript = None

    # Load manual translations if provided
    manual_translations: Optional[List[str]] = None
    if manual_translations_path:
        try:
            with open(manual_translations_path, "r", encoding="utf-8") as f:
                manual_translations = [line.strip() for line in f.readlines()]
            if len(manual_translations) != total_sentences:
                print(
                    "WARNING: Number of translations in manual translations file does"
                    " not match the number of video segments. Translations will be "
                    "auto-generated."
                )
                manual_translations = None  # Ignore manual translations
        except Exception as e:
            print(
                f"Error reading manual translations: {e}. Translations will be "
                "auto-generated."
            )
            manual_translations = None

    for idx, (start, end, sentence) in enumerate(sentence_groups):
        segment_file = os.path.join(SEGMENTS_DIR, f"sentence_{idx}.mp3")
        if not sentence.strip():
            raise ValueError(f"Empty sentence in group {idx+1}")

        # Use manual translation if provided, otherwise, translate automatically
        if manual_translations and idx < len(manual_translations):
            translated = manual_translations[idx]
            print(f"Using manual translation for segment {idx + 1}")
        else:
            translated = translate_text(sentence, translation_model, openai_api_key)
            if translation_model == "Ollama (Local)":
                translated = get_ollama_response(translated)
            translated = clean_translation(translated)
        translated_segments.append(
            translated
        )  # Keep track of the cleaned translations

        # Adjust the speaking rate if the translated text is significantly longer than the original.
        if len(translated) > len(sentence) * 1.2:
            adjusted_rate = f"{int(rate[:-1]) - 5}%"
            print(
                f"Adjusting speaking rate to {adjusted_rate} for segment {idx+1}."
            )
            run_generate_audio_for_segment(
                translated, segment_file, voice, adjusted_rate
            )
        else:
            run_generate_audio_for_segment(translated, segment_file, voice, rate)

        if not os.path.exists(segment_file) or os.path.getsize(segment_file) == 0:
            raise FileNotFoundError(
                f"Audio generation failed for sentence {idx+1}"
            )
        try:
            segment_audio = AudioSegment.from_file(segment_file)
        except Exception as e:
            print(f"Pydub failed to load segment audio: {e}. File: {segment_file}")
            raise

        target_duration_ms = (end - start) * 1000
        current_duration = len(segment_audio)
        tolerance_ms = 200
        if abs(current_duration - target_duration_ms) > tolerance_ms:
            if current_duration < target_duration_ms:
                silence_duration = (
                    target_duration_ms - current_duration - cumulative_excess
                )
                silence_duration = max(0, silence_duration)
                silence = AudioSegment.silent(duration=silence_duration)
                segment_audio += silence
                cumulative_excess = 0
            elif current_duration > target_duration_ms:
                segment_audio = segment_audio[:target_duration_ms]
                cumulative_excess += current_duration - target_duration_ms
        audio_segments.append(segment_audio)
        # Use manual transcript if provided, otherwise, use the original sentence
        original_sentence = (
            manual_transcript[idx] if manual_transcript and idx < len(manual_transcript) else sentence
        )
        debug_entries.append(
            f"Segment {idx+1} (start: {start}s, end: {end}s):\n"
            f"**Original:** {original_sentence}\n"
            f"**Translated:** {translated_segments[idx]}\n"  # Use the cleaned translation
            f"**Target duration:** {target_duration_ms/1000:.2f}s, "
            f"**Audio duration:** {current_duration/1000:.2f}s, "
            f"**Cumulative excess:** {cumulative_excess/1000:.2f}"
        )
        if progress_callback:
            progress = (idx + 1) / total_sentences * 80
            progress_callback(progress)

    if audio_segments:
        final_audio = sum(audio_segments)
        final_duration = len(final_audio)
        tolerance_ms = 500 if total_video_duration <= 600000 else 100
        if final_duration < total_video_duration - tolerance_ms:
            silence = AudioSegment.silent(duration=total_video_duration - final_duration)
            final_audio += silence
        elif final_duration > total_video_duration + tolerance_ms:
            excess_duration = final_duration - total_video_duration
            print(
                "Final audio exceeds total video duration by "
                f"{excess_duration / 1000:.2f}s. Redistributing excess duration."
            )
            adjustment_ratio = excess_duration / len(audio_segments)
            adjusted_segments = []
            for segment in audio_segments:
                adjusted_duration = len(segment) - adjustment_ratio
                adjusted_segments.append(segment[: max(0, int(adjusted_duration))])
            final_audio = sum(adjusted_segments)
        final_audio_duration = len(final_audio)
        if abs(final_audio_duration - total_video_duration) > 100:
            if final_audio_duration < total_video_duration:
                silence = AudioSegment.silent(
                    duration=total_video_duration - final_audio_duration
                )
                final_audio += silence
            elif final_audio_duration > total_video_duration:
                final_audio = final_audio[:total_video_duration]
        final_audio.export(FINAL_AUDIO_FILE, format="mp3")
        if not os.path.exists(FINAL_AUDIO_FILE):
            raise RuntimeError("Final audio file creation failed")
        if abs(len(AudioSegment.from_file(FINAL_AUDIO_FILE)) - total_video_duration) > 100:
            raise ValueError(
                f"Final audio duration mismatch: {len(final_audio)/1000:.1f}s vs video "
                f"{total_video_duration/1000:.1f}s"
            )
        debug_log_path = create_translation_log(debug_entries)
        if not debug_log_path:
            print("Debug log file could not be created.")
        print("Final synchronized audio generated!")
        return FINAL_AUDIO_FILE, debug_log_path
    else:
        print("No audio segments were generated.")
        return None, None





def save_transcript(transcript_text: str, filename: str = "transcript.txt"):
    with open(filename, "w", encoding="utf-8") as f:
        f.write(transcript_text)
    print(f"Transcript saved to {filename}")

def merge_audio_with_video(video_path: str, audio_path: str):
    try:
        print("Merging audio with video...")
        video = VideoFileClip(video_path)
        audio = AudioFileClip(audio_path)
        output_video_path = OUTPUT_VIDEO
        video = video.set_audio(audio)
        video.write_videofile(output_video_path, codec="libx264", audio_codec="aac")
        video.close()
        audio.close()
        if not os.path.exists(output_video_path) or os.path.getsize(output_video_path) == 0:
            raise RuntimeError("Merged video file is missing or invalid.")
        return output_video_path
    except Exception as e:
        print(f"Failed to merge audio with video: {e}")
        raise

def translate_with_openai(text: str, api_key: str, target_language: str = "fr") -> str:
    try:
        openai.api_key = api_key
        prompt = f"""You are a professional translator specializing in ERP Cloud Fusion systems.
        Translate the following text into {target_language}, ensuring that technical terms
        and user interface elements are accurately translated in the context of ERP Cloud Fusion.

        Only return the translated sentence without introductory phrases.
        Do not add anything beyond the translation itself.

        Text: {text}
        """

        response = openai.ChatCompletion.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": "You are a professional translator."},
                {"role": "user", "content": prompt}
            ],
            max_tokens=2048,
            temperature=0.3
        )
        return response["choices"][0]["message"]["content"].strip()
    except Exception as e:
        print(f"OpenAI Translation failed: {e}")
        return text

def validate_transcript_format(transcript: str):
    for line in transcript.splitlines():
        line = re.sub(r'^Texte\s*:\s*', '', line)
        if not re.match(r'^\d+:\d+\s*-\s*\d+:\d+:\s*.+$', line):
            print(f"Invalid transcript line format: {line}")

def translate_with_ollama(text: str, model: str = "7shi/llama-translate:8b-q4_K_M", output_file: str = "ollama_response.txt") -> str:
    try:
        response = ollama.generate(
            model=model,
            prompt = (
                f"Translate the following text into French, ensuring technical terms and UI elements "
                f"are accurately translated in the context of ERP Cloud Fusion.\n\n"
                f"Only return the translated sentence with no extra formatting or commentary.\n\n"
                f"Text: {text}"
            )
        )
        with open(output_file, "w", encoding="utf-8") as f:
            f.write("Ollama Response:\n")
            f.write(str(response))
        if "response" not in response:
            print("Unexpected Ollama response format.")
            return text
        translated_text = response["response"].strip()
        if not translated_text:
            print("Ollama returned an empty or invalid response.")
            return text
        return translated_text
    except Exception as e:
        print(f"Ollama Translation failed: {e}")
        return text




# --- Debugging Functions ---
def create_translation_log(debug_entries: list) -> str:
    try:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        log_file_path = f"translation_debug_{timestamp}.md"
        with open(log_file_path, "w", encoding="utf-8") as f:
            f.write("# Translation Debug Log\n\n")
            for entry in debug_entries:
                f.write(entry + "\n---\n")
        return log_file_path
    except Exception as e:
        print(f"Failed to create debug log: {e}")
        return None

# --- Core Functions ---
def chunk_text(text: str, max_length: int = 1000) -> list:
    sentences = re.split(r'(?<=[.!?])\s+', text)
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        if len(current_chunk) + len(sentence) > max_length:
            chunks.append(current_chunk.strip())
            current_chunk = sentence
        else:
            current_chunk += " " + sentence
    if current_chunk.strip():
        chunks.append(current_chunk.strip())
    return chunks

def translate_text(text: str, translation_model: str = "Google Translate", openai_api_key: str = None) -> str:
    try:
        if translation_model == "OpenAI (Cloud)":
            print("Using OpenAI for translation...")
            return translate_with_openai(text, openai_api_key, target_language="fr")
        elif translation_model == "Ollama (Local)":
            print("Using Ollama for translation...")
            return translate_with_ollama(text)
        elif translation_model == "Google Translate":
            print("Using Google Translate for translation...")
            chunks = chunk_text(text, max_length=512)
            translated_chunks = []
            for chunk in chunks:
                clean_chunk = chunk.strip()
                if not clean_chunk:
                    continue
                try:
                    translated = GoogleTranslator(source='auto', target='fr').translate(clean_chunk)
                    if not translated.strip():
                        raise ValueError("Empty translation")
                    translated_chunks.append(translated)
                except Exception as e:
                    print(f"Translation failed for chunk: {clean_chunk}. Using original text. Error: {e}")
                    translated_chunks.append(clean_chunk)
            return "\n".join(translated_chunks)
        else:
            print(f"Unknown translation model: {translation_model}. Using original text.")
            return text
    except Exception as e:
        print(f"Translation process failed: {e}")
        return text



def parse_transcript(transcript: str):
    sentence_groups = []
    current_group = []
    sentence_end_pattern = r'[.!?](?:\s|$)'
    base_segments = []
    for line in transcript.splitlines():
        line = re.sub(r'^Texte\s*:\s*', '', line)
        match = re.search(r'(\d+:\d+)\s*-\s*(\d+:\d+):\s*(.+)$', line)
        if match:
            start = convert_time(match.group(1))
            end = convert_time(match.group(2))
            text = match.group(3).strip()
            base_segments.append((start, end, text))
        else:
            print(f"Line skipped due to incorrect format: {line}")
    if not base_segments:
        raise ValueError("No valid timestamped segments found in the transcript.")
    for seg_start, seg_end, text in base_segments:
        current_group.append((seg_start, seg_end, text))
        if re.search(sentence_end_pattern, text):
            full_text = ' '.join(t for _, _, t in current_group)
            group_start = current_group[0][0]
            group_end = current_group[-1][1]
            sentence_groups.append((group_start, group_end, full_text))
            current_group = []
    if current_group:
        full_text = ' '.join(t for _, _, t in current_group)
        group_start = current_group[0][0]
        group_end = current_group[-1][1]
        sentence_groups.append((group_start, group_end, full_text))
    if not sentence_groups:
        raise ValueError("No valid sentence groups found in the transcript.")
    return sentence_groups

def convert_time(time_str: str) -> int:
    m, s = map(int, time_str.split(':'))
    return m * 60 + s

def convert_seconds_to_time(seconds: int) -> str:
    m, s = divmod(seconds, 60)
    return f"{m:02}:{s:02}"



def run_generate_audio_for_segment(text: str, output_file: str, voice: str, rate: str):
    nest_asyncio.apply()
    loop = asyncio.get_event_loop()
    try:
        loop.run_until_complete(generate_segment_audio(text, output_file, voice, rate))
    except Exception as e:
        print(f"Failed to generate audio for segment: {e}")

def generate_transcript(video_path: str) -> str:
    print("Extracting audio from video...")
    try:
        video_clip = VideoFileClip(video_path)
        audio_clip = video_clip.audio
        audio_temp_file = "temp_audio.wav"
        audio_clip.write_audiofile(audio_temp_file)
        audio_clip.close()
        video_clip.close()
        print("Transcribing audio content...")
        model = whisper.load_model("base")
        result = model.transcribe(audio_temp_file)
        os.remove(audio_temp_file)
        transcript_lines = []
        for segment in result["segments"]:
            start_min = int(segment["start"] // 60)
            start_sec = int(segment["start"] % 60)
            end_min = int(segment["end"] // 60)
            end_sec = int(segment["end"] % 60)
            text = segment["text"].strip().replace("\n", " ")
            transcript_lines.append(f"{start_min:01d}:{start_sec:02d} - {end_min:01d}:{end_sec:02d}: {text}")
        return "\n".join(transcript_lines)
    except Exception as e:
        print(f"Error during transcript generation: {e}")
        return ""







✅ ffmpeg found at: C:\ffmpeg\bin\ffmpeg.EXE
