In [None]:
# Python dependencies. %pip (rather than !pip) installs into the environment
# of the running kernel.
# moviepy is held below 2.0 because this notebook imports `moviepy.editor`
# and calls `set_audio`, both of which belong to the 1.x API.
%pip install -q "moviepy<2"
%pip install -q openai-whisper
%pip install -q deep-translator
%pip install -q gTTS
# System dependency used by moviepy, whisper and pydub for decoding/encoding.
# -y answers apt's confirmation prompt, which a notebook shell cannot do
# interactively.
!sudo apt-get install -y ffmpeg


Collecting openai-whisper
  Downloading openai-whisper-20240930.tar.gz (800 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m800.5/800.5 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting tiktoken (from openai-whisper)
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->openai-whisper)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->openai-whisper)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch->openai-whisper)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-

In [None]:
# Libraries used only by the metric helpers: jiwer (word error rate) and
# pydub (audio durations). %pip installs into the running kernel's environment.
%pip install -q jiwer pydub

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [None]:
import whisper
from moviepy.editor import VideoFileClip, AudioFileClip
from deep_translator import GoogleTranslator
from gtts import gTTS
from IPython.display import Audio
from jiwer import wer
from pydub import AudioSegment
import os

# ------------------ Core Processing Functions ------------------ #

def extract_audio_from_video(video_path, audio_path="temp_audio.wav"):
    """Write the audio track of a video to a single-channel audio file.

    Args:
        video_path: Path of the source video.
        audio_path: Where to write the extracted audio. Defaults to
            ``"temp_audio.wav"`` in the current working directory.

    Returns:
        ``audio_path``, the path of the written audio file.

    Raises:
        ValueError: If the video has no audio track.
    """
    video = VideoFileClip(video_path)
    try:
        audio = video.audio
        if audio is None:
            raise ValueError(f"No audio track found in video: {video_path}")
        # "-ac 1" asks ffmpeg to down-mix to one channel (mono).
        audio.write_audiofile(audio_path, ffmpeg_params=["-ac", "1"])
    finally:
        # Always release whatever the clip holds open, even if writing failed.
        video.close()
    return audio_path

# Whisper models already loaded in this session, keyed by model name, so that
# repeated transcriptions do not reload the weights each time.
_WHISPER_MODELS = {}


def transcribe_audio(audio_path, model_name="base", language=None):
    """Transcribe an audio file to text with Whisper.

    Args:
        audio_path: Path of the audio file to transcribe.
        model_name: Name of the Whisper model to load. Defaults to ``"base"``.
        language: Optional language code of the speech (e.g. ``"hi"``). When
            ``None`` the choice of language is left to Whisper.

    Returns:
        The ``"text"`` entry of Whisper's transcription result.
    """
    model = _WHISPER_MODELS.get(model_name)
    if model is None:
        model = whisper.load_model(model_name)
        _WHISPER_MODELS[model_name] = model
    result = model.transcribe(audio_path, language=language)
    return result["text"]

# Google Translate (as wrapped by deep-translator) rejects requests longer
# than 5000 characters; stay below that with some headroom.
_MAX_TRANSLATE_CHARS = 4500


def _split_for_translation(text, limit=_MAX_TRANSLATE_CHARS):
    """Split text on whitespace into chunks of at most ``limit`` characters."""
    chunks = []
    current = ""
    for word in text.split():
        # A single "word" longer than the limit has to be cut hard.
        while len(word) > limit:
            if current:
                chunks.append(current)
                current = ""
            chunks.append(word[:limit])
            word = word[limit:]
        candidate = f"{current} {word}" if current else word
        if len(candidate) > limit:
            chunks.append(current)
            current = word
        else:
            current = candidate
    if current:
        chunks.append(current)
    return chunks


def translate_text(text, target_language="te"):
    """Translate text into ``target_language`` with Google Translate.

    The source language is auto-detected. Text short enough for one request is
    sent unchanged; longer text is split on whitespace, translated chunk by
    chunk, and re-joined with single spaces.

    Args:
        text: Text to translate.
        target_language: Language code to translate into. Defaults to ``"te"``.

    Returns:
        The translated text, or ``""`` when ``text`` is empty or only
        whitespace (no request is made in that case).
    """
    if not text or not text.strip():
        return ""

    translator = GoogleTranslator(source='auto', target=target_language)
    if len(text) <= _MAX_TRANSLATE_CHARS:
        return translator.translate(text)

    # Treat a chunk the service returns nothing for as empty rather than
    # failing the whole join.
    translated_chunks = [
        translator.translate(chunk) or ""
        for chunk in _split_for_translation(text)
    ]
    return " ".join(translated_chunks)

def synthesize_speech(text, language="te"):
    """Render ``text`` as speech with gTTS and save it as an MP3.

    Args:
        text: Text to be spoken.
        language: gTTS language code. Defaults to ``"te"``.

    Returns:
        The path of the saved file, always ``"dubbed_audio.mp3"``.
    """
    output_file = "dubbed_audio.mp3"
    gTTS(text=text, lang=language).save(output_file)
    return output_file

def sync_audio_to_video(video_path, audio_path):
    """Replace a video's audio track and write the result as an MP4.

    Args:
        video_path: Path of the source video.
        audio_path: Path of the audio file to use as the new track.

    Returns:
        The path of the written video, always ``"dubbed_video.mp4"``.
    """
    output_path = "dubbed_video.mp4"
    video = VideoFileClip(video_path)
    try:
        audio = AudioFileClip(audio_path)
        try:
            # NOTE(review): the audio is attached as-is; nothing here
            # reconciles its duration with the video's — confirm that is
            # the intended behaviour for dubbed tracks.
            final_video = video.set_audio(audio)
            final_video.write_videofile(output_path, codec="libx264", audio_codec="aac")
        finally:
            audio.close()
    finally:
        # Close both clips even when encoding fails so nothing stays open.
        video.close()
    return output_path

# ------------------ Metrics Calculation Functions ------------------ #

def transcription_accuracy(predicted, actual):
    """Score a transcript against a reference as ``(1 - WER) * 100``.

    Args:
        predicted: The transcript produced by the system.
        actual: The reference (ground-truth) transcript.

    Returns:
        A percentage, never below ``0.0``.
    """
    error = wer(actual, predicted)
    # The word error rate exceeds 1.0 when there are more errors than
    # reference words; clamp so the percentage cannot go negative.
    return max(0.0, (1 - error) * 100)

def translation_accuracy(reference, prediction):
    """Percentage of distinct reference words that also occur in the prediction.

    Both strings are split on whitespace and compared as sets of words, so
    word order and repetition are ignored.

    Args:
        reference: The reference translation.
        prediction: The translation being scored.

    Returns:
        The overlap as a percentage of the distinct reference words, or ``0``
        when the reference contains no words.
    """
    reference_vocab = set(reference.split())
    if not reference_vocab:
        return 0
    shared = reference_vocab.intersection(prediction.split())
    return len(shared) / len(reference_vocab) * 100

def tts_duration_accuracy(original_audio_path, synthesized_audio_path):
    """Compare the lengths of two audio files as a percentage.

    The score is the shorter length divided by the longer one, times 100, so
    identical lengths give 100 and the order of the arguments does not matter.

    Args:
        original_audio_path: Path of the original audio.
        synthesized_audio_path: Path of the synthesized audio.

    Returns:
        The length ratio as a percentage, or ``0.0`` when both files have
        zero length.
    """
    original_length = len(AudioSegment.from_file(original_audio_path))
    synthesized_length = len(AudioSegment.from_file(synthesized_audio_path))
    longest = max(original_length, synthesized_length)
    if longest == 0:
        # Both files are empty: there is nothing to compare, and dividing
        # would raise ZeroDivisionError.
        return 0.0
    return min(original_length, synthesized_length) / longest * 100

# ------------------ Main Pipeline ------------------ #

def process_video_locally(input_video_path, target_language="te", ground_truth_transcript=None, ref_translation=None):
    """Dub a video into another language and print quality metrics.

    Runs the stages in order: extract audio, transcribe it, translate the
    transcript, synthesize speech from the translation, and combine that
    speech with the original video. Progress and results are printed.

    Args:
        input_video_path: Path of the video to dub.
        target_language: Language code used for both translation and speech
            synthesis. Defaults to ``"te"``.
        ground_truth_transcript: Optional reference transcript. When given,
            the transcription accuracy is printed.
        ref_translation: Optional reference translation. When given, the
            translation accuracy is printed.

    Returns:
        The path of the dubbed video.
    """
    print("1️⃣ Extracting audio...")
    audio_path = extract_audio_from_video(input_video_path)

    print("2️⃣ Transcribing audio...")
    transcribed_text = transcribe_audio(audio_path)
    print("🔍 Transcribed Text:", transcribed_text)

    print("3️⃣ Translating text...")
    translated_text = translate_text(transcribed_text, target_language)
    print("🌐 Translated Text:", translated_text)

    print("4️⃣ Synthesizing speech...")
    dubbed_audio_path = synthesize_speech(translated_text, target_language)

    print("5️⃣ Combining video and dubbed audio...")
    output_video = sync_audio_to_video(input_video_path, dubbed_audio_path)
    print(f"✅ Dubbed video created: {output_video}")

    # ---------- Metrics ----------
    print("\n📊 Performance Metrics:")

    # The reference-based metrics only run when the caller supplies a reference.
    if ground_truth_transcript:
        acc = transcription_accuracy(transcribed_text, ground_truth_transcript)
        print(f"📝 Transcription Accuracy: {round(acc, 2)} %")

    if ref_translation:
        tr_acc = translation_accuracy(ref_translation, translated_text)
        print(f"🌐 Translation Accuracy: {round(tr_acc, 2)} %")

    # Compares the extracted original audio with the synthesized speech.
    dur_acc = tts_duration_accuracy(audio_path, dubbed_audio_path)
    print(f"🔊 TTS Duration Accuracy: {round(dur_acc, 2)} %")

    return output_video

# ------------------ Example Usage ------------------ #

if __name__ == "__main__":
    input_video = "/content/WhatsApp Video 2025-04-16 at 21.39.12.mp4"  # Update this path
    # Fail before any processing starts if the path was not updated.
    if not os.path.exists(input_video):
        raise FileNotFoundError(f"Input video not found: {input_video}")

    # Get target language from user input. Surrounding whitespace is removed
    # so it cannot corrupt the language code; an empty answer falls back to
    # Telugu ("te"). Case is left alone in case a code is case-sensitive.
    target_lang = input("Enter the target language (e.g., te, hi, fr, es): ").strip() or "te"

    # OPTIONAL: Provide known ground truth for metrics.
    # These are sample placeholders; replace them with the real transcript
    # and reference translation of `input_video`.
    ground_truth_transcript = "This is a sample video with clear speech"  # Manually provided
    ref_translation = "ఇది స్పష్టమైన మాటలతో కూడిన నమూనా వీడియో"  # Reference Telugu translation

    process_video_locally(input_video, target_lang, ground_truth_transcript, ref_translation)


Enter the target language (e.g., te, hi, fr, es): te
1️⃣ Extracting audio...
MoviePy - Writing audio in temp_audio.wav




MoviePy - Done.
2️⃣ Transcribing audio...





🔍 Transcribed Text:  انیفیسٹیشن کی ایک مویمے میں نے دیکھیں جمعے اس ایٹھ سرندڈ میں تھا 10 سیگریٹ موی اور وہ 10 سیگریٹ مویسے میری life change ہوئی ہے اب life kye سے چینج ہوئی اس مویمے ایک زیگریٹھے باتایا ہے کہ آپ کو kye سے بنیفیسٹ کرنا ہے جس کہ ایک ایک زام پلتا رات کو جب گاڑی چلتی ہے اس کے حلیٹ چالو رہتے اب نے کو پورا ہے نہیں رکھتا ایک سٹین لیمٹتک دیکھتا لیکن جیسے سے آگے پڑھیں ایک سٹین اٹھ مویمے ایک زام پڑھیں جیسے سے آگے پڑھیں جیسے سے آگے پڑھیں جیسے سے آگے پڑھیں جیسے سے آگے پڑھیں جیسے سے آگے پڑھیں جیسے سے آگے پڑھیں جیسے سے آگے پڑھیں جیسے سے آگے پڑھیں جیسے سے آگے پڑھیں جیسے سے آگے پڑھیں جیسے سے آگے پڑھیں جیسے سے آگے پڑھیں جیسے سے آگے پڑھیں جیسے سے آگے پڑھیں جیسے سے آگے پڑھیں جیسے سے آگے پڑھیں جیسے سے آگے پڑھیں جیسے سے آگے پڑھیں جیسے سے آگے پڑھیں جیسے سے آگے پڑھیں جیسے سے آگے پڑھیں گے
3️⃣ Translating text...
🌐 Translated Text: నేను చూసిన ముట్టడిలో ఒకటి ఈ ఇథి సింగ్డ్ శుక్రవారం, 10 సిగరెట్లు మరియు ఆమె 10 సిగరెట్లు. ఇప్పుడు లైఫ్ కై, ఈ మొక్కజొన్న ఒక ఉత్సాహంతో మాట్లాడాడు. కానీ ఒక స్టాన్ -ఇ -మమ్



MoviePy - Done.
Moviepy - Writing video dubbed_video.mp4







Moviepy - Done !
Moviepy - video ready dubbed_video.mp4
✅ Dubbed video created: dubbed_video.mp4

📊 Performance Metrics:
🔊 TTS Duration Accuracy: 44.81 %
