In [7]:
import ffmpeg
import whisper
import json
import re
import os

# Step 1: Extract Audio from Video
def extract_audio(video_path, audio_path):
    """Extract the audio track of a video file into ``audio_path`` via ffmpeg.

    Args:
        video_path: Path of the source video; it must already exist.
        audio_path: Destination path for the extracted audio (overwritten
            if it already exists).

    Returns:
        True when ffmpeg finished without error, False when the video is
        missing or ffmpeg reported a failure (the reason is printed).
    """
    video_missing = not os.path.exists(video_path)
    if video_missing:
        print(f"Video file {video_path} does not exist")
        return False

    try:
        stream = ffmpeg.input(video_path)
        stream = stream.output(audio_path)
        # Capture both pipes so ffmpeg's diagnostics are available on failure.
        stream.run(overwrite_output=True, capture_stdout=True, capture_stderr=True)
    except ffmpeg.Error as err:
        print(f"An error occurred: {err.stderr.decode('utf-8')}")
        return False
    else:
        print("Audio extraction successful")
        return True

# Step 2: Transcribe Audio with Whisper
def transcribe_audio_with_whisper(audio_path):
    """Transcribe an audio file with the Whisper "base" model.

    The transcription is requested with ``word_timestamps=True`` and is also
    written to ``transcription.json`` in the current working directory.

    Args:
        audio_path: Path of the audio file to transcribe.

    Returns:
        The result object returned by ``model.transcribe``.
    """
    transcription = whisper.load_model("base").transcribe(
        audio_path,
        word_timestamps=True,
    )

    # Keep a copy on disk so the transcription can be inspected later.
    with open("transcription.json", "w", encoding='utf-8') as json_file:
        json.dump(transcription, json_file, indent=4, ensure_ascii=False)

    return transcription

# Step 3: Split Transcription into Sentences
def split_into_sentences(transcription_result):
    """Group word-level transcription output into timestamped sentences.

    A sentence ends at the first word whose text ends with ``.``, ``!`` or
    ``?`` (optionally followed by closing quotes/brackets). Word texts are
    stripped before use, so tokens that carry leading whitespace
    (e.g. ``" world."``) are detected correctly and joined with single spaces.

    Args:
        transcription_result: Mapping with a ``'segments'`` list; each segment
            may hold a ``'words'`` list of dicts with ``'word'``, ``'start'``
            and ``'end'`` keys.

    Returns:
        A list of ``{"start", "end", "sentence"}`` dicts in spoken order.
        Words left over after the last terminator form a final sentence that
        ends at the last word seen. Returns an empty list (and prints a
        notice) when there are no segments.
    """
    sentences = []

    segments = transcription_result.get('segments')
    if not segments:
        print("No segments found in the transcription result")
        return sentences

    sentence_words = []
    current_start = None
    last_end = None

    for segment in segments:
        # Segments without word-level data are skipped instead of raising KeyError.
        for word in segment.get('words') or []:
            token = word['word'].strip()
            if not token:
                continue

            if current_start is None:
                current_start = word['start']

            sentence_words.append(token)
            last_end = word['end']

            # Anchor at the END of the token: the terminator trails the word.
            if re.search(r'[.!?]["\')\]]*$', token):
                sentences.append({
                    "start": current_start,
                    "end": last_end,
                    "sentence": " ".join(sentence_words)
                })
                sentence_words = []
                current_start = None

    # Flush any words that were not closed by a terminator.
    if sentence_words:
        sentences.append({
            "start": current_start,
            "end": last_end,
            "sentence": " ".join(sentence_words)
        })

    return sentences

# Step 4: Save Sentences to Text Files
def save_sentences_to_files(sentences):
    """Write every sentence's text to its own UTF-8 file.

    Files are created in the current working directory and named
    ``sentence_<index>.txt``, where the index is the sentence's position
    in ``sentences`` (starting at 0).

    Args:
        sentences: Iterable of dicts that each provide a ``'sentence'`` string.
    """
    for index, entry in enumerate(sentences):
        target_name = f"sentence_{index}.txt"
        with open(target_name, "w", encoding='utf-8') as out_file:
            out_file.write(entry['sentence'])


# Main Script
video_path = 'input_video.mp4'  # Make sure this is the correct path to your video file
audio_path = 'audio.wav'
# Read by the later translation / text-to-speech / synchronisation cells, which
# parse one "start: <sec>, end: <sec>, sentence: <text>" line per sentence.
timestamps_path = 'sentence_timestamps.txt'

# Run the pipeline only when audio extraction succeeded.
if extract_audio(video_path, audio_path):
    # Transcribe audio with Whisper
    transcription_result = transcribe_audio_with_whisper(audio_path)

    # Split transcription into sentences
    sentences = split_into_sentences(transcription_result)

    # Save sentences to text files
    save_sentences_to_files(sentences)

    # Also write the combined timestamp file the downstream cells read;
    # without it a fresh "Restart & Run All" fails on a missing input file.
    with open(timestamps_path, "w", encoding='utf-8') as timestamps_file:
        for entry in sentences:
            # Entries without usable timestamps cannot be placed on the timeline.
            if entry['start'] is None or entry['end'] is None:
                continue
            timestamps_file.write(
                f"start: {float(entry['start']):.3f}, "
                f"end: {float(entry['end']):.3f}, "
                f"sentence: {entry['sentence']}\n"
            )


Audio extraction successful


In [8]:
# Secrets must never be stored in the notebook: anything saved here is shared
# with every copy of the file. Each key is taken from the environment when it
# is already set; otherwise it is requested interactively without echoing.
# NOTE(review): keys that were previously committed in this cell should be
# treated as leaked and revoked with the respective providers.
import getpass

for _key_name in ('OPENAI_API_KEY', 'ELEVENLABS_API_KEY'):
    if not os.environ.get(_key_name):
        os.environ[_key_name] = getpass.getpass(f"Enter {_key_name}: ")

In [9]:
import os
from openai import OpenAI

def read_file(file_path):
    """Return the full contents of ``file_path`` decoded as UTF-8 text."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return handle.read()

def translate_text(text, api_key):
    """Translate ``text`` to English with the OpenAI chat completions API.

    The system prompt asks the model to keep timestamps and formatting intact
    so the translated output can be parsed in the same way as the input.

    Args:
        text: The text to translate.
        api_key: OpenAI API key used for the request. When it is ``None`` the
            client falls back to the ``OPENAI_API_KEY`` environment variable.

    Returns:
        The content of the first completion choice.
    """
    # Pass the caller's key through; it was previously accepted but ignored.
    client = OpenAI(api_key=api_key)

    completion = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {
                "role": "system",
                "content": "Translate the following text to English, keep timestamps and format intact"
            },
            {
                "role": "user",
                "content": text
            },
        ],
        temperature=1,
        max_tokens=2560,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0
    )

    return completion.choices[0].message.content

def save_translated_text(file_path, content):
    """Write ``content`` to ``file_path`` as UTF-8, replacing any existing file."""
    with open(file_path, mode='w', encoding='utf-8') as out_file:
        out_file.write(content)

    # File paths
# Translation driver: read the timestamped sentences, translate them, and
# write the result next to the input. Runs at cell execution time.
input_file_path = 'sentence_timestamps.txt'  # Input file with sentences
output_file_path = 'sentence_timestamps_en.txt'  # Output file for translated sentences
openai_api_key = os.getenv('OPENAI_API_KEY')  # None when the variable is not set
# Read content from the file
content = read_file(input_file_path)

# Translate the content (the whole file is sent as a single user message)
translated_content = translate_text(content, openai_api_key)
# print(translated_content)
# Save the translated content to a new file
save_translated_text(output_file_path, translated_content)



In [26]:
import os
import re
import requests

# Configuration for the text-to-speech cell.
CHUNK_SIZE = 1024  # chunk_size passed to response.iter_content when saving streamed audio
XI_API_KEY = os.getenv('ELEVENLABS_API_KEY')  # API key read from the environment; None when unset
VOICE_ID = "XfNU2rGpBa01ckF309OY"  # Voice identifier interpolated into the text-to-speech URL
OUTPUT_DIR = "output"  # Directory to save the output audio files

def read_translated_file(file_path):
    """Parse a timestamped sentence file into parallel lists.

    Each useful line has the form
    ``start: <number>, end: <number>, sentence: <text>``. Lines that do not
    begin with that pattern are ignored.

    Args:
        file_path: Path of the UTF-8 text file to parse.

    Returns:
        A ``(timestamps, sentences)`` tuple where ``timestamps`` is a list of
        ``(start, end)`` float pairs and ``sentences`` holds the matching
        stripped sentence texts, both in file order.
    """
    line_pattern = r'start: ([\d.]+), end: ([\d.]+), sentence: (.+)'
    timestamps, sentences = [], []

    with open(file_path, 'r', encoding='utf-8') as file:
        for raw_line in file:
            parsed = re.match(line_pattern, raw_line)
            if parsed is None:
                continue
            begin, finish, text = parsed.groups()
            timestamps.append((float(begin), float(finish)))
            sentences.append(text.strip())

    return timestamps, sentences

def generate_audio(text, api_key, voice_id, output_path):
    """Synthesize ``text`` with the ElevenLabs streaming endpoint and save it.

    Args:
        text: Text to convert to speech.
        api_key: ElevenLabs API key, sent in the ``xi-api-key`` header.
        voice_id: Identifier of the voice, interpolated into the request URL.
        output_path: File the audio bytes are written to. Its parent
            directory is created when it does not exist yet.

    Returns:
        ``output_path`` once the audio has been written.

    Raises:
        Exception: When the service answers with a non-success status; the
            response body is included in the message.
    """
    tts_url = f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}/stream"
    headers = {
        "Accept": "application/json",
        "xi-api-key": api_key
    }
    data = {
        "text": text,
        "model_id": "eleven_multilingual_v2",
        "voice_settings": {
            "stability": 0.5,
            "similarity_boost": 0.8,
            "style": 0.0,
            "use_speaker_boost": True
        }
    }

    # A streamed response keeps its connection checked out until it is closed;
    # the context manager releases it on success and on every error path.
    with requests.post(tts_url, headers=headers, json=data, stream=True) as response:
        if not response.ok:
            print(f"Failed to generate audio for text: {text}")
            raise Exception(f"Failed to generate audio: {response.text}")

        # Create the target directory so a fresh run does not fail on open().
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        with open(output_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
                if chunk:
                    f.write(chunk)

    return output_path

def main():
    """Generate one audio file per translated sentence.

    Reads ``sentence_timestamps_en.txt`` and writes
    ``output_audio_<index>.mp3`` into ``OUTPUT_DIR`` for each parsed sentence,
    printing a progress line after every file.
    """
    _, sentences = read_translated_file('sentence_timestamps_en.txt')

    for index in range(len(sentences)):
        target = os.path.join(OUTPUT_DIR, f"output_audio_{index}.mp3")
        generate_audio(sentences[index], XI_API_KEY, VOICE_ID, target)
        print(f"Generated audio file for sentence {index}: {target}")


if __name__ == "__main__":
    main()


In [20]:
import os
from moviepy.editor import VideoFileClip, AudioFileClip
from pydub import AudioSegment

# Directory holding the per-sentence output_audio_<i>.mp3 files read by this
# cell; the merged audio track and the final video are written here too.
OUTPUT_DIR = "output"  # Directory where the output audio files are saved

def read_translated_file(file_path):
    """Parse a timestamped sentence file into parallel lists.

    Each useful line has the form
    ``start: <number>, end: <number>, sentence: <text>``. The sentence part is
    captured whole, so commas and colons inside it are preserved. Blank or
    otherwise non-matching lines are skipped rather than raising, which keeps
    the resulting indices aligned with the regex-based parser used when the
    per-sentence audio files were numbered.

    Args:
        file_path: Path of the UTF-8 text file to parse.

    Returns:
        A ``(timestamps, sentences)`` tuple where ``timestamps`` is a list of
        ``(start, end)`` float pairs and ``sentences`` holds the matching
        stripped sentence texts, both in file order.
    """
    # Imported locally because this cell's import block does not provide `re`.
    import re

    line_pattern = re.compile(r'start: ([\d.]+), end: ([\d.]+), sentence: (.+)')

    timestamps = []
    sentences = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            match = line_pattern.match(line)
            if not match:
                continue
            timestamps.append((float(match.group(1)), float(match.group(2))))
            sentences.append(match.group(3).strip())

    return timestamps, sentences

def synchronize_audio_with_video(video_file_path, timestamps, output_video_path):
    """Overlay the per-sentence audio files on a video at their start times.

    A silent track as long as the video is created, each
    ``output_audio_<i>.mp3`` from ``OUTPUT_DIR`` is overlaid at the start time
    of the i-th timestamp, the mix is exported to ``final_audio.mp3`` in
    ``OUTPUT_DIR``, and the video is re-encoded with that track as its audio.

    Args:
        video_file_path: Path of the source video.
        timestamps: Sequence of ``(start, end)`` pairs in seconds; only the
            start value is used for placement.
        output_video_path: Path the resulting video is written to.
    """
    video_clip = VideoFileClip(video_file_path)
    try:
        final_audio = AudioSegment.silent(duration=int(video_clip.duration * 1000))

        for i, (start, end) in enumerate(timestamps):
            audio_file_path = os.path.join(OUTPUT_DIR, f"output_audio_{i}.mp3")
            segment = AudioSegment.from_mp3(audio_file_path)
            start_ms = int(start * 1000)

            print(f"Overlaying segment {i}: start={start_ms}ms, duration={len(segment)}ms")

            final_audio = final_audio.overlay(segment, position=start_ms)

        final_audio_path = os.path.join(OUTPUT_DIR, "final_audio.mp3")
        # export() hands back the open output file; close it so the handle is
        # not left dangling while the file is read again below.
        exported_file = final_audio.export(final_audio_path, format="mp3")
        exported_file.close()

        dubbed_audio = AudioFileClip(final_audio_path)
        try:
            final_video = video_clip.set_audio(dubbed_audio)
            final_video.write_videofile(output_video_path, codec="libx264", audio_codec="aac")
        finally:
            dubbed_audio.close()
    finally:
        # Release the reader held on the source video even when a step fails.
        video_clip.close()

def main():
    """Build the dubbed video from the translated timestamps.

    Reads the sentence timings from ``sentence_timestamps_en.txt`` and writes
    ``output_video_with_audio.mp4`` into ``OUTPUT_DIR`` using
    ``input_video.mp4`` as the source video.
    """
    destination = os.path.join(OUTPUT_DIR, 'output_video_with_audio.mp4')

    # Only the timings are needed here; the sentence texts are discarded.
    timings = read_translated_file('sentence_timestamps_en.txt')[0]

    synchronize_audio_with_video('input_video.mp4', timings, destination)


if __name__ == "__main__":
    main()


Overlaying segment 0: start=0ms, duration=862ms
Overlaying segment 1: start=10520ms, duration=653ms
Overlaying segment 2: start=11180ms, duration=2586ms
Overlaying segment 3: start=13720ms, duration=1202ms
Overlaying segment 4: start=14600ms, duration=8960ms
Overlaying segment 5: start=24560ms, duration=4624ms
Overlaying segment 6: start=37380ms, duration=1384ms
Overlaying segment 7: start=43000ms, duration=653ms
Overlaying segment 8: start=54240ms, duration=679ms
Overlaying segment 9: start=55760ms, duration=1254ms
Overlaying segment 10: start=58040ms, duration=2090ms
Overlaying segment 11: start=66080ms, duration=1071ms
Overlaying segment 12: start=72080ms, duration=2325ms
Overlaying segment 13: start=78520ms, duration=653ms
Overlaying segment 14: start=82680ms, duration=7523ms
Overlaying segment 15: start=107640ms, duration=2116ms
Overlaying segment 16: start=118300ms, duration=3840ms
Overlaying segment 17: start=127920ms, duration=2926ms
Overlaying segment 18: start=139320ms, durat

                                                                      

MoviePy - Done.
Moviepy - Writing video output\output_video_with_audio.mp4



                                                                 

Moviepy - Done !
Moviepy - video ready output\output_video_with_audio.mp4
