<a id="toc"></a>
# Auto Generated Agent Chat: Translating Video audio using Whisper and GPT-3.5-turbo
In this notebook, we demonstrate how to use Whisper and GPT-3.5-turbo with `AssistantAgent` and `UserProxyAgent` to transcribe the speech in a video file,
translate it, and attach timestamps so that the result reads like a subtitle file. The notebook is based on [agentchat_function_call.ipynb](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_function_call.ipynb).


## Set your API Endpoint

In [None]:
from openai import OpenAI
import whisper
import autogen
from moviepy.editor import VideoFileClip
import os

# Model endpoints offered to the agents: a single GPT-4 entry whose API key is
# taken from the OPENAI_API_KEY environment variable (None when it is unset).
config_list = [{"model": "gpt-4", "api_key": os.getenv("OPENAI_API_KEY")}]

In [None]:
def recognize_transcript_from_video(audio_filepath):
    """Transcribe the speech in a video and save it as a timestamped script.

    The file is transcribed with Whisper's "small" model. Every non-empty
    segment Whisper reports becomes one transcript entry carrying the start
    and end time Whisper measured for it. The entries are also written to
    ``transcription.txt``, one ``"<start>s to <end>s: <text>"`` line each.

    Args:
        audio_filepath: Path of the video file to transcribe.

    Returns:
        A list of dicts with the keys ``"sentence"``, ``"timestamp_start"``
        and ``"timestamp_end"`` (times in seconds). On failure an error
        message string is returned instead of raising.
    """
    try:
        # Load model
        model = whisper.load_model("small")
        # Transcribe audio
        result = model.transcribe(audio_filepath)

        # Use the per-segment timestamps from Whisper rather than spreading
        # the clip duration evenly over sentences: an even split drifts away
        # from the audio as soon as sentences differ in length.
        transcript = []
        for segment in result["segments"]:
            text = segment["text"].strip()
            if not text:
                # Skip empty segments so no blank subtitle lines are produced.
                continue
            transcript.append(
                {
                    "sentence": text,
                    "timestamp_start": segment["start"],
                    "timestamp_end": segment["end"],
                }
            )

        # Save the transcript to a file
        with open("transcription.txt", "w") as file:
            file.writelines(
                f"{item['timestamp_start']}s to {item['timestamp_end']}s: {item['sentence']}\n"
                for item in transcript
            )

        return transcript

    except FileNotFoundError:
        return "The specified audio file could not be found."
    except Exception as e:
        # Errors are reported as text rather than raised.
        return f"An unexpected error occurred: {str(e)}"


def translate_text(input_text, source_language, target_language):
    """Translate a piece of text with GPT-3.5-turbo through the OpenAI chat API.

    The API key is read from the module-level ``key`` variable at call time.

    Args:
        input_text: The text to translate.
        source_language: Name of the language ``input_text`` is written in.
        target_language: Name of the language to translate into.

    Returns:
        The content of the first completion choice, or ``None`` when the
        response carries no choices.
    """
    client = OpenAI(api_key=key)

    user_prompt = f"Translate the following {source_language} text to {target_language}: '{input_text}'"
    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": user_prompt},
    ]
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=conversation,
        max_tokens=1500,
    )

    # Guard against a response without any choices before indexing into it.
    if not completion.choices:
        return None
    return completion.choices[0].message.content


def translate_transcript(source_language, target_language):
    """Translate the saved transcript line by line, keeping the timestamps.

    Reads ``transcription.txt``, where each line is expected to look like
    ``"<timestamp>: <text>"``. Only the text part is sent to
    ``translate_text``; the timestamp prefix is carried over unchanged.

    Args:
        source_language: Name of the language the transcript is written in.
        target_language: Name of the language to translate into.

    Returns:
        The translated transcript as one string, lines joined with ``"\\n"``.
    """
    with open("transcription.txt", "r") as f:
        lines = f.readlines()

    translated_transcript = []

    for line in lines:
        stripped = line.strip()
        # Split on the FIRST ": " only. The sentence itself may contain ": "
        # (e.g. "He said: hello"); splitting on every occurrence would make
        # such a line look malformed and leave it untranslated.
        timestamp, separator, text = stripped.partition(": ")
        if separator:
            # Translate only the text part
            translated_text = translate_text(text, source_language, target_language)
            if translated_text is None:
                # No translation came back; keep the original text rather
                # than writing the literal string "None" into the subtitle.
                translated_text = text
            # Reconstruct the line with the translated text and the preserved timestamp
            translated_transcript.append(f"{timestamp}: {translated_text}")
        else:
            # If the line doesn't contain a timestamp, add it as is
            translated_transcript.append(stripped)

    return "\n".join(translated_transcript)


# LLM settings for the assistant: the schemas of the two callable functions,
# the model endpoints to use, and a request timeout of 120.
llm_config = {
    "functions": [
        {
            "name": "recognize_transcript_from_video",
            "description": "recognize the speech from video and transfer into a txt file",
            "parameters": {
                "type": "object",
                "properties": {
                    "audio_filepath": {"type": "string", "description": "path of the video file"},
                },
                "required": ["audio_filepath"],
            },
        },
        {
            "name": "translate_transcript",
            "description": "using translate_text function to translate the script",
            "parameters": {
                "type": "object",
                "properties": {
                    "source_language": {"type": "string", "description": "source language"},
                    "target_language": {"type": "string", "description": "target language"},
                },
                "required": ["source_language", "target_language"],
            },
        },
    ],
    "config_list": config_list,
    "timeout": 120,
}

# Run settings: the language pair, the OpenAI key (read as a global by
# translate_text), and the video to process (placeholder path).
source_language, target_language = "English", "Chinese"
key = os.getenv("OPENAI_API_KEY")
target_video = "your_file_path"

# Assistant agent driven by the LLM; llm_config carries the function schemas
# it may call, and its system message tells it to reply TERMINATE when done.
chatbot = autogen.AssistantAgent(
    name="chatbot",
    system_message="For coding tasks, only use the functions you have been provided with. Reply TERMINATE when the task is done.",
    llm_config=llm_config,
)

# User proxy that runs without human input (human_input_mode="NEVER") and
# auto-replies at most 10 times in a row.
user_proxy = autogen.UserProxyAgent(
    name="user_proxy",
    # A message ends the chat when its content is non-empty and, ignoring
    # trailing whitespace, ends with "TERMINATE".
    is_termination_msg=lambda x: x.get("content", "") and x.get("content", "").rstrip().endswith("TERMINATE"),
    human_input_mode="NEVER",
    max_consecutive_auto_reply=10,
    code_execution_config={"work_dir": "coding_2"},
)

# Map the function names declared in llm_config to the Python callables that
# implement them.
user_proxy.register_function(
    function_map={
        "recognize_transcript_from_video": recognize_transcript_from_video,
        "translate_transcript": translate_transcript,
    }
)
# Start the conversation: ask the assistant to transcribe target_video and
# then translate the transcript from source_language to target_language.
user_proxy.initiate_chat(
    chatbot,
    message=f"For the video located in {target_video}, recognize the speech and transfer into a script file, "
            f"then translate from {source_language} text to {target_language} as a subtitle text. ",
)