In [None]:
# pip install boto3

In [None]:
# pip install awscli

In [None]:
# pip install sentencepiece

In [None]:
import gradio as gr
from transformers import pipeline, AutoTokenizer, TFAutoModelForSeq2SeqLM
from dotenv import load_dotenv
import os
import subprocess
import torch
#Google Text to Speech
from gtts import gTTS
import tempfile

import boto3

In [None]:
# Load environment variables (python-dotenv) before any of them are read below.
load_dotenv()

# Model name configured for the OpenAI LLM calls.
OPENAI_MODEL = "gpt-3.5-turbo"
# API key taken from the environment; this is None when the variable is unset.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

In [None]:
# Speech-recognition pipelines keyed by model name, so the Whisper model is
# built once per kernel session instead of on every transcription request.
_ASR_PIPELINES = {}


def _get_asr_pipeline(model_name="openai/whisper-large"):
    """Return the cached speech-recognition pipeline, building it on first use."""
    if model_name not in _ASR_PIPELINES:
        _ASR_PIPELINES[model_name] = pipeline(
            "automatic-speech-recognition", model=model_name
        )
    return _ASR_PIPELINES[model_name]


# This function transcribes audio to text using Whisper
def transcribe_audio_original(audio_filepath):
    """Transcribe an audio file to text with the openai/whisper-large pipeline.

    Args:
        audio_filepath: Path of the audio file to transcribe. A missing value
            (None or an empty string) is treated as a failed transcription.

    Returns:
        The 'text' field of the pipeline result, or the literal string
        "Error in transcription" when no file was given or the pipeline failed.
    """
    # Short-circuit before touching the model: there is nothing to transcribe,
    # and the failure result is the same one callers already receive.
    if not audio_filepath:
        print("an error occured: no audio file was provided")
        return "Error in transcription"

    try:
        # Loading happens inside the try so that a failed model download is
        # reported the same way as a failed transcription.
        transcription_pipeline = _get_asr_pipeline()
        transcription_result = transcription_pipeline(audio_filepath)
        return transcription_result['text']
    except Exception as e:
        print(f"an error occured: {e}")
        return "Error in transcription"

In [None]:
from transformers import pipeline
from transformers import MarianMTModel, MarianTokenizer

# (tokenizer, model) pairs keyed by checkpoint name, so each language pair is
# loaded once per kernel session rather than on every call.
_TRANSLATION_MODELS = {}


def _get_translation_model(model_name):
    """Return the cached (tokenizer, model) pair for a MarianMT checkpoint."""
    if model_name not in _TRANSLATION_MODELS:
        _TRANSLATION_MODELS[model_name] = (
            MarianTokenizer.from_pretrained(model_name),
            MarianMTModel.from_pretrained(model_name),
        )
    return _TRANSLATION_MODELS[model_name]


# This function translates already-transcribed text into the specified language
def translate(transcribed_text, src_lang="en", target_lang="es"):
    """Translate text with the Helsinki-NLP/opus-mt-{src_lang}-{target_lang} model.

    Args:
        transcribed_text: The text to translate.
        src_lang: Source language code used in the checkpoint name.
        target_lang: Target language code used in the checkpoint name.

    Returns:
        The decoded translation of the first generated sequence, or the literal
        string "Error in transcription or translation" if loading the model or
        translating raised an exception.
    """
    try:
        # Define the model and tokenizer
        model_name = f"Helsinki-NLP/opus-mt-{src_lang}-{target_lang}"
        tokenizer, model = _get_translation_model(model_name)

        # Tokenize the text that was passed in. The original tokenized an
        # undefined name `text`, so every call ended in the except branch.
        encoded_text = tokenizer(transcribed_text, return_tensors="pt", padding=True)

        # Generate translation using the model
        translated_tokens = model.generate(**encoded_text)

        # Decode the translated tokens
        return tokenizer.decode(translated_tokens[0], skip_special_tokens=True)

    except Exception as e:
        print(f"An error occurred: {e}")
        return "Error in transcription or translation"

In [None]:
# Smoke test: run the transcription function on a sample file to make sure it works.
transcribed_text = transcribe_audio_original(audio_filepath='speech.mp3')

In [None]:
# Show the transcription produced by the smoke test.
print(transcribed_text)


In [None]:
# Translate the sample transcription ("en" -> "es") and display the result.
translated_text = translate(transcribed_text, src_lang="en", target_lang="es")
print("Translated text:", translated_text)

In [None]:
# Define function to translate text to speech for output
# Uses Google Text-to-speech

def text_to_speech(text, lang='en'):
    """Synthesize `text` with gTTS and save it to a temporary .mp3 file.

    Args:
        text: The text to speak.
        lang: Language code passed to gTTS. Defaults to 'en', which is the
            value this function previously hard-coded.

    Returns:
        Path of the .mp3 file that was written. The file is created with
        delete=False, so removing it is the caller's responsibility.

    Raises:
        Whatever gTTS raises while building or saving the speech; the
        temporary file is removed before a save error propagates.
    """
    tts = gTTS(text, lang=lang)

    # Reserve a unique path and close the handle right away: the original left
    # the handle open for the life of the kernel, and asked gTTS to write to
    # the same path while it was still open.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.mp3') as temp_file:
        output_path = temp_file.name

    try:
        tts.save(output_path)
    except Exception:
        # Do not leave an empty/partial file behind when synthesis fails.
        os.remove(output_path)
        raise
    return output_path

In [None]:
# Define text-to-speech function using Amazon Polly

def polly_text_to_speech(text, voice_id):
    """Synthesize `text` with Amazon Polly's neural engine into a temporary .mp3.

    Args:
        text: The text to speak. Empty or whitespace-only text is not sent to
            Polly.
        voice_id: Polly voice identifier passed through as `VoiceId`.

    Returns:
        Path of the .mp3 file that was written (created with delete=False, so
        the caller owns it), or None when there was no text, the response had
        no audio stream, or any error occurred.
    """
    # Nothing to synthesize: skip the service call and report "no audio".
    if not text or not text.strip():
        print("Error accessing Polly: no text to synthesize")
        return None

    try:
        # Initialize boto3 client for Polly
        polly_client = boto3.client('polly')

        # Request speech synthesis
        response = polly_client.synthesize_speech(
            Engine='neural',
            Text=text,
            OutputFormat='mp3',
            VoiceId=voice_id,
        )

        audio_stream = response.get("AudioStream")
        if audio_stream is None:
            return None

        # Read fully, then always release the underlying HTTP stream.
        try:
            audio_bytes = audio_stream.read()
        finally:
            audio_stream.close()

        # Save the audio to a temporary file and return its path
        with tempfile.NamedTemporaryFile(delete=False, suffix='.mp3') as audio_file:
            audio_file.write(audio_bytes)
            return audio_file.name
    except Exception as e:
        # Client calls raise botocore's ClientError/BotoCoreError, which do not
        # derive from boto3.exceptions.Boto3Error, so catching only Boto3Error
        # let service and credential failures escape. Catching Exception here
        # matches the other helpers in this notebook and keeps the documented
        # "None on error" contract.
        print(f"Error accessing Polly: {e}")
    return None  # Return None if there was an error
    

In [None]:
def combined_function(audio_filepath):
    """Run the full pipeline for one recording.

    Transcribes the audio, synthesizes the transcription with the "Gregory"
    voice, translates the transcription from "en" to "es", and synthesizes the
    translation with the "Mia" voice.

    Returns:
        A 4-tuple: (transcription, translation, path of the transcription
        audio, path of the translation audio).
    """
    source_text = transcribe_audio_original(audio_filepath)
    source_audio = polly_text_to_speech(source_text, "Gregory")

    target_text = translate(source_text, "en", "es")
    target_audio = polly_text_to_speech(target_text, "Mia")

    return source_text, target_text, source_audio, target_audio

In [None]:
# Gradio app wired to combined_function. For one recorded clip it shows:
# 1. the transcribed text and its translation
# 2. both texts played back as speech

input_audio = gr.Audio(
    type="filepath",
    label="click on microphone to record audio",
    # Waveform options customize the color of the wave seen when recording/playing.
    waveform_options=gr.WaveformOptions(
        show_controls=False,
        skip_length=2,
        waveform_progress_color="#0066B4",
        waveform_color="#01C6FF",
    ),
)

app = gr.Interface(
    title="Audio Transcription and Text to Speech",
    description="click on the microphone to record audio, then receive transcription in text and speech.",
    fn=combined_function,
    inputs=input_audio,
    # Order matches the tuple returned by combined_function.
    outputs=[
        gr.Textbox(label="Transcribed audio"),
        gr.Textbox(label="Translated text"),
        gr.Audio(label="English speech"),
        gr.Audio(label="Translated speech"),
    ],
)

# share=True is already enabled here; the original note ties it to running in
# Google Colab. Drop it if a public share link is not wanted.
app.launch(share=True, show_error=True)