In [1]:
# pip install boto3

In [2]:
# pip install awscli

In [21]:
# pip install sentencepiece

Collecting sentencepiece
  Obtaining dependency information for sentencepiece from https://files.pythonhosted.org/packages/a2/f6/587c62fd21fc988555b85351f50bbde43a51524caafd63bc69240ded14fd/sentencepiece-0.2.0-cp311-cp311-win_amd64.whl.metadata
  Downloading sentencepiece-0.2.0-cp311-cp311-win_amd64.whl.metadata (8.3 kB)
Downloading sentencepiece-0.2.0-cp311-cp311-win_amd64.whl (991 kB)
   ---------------------------------------- 0.0/991.5 kB ? eta -:--:--
   ---------------------------------------- 10.2/991.5 kB ? eta -:--:--
   --- ------------------------------------ 92.2/991.5 kB 1.3 MB/s eta 0:00:01
   ------- -------------------------------- 184.3/991.5 kB 1.6 MB/s eta 0:00:01
   -------------------- ------------------- 512.0/991.5 kB 3.2 MB/s eta 0:00:01
   --------------------------------- ------ 839.7/991.5 kB 4.1 MB/s eta 0:00:01
   ---------------------------------------- 991.5/991.5 kB 4.8 MB/s eta 0:00:00
Installing collected packages: sentencepiece
Successfully installed 

In [1]:
import gradio as gr
from transformers import pipeline, AutoTokenizer, TFAutoModelForSeq2SeqLM
from dotenv import load_dotenv
import os
import subprocess
import torch
#Google Text to Speech
from gtts import gTTS
import tempfile

import boto3




In [4]:
# Load environment variables via python-dotenv before reading any settings.
load_dotenv()

# LLM settings: the model name is fixed here; the API key is read from the
# environment and is None when OPENAI_API_KEY is not set.
OPENAI_MODEL = "gpt-3.5-turbo"
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

In [5]:
# Whisper pipeline shared by all calls; created on first use so the model
# weights are loaded once instead of on every transcription request.
_transcription_pipeline = None


# This function transcribes audio to text using Whisper in the original language it was spoken
def transcribe_audio_original(audio_filepath):
    """Transcribe an audio file to text with the openai/whisper-large model.

    Args:
        audio_filepath: Path of the audio file passed to the speech-recognition
            pipeline.

    Returns:
        The ``'text'`` field of the pipeline result, or the string
        ``"Error in transcription"`` if building or running the pipeline raises.
    """
    global _transcription_pipeline
    try:
        # Build the pipeline lazily and keep it for later calls.
        if _transcription_pipeline is None:
            _transcription_pipeline = pipeline(
                "automatic-speech-recognition", model="openai/whisper-large"
            )
        transcription_result = _transcription_pipeline(audio_filepath)
        return transcription_result['text']
    except Exception as e:
        # Best effort: report the problem and hand back a sentinel string.
        print(f"an error occured: {e}")
        return "Error in transcription"

In [2]:
from transformers import pipeline
from transformers import MarianMTModel, MarianTokenizer

# Loaded (tokenizer, model) pairs keyed by model name, so each language pair
# is initialised only once per kernel session.
_translation_models = {}


# This function translates already-transcribed text into the specified language
def translate(transcribed_text, src_lang="en", target_lang="es"):
    """Translate text with a Helsinki-NLP opus-mt MarianMT model.

    Args:
        transcribed_text: The text to translate.
        src_lang: Source language code used in the model name.
        target_lang: Target language code used in the model name.

    Returns:
        The decoded translation, or the string
        ``"Error in transcription or translation"`` if loading the model or
        generating the translation raises.
    """
    try:
        # Define the model and tokenizer (reused when this pair was loaded before)
        model_name = f"Helsinki-NLP/opus-mt-{src_lang}-{target_lang}"
        if model_name not in _translation_models:
            _translation_models[model_name] = (
                MarianTokenizer.from_pretrained(model_name),
                MarianMTModel.from_pretrained(model_name),
            )
        tokenizer, model = _translation_models[model_name]

        # Tokenize the function's own argument; the previous code read an
        # unrelated global named `text`.
        encoded_text = tokenizer(transcribed_text, return_tensors="pt", padding=True)

        # Generate translation using the model
        translated_tokens = model.generate(**encoded_text)

        # Decode the first generated sequence
        translated_text = tokenizer.decode(translated_tokens[0], skip_special_tokens=True)

        return translated_text

    except Exception as e:
        print(f"An error occurred: {e}")
        return "Error in transcription or translation"

In [14]:
# Smoke test: run the transcription function on a sample recording.
transcribed_text = transcribe_audio_original(audio_filepath='speech.mp3')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [15]:
# Show the transcription produced by the previous cell.
print(transcribed_text)


 Hello, how are you? My name is Gregory and I need to say more words to find out how is sound.


In [16]:
# Translate the transcription from English to Spanish and print the result.
# (Alternative sample input: "trying to see if we can translate this into Spanish")
translated_text = translate(transcribed_text, src_lang="en", target_lang="es")
print("Translated text:", translated_text)



Translated text: Hola, ¿cómo estás? Mi nombre es Gregory y necesito decir más palabras para averiguar cómo suena.


In [20]:
# Define function to translate text to speech for output
# Uses Google Text-to-speech

def text_to_speech(text, lang='en'):
    """Synthesize ``text`` to an MP3 file with Google Text-to-Speech.

    Args:
        text: Text to speak.
        lang: Language code passed to gTTS. Defaults to ``'en'``, which is what
            this function always used before the parameter existed.

    Returns:
        Path of a temporary ``.mp3`` file containing the audio. The file is
        created with ``delete=False``, so the caller is responsible for
        removing it.
    """
    tts = gTTS(text, lang=lang)
    # Reserve a unique path, then close our handle before gTTS writes to it so
    # no file descriptor is left open.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.mp3') as temp_file:
        output_path = temp_file.name
    tts.save(output_path)
    return output_path

In [25]:
# Define text-to-speech function using Amazon Polly

def polly_text_to_speech(text, voice_id):
    """Synthesize ``text`` to an MP3 file with Amazon Polly's neural engine.

    Args:
        text: Text to speak.
        voice_id: Polly voice identifier, passed as ``VoiceId``.

    Returns:
        Path of a temporary ``.mp3`` file holding the audio (created with
        ``delete=False``, so it is not removed automatically), or ``None`` if
        the request fails or the response has no ``AudioStream``.
    """
    try:
        # Initialize boto3 client for Polly
        polly_client = boto3.client('polly')

        # Request speech synthesis
        response = polly_client.synthesize_speech(
            Engine='neural',
            Text=text,
            OutputFormat='mp3',
            VoiceId=voice_id
        )

        # Save the audio to a temporary file and return its path
        if "AudioStream" in response:
            audio_stream = response['AudioStream']
            try:
                audio_bytes = audio_stream.read()
            finally:
                # Release the underlying HTTP stream once the bytes are read.
                audio_stream.close()
            with tempfile.NamedTemporaryFile(delete=False, suffix='.mp3') as audio_file:
                audio_file.write(audio_bytes)
                return audio_file.name
    except Exception as e:
        # Service and credential errors raised through botocore do not derive
        # from boto3.exceptions.Boto3Error, so catch broadly (as the other
        # helpers in this notebook do) to honour the "None on error" contract.
        print(f"Error accessing Polly: {e}")
    return None  # Return None if there was an error
    

In [28]:
def combined_function (audio_filepath):
    """Transcribe a recording, then speak it and its Spanish translation.

    Args:
        audio_filepath: Path of the recorded audio file.

    Returns:
        A 4-tuple ``(transcription, translation, speech, translated_speech)``:
        the transcribed text, its en->es translation, and the Polly audio
        results for each (voices "Gregory" and "Mia" respectively).
    """
    source_text = transcribe_audio_original(audio_filepath)
    # Speak the transcription before translating, then speak the translation.
    source_audio = polly_text_to_speech(source_text, "Gregory")
    spanish_text = translate(source_text, "en", "es")
    spanish_audio = polly_text_to_speech(spanish_text, "Mia")
    return (source_text, spanish_text, source_audio, spanish_audio)

In [29]:
# Gradio app: record audio, then show what combined_function returns
# (two text values and two audio values, in that order).

# Microphone input handed to the function as a file path.
input_audio = gr.Audio(
    type="filepath",
    label="click on microphone to record audio",
    # Waveform options customize the color of the wave seen when recording/playing.
    waveform_options=gr.WaveformOptions(
        show_controls=False,
        skip_length=2,
        waveform_color="#01C6FF",
        waveform_progress_color="#0066B4",
    ),
)

# One output component per value returned by combined_function, in order.
app = gr.Interface(
    title="Audio Transcription and Text to Speech",
    description="click on the microphone to record audio, then receive transcription in text and speech.",
    fn=combined_function,
    inputs=input_audio,
    outputs=[
        gr.Textbox(label="Transcribed audio"),
        gr.Textbox(label="Translated text"),
        gr.Audio(label="English speech"),
        gr.Audio(label="Translated speech"),
    ],
)

app.launch(show_error=True) #share=True) #uncomment share=True in google colab

Running on local URL:  http://127.0.0.1:7866

To create a public link, set `share=True` in `launch()`.




Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Combined function: transcribe the audio, translate the transcription into the
# selected language, and synthesize the translation with a matching Polly voice.

def combined_function (audio_filepath, language_option):
    """Transcribe a recording, translate it, and speak the translation.

    Args:
        audio_filepath: Path of the recorded audio file.
        language_option: Target language label ("English", "Spanish",
            "French", "German" or "Chinese"). A list/tuple is accepted and its
            first item is used; ``None`` or an unknown label falls back to
            English.

    Returns:
        A 3-tuple ``(transcribed_text, translated_text, speech_file_path)``.
        For English the translated text is the transcription itself.
    """
    # Tolerate a list-valued selection by using its first entry.
    if isinstance(language_option, (list, tuple)):
        language_option = language_option[0] if language_option else None

    # Map each language label to (opus-mt target code, Polly voice ID).
    # polly_text_to_speech requests the neural engine, so every voice here is
    # one documented by AWS as neural-capable; Conchita, Celine and Marlene are
    # standard-only.
    language_settings = {
        "English": ("en", "Joanna"),
        "Spanish": ("es", "Lucia"),
        "French": ("fr", "Lea"),
        "German": ("de", "Vicki"),
        "Chinese": ("zh", "Zhiyu"),
    }
    target_lang, voice_id = language_settings.get(language_option, ("en", "Joanna"))

    transcribed_text = transcribe_audio_original(audio_filepath)

    # NOTE(review): the source language is assumed to be English, as in the
    # en->es demo earlier in this notebook - confirm for other speakers.
    if target_lang == "en":
        translated_text = transcribed_text
    else:
        translated_text = translate(transcribed_text, "en", target_lang)

    # Speak the text that matches the chosen voice's language.
    speech_file_path = polly_text_to_speech(translated_text, voice_id)
    return transcribed_text, translated_text, speech_file_path


In [None]:
def test_function(audio_filepath, language):
    return str(language)  # This will show what `language` is being received as.


In [None]:
# Create Gradio app to:
# 1. transcribe spoken audio to text
# 2. show the second text value returned by combined_function
# 3. output speech using the voice for the selected language

languages = ["English", "Spanish", "French", "German", "Chinese"]
input_audio = gr.Audio(label="Click on the microphone to record audio", type="filepath",
                 # Waveform options customize the wave shown when recording/playing.
                 waveform_options=gr.WaveformOptions(
                     waveform_color="#01C6FF",
                     waveform_progress_color="#0066B4",
                     skip_length=2,
                     show_controls=False,
                     )
)
dropdown = gr.Dropdown(languages, label="Select Language")

# combined_function(audio_filepath, language_option) returns three values
# (text, text, speech file path), so the interface needs three output
# components in that order; with only two, Gradio reports an output mismatch.
app = gr.Interface(
    fn=combined_function,
    inputs=[input_audio, dropdown],
    outputs=[
        gr.Textbox(label="Transcribed audio"),
        gr.Textbox(label="Translated text"),
        gr.Audio(label="Text to speech output")],
    title="Audio Transcription and Text to Speech",
    description="Select a language, click on the microphone to record audio, then receive transcription in text and speech.")
    
app.launch(show_error=True) #share=True) #uncomment share=True in google colab

In [None]:
# Create Gradio app to:
# 1. transcribe spoken audio to text
# 2. show the second text value returned by combined_function
# 3. output speech using the voice for the selected language

languages = ["English", "Spanish", "French", "German", "Chinese"]
input_audio = gr.Audio(label="Click on the microphone to record audio", type="filepath",
                 # Waveform options customize the wave shown when recording/playing.
                 waveform_options=gr.WaveformOptions(
                     waveform_color="#01C6FF",
                     waveform_progress_color="#0066B4",
                     skip_length=2,
                     show_controls=False,
                     ))
# Reuse the list above instead of repeating the same choices literally.
dropdown = gr.Dropdown(choices=languages, label="Select Language")

# combined_function(audio_filepath, language_option) returns three values
# (text, text, speech file path), so the interface needs three output
# components in that order; with only two, Gradio reports an output mismatch.
app = gr.Interface(
    fn=combined_function,
    inputs=[input_audio, dropdown],
    outputs=[
        gr.Textbox(label="Transcribed audio"),
        gr.Textbox(label="Translated text"),
        gr.Audio(label="Text to speech output")],
    title="Audio Transcription and Text to Speech",
    description="Select a language, click on the microphone to record audio, then receive transcription in text and speech.")
    
app.launch(show_error=True) #share=True) #uncomment share=True in google colab