In [36]:
import torch
from transformers import pipeline

device = "cuda:0" if torch.cuda.is_available() else "cpu"
pipe = pipeline(
    "automatic-speech-recognition", model="openai/whisper-base", device=device
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [49]:
#french split of VoxPopuli dataset
from datasets import load_dataset

dataset = load_dataset("facebook/voxpopuli", "fr", split="validation", streaming=True)


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [50]:
sample = next(iter(dataset))

In [40]:
#examine audio data 
from IPython.display import Audio

Audio(sample["audio"]["array"], rate=sample["audio"]["sampling_rate"])

In [41]:
#function to translate audio input to text
def translate(audio):
    #translate to any english (default)
    outputs = pipe(audio, max_new_tokens=256, generate_kwargs={"task": "translate"})
    #transcribe to any language
    #outputs = pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe", "language": "es"})
    return outputs["text"]

In [1]:
translate(sample["audio"].copy())


NameError: name 'translate' is not defined

In [22]:
#source text
sample["raw_text"]

"Je crois que les dispositions que le compromis propose pour la participation des travailleurs vont à l'encontre de l'objectif recherché, c'est à dire à l'encontre de la facilité et de l'efficacité de ce statut pour les petites et moyennes entreprises."

## Pre-trained SpeechT5 TTS model for English TTS

In [23]:
#Text to Speech 
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan

processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")

model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

In [24]:
# GPU accelerator device
model.to(device)
vocoder.to(device)

SpeechT5HifiGan(
  (conv_pre): Conv1d(80, 512, kernel_size=(7,), stride=(1,), padding=(3,))
  (upsampler): ModuleList(
    (0): ConvTranspose1d(512, 256, kernel_size=(8,), stride=(4,), padding=(2,))
    (1): ConvTranspose1d(256, 128, kernel_size=(8,), stride=(4,), padding=(2,))
    (2): ConvTranspose1d(128, 64, kernel_size=(8,), stride=(4,), padding=(2,))
    (3): ConvTranspose1d(64, 32, kernel_size=(8,), stride=(4,), padding=(2,))
  )
  (resblocks): ModuleList(
    (0): HifiGanResidualBlock(
      (convs1): ModuleList(
        (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
        (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,))
        (2): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,))
      )
      (convs2): ModuleList(
        (0-2): 3 x Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
      )
    )
    (1): HifiGanResidualBlock(
      (convs1): ModuleList(
        (0): Conv1d(256, 256,

In [25]:

# utterance dataset for  speaker embedding
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = embeddings_dataset[7306]["xvector"]
speaker_embeddings = torch.tensor(speaker_embeddings).unsqueeze(0)



In [26]:
#function text -> speech
# text -> SpeechT5 processor -> preprocessed text -> tokenized text (input ids) -> SpeechT5 model 
def synthesise(text):
    inputs = processor(text=text, return_tensors="pt")
    speech = model.generate_speech(
        #-> tokenized text (input ids) -> SpeechT5 model 
        #placing each on the accelerator device if available
        inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder
    )
    return speech.cpu()

In [27]:
from IPython.display import Audio
speech = synthesise("Hey there! This is a test!")

Audio(speech, rate=16000)

# STST demo

In [28]:
import numpy as np

target_dtype = np.int16
max_range = np.iinfo(target_dtype).max

In [51]:
sample_text = sample["audio"]
sample_text

{'path': 'dev_part_0/20090309-0900-PLENARY-13-fr_20090309-20:37:55_7.wav',
 'array': array([-0.00018311, -0.00024414,  0.        , ...,  0.00915527,
         0.03625488,  0.02032471]),
 'sampling_rate': 16000}

In [52]:
def speech_to_speech_translation(audio):
    translated_text = translate(audio)
    #show translated text 
    print(translated_text)
    translated_audio = synthesise(translated_text)
    # normalise audio array by the dynamic range of the target dtype
    # convert from the default NumPy dtype (float64) to the target dtype (int16)
    ## tensor -> array 
    ## -> maxrange (scale normalized wavelength value, ensures that the synthesized speech uses the full dynamic range of the target audio format )
    translated_audio = (translated_audio.numpy() * max_range).astype(np.int16)
    return translated_audio

In [53]:
translated_audio = speech_to_speech_translation(sample["audio"])
Audio(translated_audio, rate=16000)

 I believe that the proposals that the compromise offers for the participation of workers will meet the aim of looking for. That is, to meet the facility, the efficiency of this statue for small coins of the companies.
