In [1]:
'''
Speaker model - handling multiple speakers

To let the TTS model differentiate between multiple speakers, create a speaker embedding for each example.
The speaker embedding is an additional model input that captures a particular speaker's voice characteristics.


Dataset format -
Use a SpeechT5Processor object to tokenize the input text and to convert the target audio into a log-mel spectrogram.
Each example should also include the speaker embedding as an additional input.

Padding -
- Audio inputs are padded by SpeechT5FeatureExtractor's pad().
- Text inputs are padded by SpeechT5Tokenizer's pad().

'''

'\nSpeaker Model - For multiple speakers \nTo enable the TTS model to differentiate between multiple speakers, you’ll need to create a speaker embedding for each example. \nThe speaker embedding is an additional input into the model that captures a particular speaker’s voice characteristics. \n\n\nDataset format - \n\n'

In [3]:
from transformers import SpeechT5Processor

# Shared processor for the "microsoft/speecht5_tts" checkpoint. It exposes both a
# text tokenizer (processor.tokenizer) and an audio feature extractor
# (processor.feature_extractor).
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")

# Valid input combinations when collating a batch with processor.pad().
# These are kept as comments rather than a bare string literal, because a trailing
# string expression is echoed by the notebook as this cell's output.
#   - input_ids only
#   - input_values only
#   - labels only, either log-mel spectrograms or text tokens
#   - input_ids and log-mel spectrogram labels
#   - input_values and text labels



'\ninput_ids only\ninput_values only\nlabels only, either log-mel spectrograms or text tokens\ninput_ids and log-mel spectrogram labels\ninput_values and text labels\n'

In [23]:
# Pull the two sub-components off the processor so each one can be inspected and
# called on its own in the cells that follow.
tokenizer, feature_extractor = processor.tokenizer, processor.feature_extractor

In [24]:
# Display the tokenizer's repr to inspect its configuration: vocabulary size,
# model_max_length, padding/truncation side and the special / added tokens.
tokenizer

SpeechT5Tokenizer(name_or_path='microsoft/speecht5_tts', vocab_size=79, model_max_length=600, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	79: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=True, special=True),
	80: AddedToken("<ctc_blank>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
}

In [25]:
# Display the feature extractor's repr to inspect its configuration, e.g. the
# sampling rate, number of mel bins, padding value and reduction factor.
feature_extractor

SpeechT5FeatureExtractor {
  "do_normalize": false,
  "feature_extractor_type": "SpeechT5FeatureExtractor",
  "feature_size": 1,
  "fmax": 7600,
  "fmin": 80,
  "frame_signal_scale": 1.0,
  "hop_length": 16,
  "mel_floor": 1e-10,
  "num_mel_bins": 80,
  "padding_side": "right",
  "padding_value": 0.0,
  "processor_class": "SpeechT5Processor",
  "reduction_factor": 2,
  "return_attention_mask": true,
  "sampling_rate": 16000,
  "win_function": "hann_window",
  "win_length": 64
}

In [17]:
import torch 
# Minimal batch in the list-of-feature-dicts layout passed to tokenizer.pad():
# a single sequence of four token ids.
# With only one sequence there is nothing to pad, so the result keeps the ids
# unchanged and gains an all-ones attention_mask.
input_ids = [dict(input_ids=torch.tensor([1, 2, 3, 4]))]
tokenizer.pad(input_ids, padding=True)

{'input_ids': tensor([[1, 2, 3, 4]]), 'attention_mask': tensor([[1, 1, 1, 1]])}

In [28]:
# Tokenize one sentence and pad it out with padding="max_length". No explicit
# max_length is given, so the padded id list is far longer than the sentence itself
# (presumably up to the tokenizer's model_max_length - TODO confirm).
# NOTE(review): the whole padded list is echoed as cell output; consider slicing
# or summarising it before sharing the notebook.
tokenizer(
    text="This is a string",
    padding="max_length",
)

{'input_ids': [4, 32, 11, 10, 12, 4, 10, 12, 4, 7, 4, 12, 6, 13, 10, 9, 21, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

  labels = torch.tensor([[[-4.6002865  -4.979547   -4.5412726  -5.4348593  -5.4099646
  labels = torch.tensor([[[-4.6002865  -4.979547   -4.5412726  -5.4348593  -5.4099646
  labels = torch.tensor([[[-4.6002865  -4.979547   -4.5412726  -5.4348593  -5.4099646
  labels = torch.tensor([[[-4.6002865  -4.979547   -4.5412726  -5.4348593  -5.4099646
  labels = torch.tensor([[[-4.6002865  -4.979547   -4.5412726  -5.4348593  -5.4099646
  labels = torch.tensor([[[-4.6002865  -4.979547   -4.5412726  -5.4348593  -5.4099646
  labels = torch.tensor([[[-4.6002865  -4.979547   -4.5412726  -5.4348593  -5.4099646


TypeError: list indices must be integers or slices, not float

In [None]:
# NOTE(review): this cell has not been executed and calls the extractor with no
# arguments. The extractor's __call__ presumably needs a waveform (`audio` and/or
# `audio_target`) together with `sampling_rate` - TODO supply real input before running.
feature_extractor()