In [None]:
!pip3 install boto3

In [None]:
!pip3 install azure-cognitiveservices-speech

In [None]:
!pip install google-cloud-texttospeech

In [1]:
import os
import re
import wave
import re
import time
import itertools

from getpass import getpass
from datetime import datetime
from tqdm.auto import tqdm
import IPython.display as ipd

import pandas as pd

import boto3
from boto3 import Session

import azure.cognitiveservices.speech as azure_speech

from google.cloud import texttospeech as gtts

In [2]:
from google.colab import drive
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


## Functions

In [3]:
def text2ascii(text: str) -> str:
  """Turn a phrase into a compact token usable in file and directory names.

  Punctuation is dropped, runs of spaces collapse into a single '-', and the
  sentence-final mark is kept as a suffix code:
  '.' -> '-D', '!' -> '-E', '?' -> '-Q' (no suffix for any other ending).

  NOTE: only the last character is inspected, so a combination such as '?!'
  is reduced to the code of its final mark. Word characters are matched with
  the regex word class on a str pattern, so non-ASCII letters and digits are
  kept unchanged; the result is ASCII only when the input words are.

  Args:
    text: original phrase; leading and trailing whitespace is ignored.

  Returns:
    The dash-separated phrase followed by the optional ending code.
  """
  # Surrounding whitespace would otherwise hide the final mark ("Hi. ") and
  # produce a leading/trailing '-'.
  text = text.strip()

  ending_codes = {'.': '-D', '!': '-E', '?': '-Q'}
  # text[-1:] is '' for an empty phrase, which maps to "no ending code".
  ending = ending_codes.get(text[-1:], '')

  # Remove everything that is neither a word character nor a space. Raw
  # strings keep the regex escapes from being read as string escapes.
  text = re.sub(r'[^\w ]', '', text)
  # Removing punctuation can leave several spaces in a row ("some - text")
  # or a space at the edge ("Alice ."), so trim again and collapse each run.
  text = re.sub(r'\s+', '-', text.strip())

  return text + ending

In [4]:
def get_audio_out_fn(ts: str, text: str, gender_code: str, voice_code: str, provider: str):
  """Compose the .wav file name for one synthesized phrase.

  The layout is '<ts>_<text>_<gender_code>_<voice_code>@<provider>.wav'.
  """
  name_template = '{}_{}_{}_{}@{}.wav'
  return name_template.format(ts, text, gender_code, voice_code, provider)

## Init data

In [5]:
# Run timestamp embedded in the output directory name and in every audio file
# name. '%f' is 6-digit microseconds, so [:-3] trims it to milliseconds.
ts = datetime.now().strftime('%Y%m%d-%H%M%S-%f')[:-3]

In [None]:
# root output directory
# The name embeds the run timestamp `ts`, so each run gets its own directory
# on the mounted Google Drive.
out_dp_root = f'/gdrive/MyDrive/tts_{ts}'
print(f'output data root: "{out_dp_root}"')
os.makedirs(out_dp_root, exist_ok=True)

In [7]:
# wakeup phrases
# Each phrase is built later as "<wakeup word>, <name>.".

wakeup_words = ['Yo']

# NOTE(review): only names_list_1 is used to build the phrases in the cells
# visible here; names_list_2 is defined but not referenced.
names_list_1 = ['Alice', 'James', 'Paul', 'Robert']
names_list_2 = ["Alice", "Angelica", "David", "Jason", "Jessica", "Julia", "Mary", "Max", "Paul", "Robert"]

In [None]:
# One record per "<wakeup word>, <name>." phrase, holding the original text
# and its file-name-safe form.
texts = [
    dict(orig=phrase, ascii=text2ascii(phrase))
    for phrase in (
        f'{ww}, {name}.'
        for ww, name in itertools.product(wakeup_words, names_list_1)
    )
]
print(f'len(texts): {len(texts)}')

# Turn the records into a dataframe with an explicit 'text_id' column
# (the former positional index) and save it to .csv
texts_df = pd.DataFrame(texts).rename_axis(index='text_id').reset_index()
out_fp = os.path.join(out_dp_root, 'texts.csv')
print(f'will save texts dataframe to: "{out_fp}"')
texts_df.to_csv(out_fp, index=False)

# the intermediate list is not needed once the dataframe exists
del texts

In [9]:
# subdirectories: <root>/<gender code>/<ascii phrase>
for text in texts_df['ascii']:
  for gender_dir in ('M', 'F'):
    os.makedirs(os.path.join(out_dp_root, gender_dir, text), exist_ok=True)

## Credentials

In [10]:
# All values are prompted for with getpass(), so nothing typed here is stored
# in the notebook source or echoed in its output.
AWS_ACCESS_KEY_ID = getpass('AWS_ACCESS_KEY_ID: ')
AWS_SECRET_ACCESS_KEY = getpass('AWS_SECRET_ACCESS_KEY: ')
AWS_REGION_NAME = getpass('AWS_REGION_NAME: ')

AZURE_SPEECH_KEY = getpass('AZURE_SPEECH_KEY: ')
AZURE_SERVICE_REGION = getpass('AZURE_SERVICE_REGION: ')

AWS_ACCESS_KEY_ID: ··········
AWS_SECRET_ACCESS_KEY: ··········
AWS_REGION_NAME: ··········
AZURE_SPEECH_KEY: ··········
AZURE_SERVICE_REGION: ··········


In [11]:
boto3_session = Session(
  aws_access_key_id = AWS_ACCESS_KEY_ID,
  aws_secret_access_key = AWS_SECRET_ACCESS_KEY,
  region_name = AWS_REGION_NAME
)

In [12]:
gtts_client = gtts.TextToSpeechClient.from_service_account_json(
    '/gdrive/MyDrive/keys/key.json'
)

# AWS

In [13]:
class AmazonTTS:
  """Text-to-speech helper built on Amazon Polly.

  Audio is requested from Polly as raw PCM and wrapped into a mono, 16-bit
  WAV file whose sample rate is SAMPLE_RATE_HZ.
  """

  # Polly is asked for PCM at this rate and save_bytes() declares the same
  # rate in the WAV header, so both read it from one place.
  SAMPLE_RATE_HZ = 16000
  SAMPLE_WIDTH_BYTES = 2  # 16-bit samples
  N_CHANNELS = 1          # mono

  def __init__(self, session: 'boto3.Session'):
    """Create a Polly client from an already configured boto3 session."""
    self.synthesizer = session.client('polly')

  @staticmethod
  def get_english_neural_voices() -> pd.DataFrame:
    """Return the hard-coded table of English neural Polly voices.

    Returns:
      DataFrame with columns 'language_code', 'voice_name' and 'gender'
      ('female' / 'male'), one row per voice. Summary counts are printed
      as a side effect.
    """
    # from: https://docs.aws.amazon.com/polly/latest/dg/ntts-voices-main.html
    voices_str = '''
    en-AU Olivia female
    en-GB Amy female
    en-GB Emma female
    en-GB Brian male
    en-GB Arthur male
    en-IN Kajal female
    en-NZ Aria female
    en-ZA Ayanda female
    en-US Ivy female
    en-US Joanna female
    en-US Kendra female
    en-US Kimberly female
    en-US Salli female
    en-US Joey male
    en-US Justin male
    en-US Kevin male
    en-US Matthew male
    '''
    colnames = ['language_code','voice_name','gender']
    # every line holds exactly three whitespace-separated fields
    rows = [line.split() for line in voices_str.strip().splitlines()]
    voices_df = pd.DataFrame(rows, columns=colnames)

    print(f'Found {voices_df.shape[0]} english neural voices_df')
    print(f'Distribution of gender: {voices_df["gender"].value_counts().to_dict()}')
    print(f'Distribution of english variants: {voices_df["language_code"].value_counts().to_dict()}')

    return voices_df

  @staticmethod
  def get_pure_name(language_code, voice_name) -> str:
    """Return '<language_code>-<voice_name>', the voice code used in file names."""
    return f'{language_code}-{voice_name}'

  def speak(
      self,
      text: str,
      language_code: str,
      voice_name: str,
      engine: str,
  ) -> bytes:
    """Synthesize `text` with Polly and return the raw PCM bytes.

    Args:
      text: text to synthesize.
      language_code: passed to Polly as LanguageCode.
      voice_name: passed to Polly as VoiceId.
      engine: 'neural' or 'standard'.

    Returns:
      The complete audio stream read into memory.

    Raises:
      AssertionError: if `engine` is not one of the two supported values.
      ValueError: if the response HTTP status code is not 200.
    """
    # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/polly.html#Polly.Client.synthesize_speech

    assert engine in ('neural', 'standard')

    response = self.synthesizer.synthesize_speech(
        Text=text, VoiceId=voice_name,
        Engine=engine, LanguageCode=language_code,
        OutputFormat='pcm',
        SampleRate=str(self.SAMPLE_RATE_HZ),
    )

    if response['ResponseMetadata']['HTTPStatusCode'] != 200:
      raise ValueError(response)

    # The stream wraps an open HTTP response; close it once the audio has
    # been read so the connection is released even if read() fails.
    audio_stream = response['AudioStream']
    try:
      return audio_stream.read()
    finally:
      audio_stream.close()

  def save_bytes(self, audiostream: bytes, out_fp: str, verbose=False):
    """Write raw PCM bytes to `out_fp` as a WAV file.

    Args:
      audiostream: PCM samples as returned by speak().
      out_fp: destination path of the .wav file.
      verbose: print the destination path before writing.
    """
    if verbose:
      print(f'Saving audio to: "{out_fp}"')
    with wave.open(out_fp, 'wb') as fout:
      # (nchannels, sampwidth, framerate, nframes, comptype, compname)
      # nframes is given as 0 here; the wave module fixes up the header
      # when the file is closed.
      fout.setparams((
          self.N_CHANNELS, self.SAMPLE_WIDTH_BYTES, self.SAMPLE_RATE_HZ,
          0, 'NONE', 'NONE'
      ))
      fout.writeframes(audiostream)

  def speak_and_save(
      self,
      text: str,
      language_code: str,
      voice_name: str,
      engine: str,
      out_fp: str,
      verbose=False
  ):
    """Synthesize `text` via speak() and store the result via save_bytes()."""
    audio_bytes = self.speak(text=text, language_code=language_code, voice_name=voice_name, engine=engine)
    self.save_bytes(audio_bytes, out_fp=out_fp, verbose=verbose)

In [14]:
aws_tts = AmazonTTS(session=boto3_session)

In [15]:
# Male voices first (descending 'gender'), then alphabetical by language code
# and voice name. The synthesis loop consumes the voices as a list of dicts.
aws_selected_voices = (
    aws_tts.get_english_neural_voices()
    .sort_values(
        by=['gender', 'language_code', 'voice_name'],
        ascending=[False, True, True],
    )
    .to_dict(orient='records')
)
print(len(aws_selected_voices))

Found 17 english neural voices_df
Distribution of gender: {'female': 11, 'male': 6}
Distribution of english variants: {'en-US': 9, 'en-GB': 4, 'en-AU': 1, 'en-IN': 1, 'en-NZ': 1, 'en-ZA': 1}
17


### Synthesize

In [16]:
# Synthesize every phrase with every selected Polly voice and store the audio
# as <root>/<gender code>/<ascii phrase>/<file name>.
for _, text in tqdm(texts_df.iterrows(), total=len(texts_df), desc='phrases'):
  for voice in tqdm(aws_selected_voices, desc='voices', leave=False):
    # 'female' / 'male' -> 'F' / 'M'
    gender_code = voice['gender'][0].upper()
    out_fn = get_audio_out_fn(
        ts,
        text['ascii'],
        gender_code,
        aws_tts.get_pure_name(voice['language_code'], voice['voice_name']),
        'aws',
    )
    out_fp = os.path.join(out_dp_root, gender_code, text['ascii'], out_fn)

    aws_tts.speak_and_save(
        text=text['orig'],
        language_code=voice['language_code'],
        voice_name=voice['voice_name'],
        engine='neural',
        out_fp=out_fp,
        verbose=False,
    )

phrases:   0%|          | 0/4 [00:00<?, ?it/s]

voices:   0%|          | 0/17 [00:00<?, ?it/s]

voices:   0%|          | 0/17 [00:00<?, ?it/s]

voices:   0%|          | 0/17 [00:00<?, ?it/s]

voices:   0%|          | 0/17 [00:00<?, ?it/s]

# Azure

In [17]:
class AzureTTS:
  """Wrapper around the Azure Speech SDK: lists voices and synthesizes text to audio files."""

  def __init__(self, azure_speech_key: str, azure_service_region: str):
    """Build the shared speech configuration from a key and a service region."""
    self.speech_config = azure_speech.SpeechConfig(
      subscription=azure_speech_key,
      region=azure_service_region
    )

    # set output audio format
    self.speech_config.set_speech_synthesis_output_format(
      azure_speech.SpeechSynthesisOutputFormat.Riff16Khz16BitMonoPcm
    )

  @staticmethod
  def _strip_locale_prefix(locale: str, locale_and_name: str) -> str:
    """Drop a leading '<locale>-' from a short voice name, if present."""
    prefix = f'{locale}-'
    if locale_and_name.startswith(prefix):
      return locale_and_name[len(prefix):]
    return locale_and_name

  def get_available_voices(self, locale: str) -> pd.DataFrame:
    """List the voices the service offers for `locale`.

    Args:
      locale: locale passed to the SDK; callers in this notebook pass ''
        and receive voices for every locale.

    Returns:
      DataFrame with columns 'long_name', 'locale', 'locale_and_name',
      'style_list' (stringified sorted list), 'voice_type', 'gender',
      'name' (short name without the locale prefix) and 'is_neural'.
      The columns are present even when no voice is returned.

    Raises:
      RuntimeError: if the SDK does not report VoicesListRetrieved.
    """
    synthesizer = azure_speech.SpeechSynthesizer(speech_config=self.speech_config, audio_config=None)
    result = synthesizer.get_voices_async(locale).get()

    if result.reason != azure_speech.ResultReason.VoicesListRetrieved:
      # A bare string cannot be raised; wrap the message in an exception.
      raise RuntimeError(f'Speech synthesis canceled; error details: {result.error_details}')

    print(f'Found {len(result.voices)} available voices for passed "{locale}" locale.')
    columns = ['long_name', 'locale', 'locale_and_name', 'style_list', 'voice_type', 'gender']
    # Explicit columns keep the frame usable when the voice list is empty.
    df = pd.DataFrame([{
      'long_name': y.name,
      'locale': y.locale,
      'locale_and_name': y.short_name,
      'style_list': str(sorted(y.style_list)),
      'voice_type': y.voice_type.name,
      'gender': y.gender.name
    } for y in result.voices], columns=columns)

    # Plain prefix removal instead of a regex built from the locale value.
    df['name'] = [
      self._strip_locale_prefix(voice_locale, short_name)
      for voice_locale, short_name in zip(df['locale'], df['locale_and_name'])
    ]
    df['is_neural'] = (df['voice_type'] == 'OnlineNeural')
    return df

  def get_english_neural_voices(self) -> pd.DataFrame:
    """Return the neural voices whose locale starts with 'en-'.

    Fetches all voices, prints summary counts, and returns the filtered
    frame with the same columns as get_available_voices().
    """
    all_voices_df = self.get_available_voices(locale='')

    selected_voices_df = all_voices_df[all_voices_df['locale'].str.startswith('en-')]
    print(f'\nFound {selected_voices_df.shape[0]} english voices')
    print(f'Distribution of neural/non-neural voice types:')
    print(selected_voices_df['is_neural'].value_counts().to_dict())

    selected_voices_df = selected_voices_df[selected_voices_df['is_neural']]
    print(f'\nFound {selected_voices_df.shape[0]} english neural voices')
    print(f'Distribution of gender: {selected_voices_df["gender"].value_counts().to_dict()}')
    print(f'Distribution of english variants: {selected_voices_df["locale"].value_counts().to_dict()}')

    return selected_voices_df

  def speak_and_save(
      self,
      text: str,
      voice_name: str,
      out_fp: str,
      verbose=False,
      wait: bool = False
  ):
    """Start synthesis of `text` into the audio file `out_fp`.

    Args:
      text: plain text to synthesize.
      voice_name: value assigned to speech_config.speech_synthesis_voice_name.
      out_fp: path of the audio file to write.
      verbose: print the destination path when exactly True.
      wait: when True, block until the SDK reports the outcome and raise
        RuntimeError unless synthesis completed. The default (False) keeps
        the fire-and-forget behaviour: the request is started but not
        awaited, so the file may be incomplete when this method returns
        and failures go unnoticed.

    Raises:
      RuntimeError: only with wait=True, when synthesis did not complete.
    """

    # https://docs.microsoft.com/en-us/python/api/azure-cognitiveservices-speech/azure.cognitiveservices.speech.speechsynthesizer

    # TODO: figure out how to use styles available for a voice name

    if verbose is True:
      print(f'Saving audio to: "{out_fp}"')

    self.speech_config.speech_synthesis_voice_name = voice_name
    audio_config = azure_speech.AudioConfig(filename=out_fp)

    synthesizer = azure_speech.SpeechSynthesizer(
        speech_config=self.speech_config, audio_config=audio_config
    )
    # synthesizer.speak_text(text) causes a crash. (swig::stop_iteration)
    # that's why the async function is used instead of the synchronous one
    pending = synthesizer.speak_text_async(text)
    if not wait:
      return

    # NOTE(review): waiting is opt-in because the synchronous call was seen
    # to crash in the original environment; confirm .get() is safe there.
    result = pending.get()
    if result.reason != azure_speech.ResultReason.SynthesizingAudioCompleted:
      details = getattr(result, 'cancellation_details', None)
      error_details = getattr(details, 'error_details', None)
      raise RuntimeError(
          f'Speech synthesis to "{out_fp}" did not complete '
          f'(reason: {result.reason}); error details: {error_details}'
      )

In [18]:
azure_tts = AzureTTS(azure_speech_key=AZURE_SPEECH_KEY, azure_service_region=AZURE_SERVICE_REGION)

In [19]:
# Male voices first (descending 'gender'), then alphabetical by short voice
# name. Only the fields the synthesis loop needs are kept, as a list of dicts.
azure_selected_voices = (
    azure_tts.get_english_neural_voices()
    .sort_values(by=['gender', 'locale_and_name'], ascending=[False, True])
    .loc[:, ['long_name', 'locale_and_name', 'gender']]
    .to_dict(orient='records')
)
print(len(azure_selected_voices))

Found 488 available voices for passed "" locale.

Found 78 english voices
Distribution of neural/non-neural voice types:
{True: 63, False: 15}

Found 63 english neural voices
Distribution of gender: {'Female': 34, 'Male': 29}
Distribution of english variants: {'en-US': 24, 'en-GB': 15, 'en-AU': 2, 'en-CA': 2, 'en-HK': 2, 'en-IE': 2, 'en-IN': 2, 'en-KE': 2, 'en-NG': 2, 'en-NZ': 2, 'en-PH': 2, 'en-SG': 2, 'en-TZ': 2, 'en-ZA': 2}
63


### Synthesize

In [20]:
# Pause (in seconds) after each request. AzureTTS.speak_and_save() starts the
# synthesis with speak_text_async() and does not wait for it by default;
# presumably this pause gives the SDK time to finish writing the file — TODO confirm.
timeout = 0.5

# Synthesize every phrase with every selected Azure voice and store the audio
# as <root>/<gender code>/<ascii phrase>/<file name>.
for _, text in tqdm(texts_df.iterrows(), total=texts_df.shape[0], desc='phrases'):
  for voice in tqdm(azure_selected_voices, desc='voices', leave=False):

    # 'Female' / 'Male' -> 'F' / 'M'
    gender_code = voice['gender'][0].upper()
    out_fn = get_audio_out_fn(
        ts=ts, 
        text=text['ascii'], 
        gender_code=gender_code, 
        voice_code=voice['locale_and_name'], 
        provider='azure'
    )
    out_fp = os.path.join(out_dp_root, gender_code, text['ascii'], out_fn)
  
    # the voice is selected by its long name; the short name only appears in the file name
    azure_tts.speak_and_save(
        text=text['orig'], voice_name=voice['long_name'], 
        out_fp=out_fp, verbose=False
    )
    time.sleep(timeout)

phrases:   0%|          | 0/4 [00:00<?, ?it/s]

voices:   0%|          | 0/63 [00:00<?, ?it/s]

voices:   0%|          | 0/63 [00:00<?, ?it/s]

voices:   0%|          | 0/63 [00:00<?, ?it/s]

voices:   0%|          | 0/63 [00:00<?, ?it/s]

# Google TTS

In [21]:
class GoogleTTS:
  """Wrapper around a Google Cloud Text-to-Speech client: lists voices and synthesizes text to audio files."""

  def __init__(self, tts_client: 'gtts.TextToSpeechClient'):
    """Keep a reference to an already authenticated client."""
    self.tts_client = tts_client

  def get_available_voices(self, language_code: str) -> pd.DataFrame:
    """List the voices the service offers for `language_code`.

    Args:
      language_code: passed to list_voices(); callers in this notebook
        pass '' and receive voices for every language.

    Returns:
      DataFrame with columns 'name', 'gender', 'natural_sample_rate_hertz',
      'n_language_codes', 'language_codes', 'primary_language_code' (first
      two dash-separated parts of the name) and 'voice_type' (third part,
      or None when the name has fewer than three parts). The columns are
      present even when no voice is returned.
    """
    response = self.tts_client.list_voices(language_code=language_code)
    voices = response.voices
    print(f'Found {len(voices)} available voices for passed "{language_code}" language code')

    columns = ['name', 'gender', 'natural_sample_rate_hertz', 'n_language_codes', 'language_codes']
    # Explicit columns keep the frame usable when the voice list is empty.
    voices_df = pd.DataFrame([{
        'name': v.name,
        'gender': v.ssml_gender.name,
        'natural_sample_rate_hertz': v.natural_sample_rate_hertz,
        'n_language_codes': len(v.language_codes),
        'language_codes': v.language_codes
    } for v in voices], columns=columns)

    # Names look like '<lang>-<region>-<voice type>-<index>'. Parse them
    # per row so that a name with a different number of parts does not
    # abort the whole listing.
    name_parts = [name.split('-') for name in voices_df['name']]
    voices_df['primary_language_code'] = pd.Series(
        ['-'.join(parts[:2]) for parts in name_parts],
        index=voices_df.index, dtype=object
    )
    voices_df['voice_type'] = pd.Series(
        [parts[2] if len(parts) > 2 else None for parts in name_parts],
        index=voices_df.index, dtype=object
    )

    return voices_df

  def get_english_neural_voices(self) -> pd.DataFrame:
    """Return the 'Neural2' and 'Wavenet' voices whose primary language code starts with 'en-'.

    Fetches all voices, prints summary counts, and returns the filtered
    frame with the same columns as get_available_voices().
    """
    all_voices_df = self.get_available_voices(language_code='')

    selected_voices_df = all_voices_df[
        all_voices_df['primary_language_code'].str.startswith('en-')
    ]
    print(f'\nFound {selected_voices_df.shape[0]} english voices')
    print(f'Distribution of voice types for english voices:')
    print(selected_voices_df['voice_type'].value_counts().to_dict())

    selected_voices_df = selected_voices_df[
        selected_voices_df['voice_type'].isin(['Neural2', 'Wavenet'])
    ]
    print(f'\nFound {selected_voices_df.shape[0]} english neural voices')
    print(f'Distribution of gender: {selected_voices_df["gender"].value_counts().to_dict()}')
    print(f'Distribution of english variants: {selected_voices_df["primary_language_code"].value_counts().to_dict()}')

    return selected_voices_df

  def speak_and_save(self, text: str, voice_name: str, out_fp: str, verbose=False):
    """Synthesize `text` with `voice_name` and write the audio to `out_fp`.

    Args:
      text: plain text to synthesize.
      voice_name: full voice name; its first two dash-separated parts are
        sent as the language code.
      out_fp: destination path; the returned audio content is written as-is.
      verbose: print the destination path when exactly True.
    """
    text_input = gtts.SynthesisInput(text=text)

    language_code = '-'.join(voice_name.split('-')[:2])
    voice_params = gtts.VoiceSelectionParams(
        language_code=language_code,
        name=voice_name
    )

    audio_config = gtts.AudioConfig(audio_encoding=gtts.AudioEncoding.LINEAR16, sample_rate_hertz=16000)

    # Synthesizes speech synchronously: receive results after all text input has been processed.
    # doc: https://cloud.google.com/python/docs/reference/texttospeech/latest/google.cloud.texttospeech_v1.services.text_to_speech.TextToSpeechClient#google_cloud_texttospeech_v1_services_text_to_speech_TextToSpeechClient_synthesize_speech
    response = self.tts_client.synthesize_speech(
        input=text_input, voice=voice_params, audio_config=audio_config
    )

    if verbose is True:
      print(f'Saving audio to: "{out_fp}"')
    with open(out_fp, 'wb') as out:
      out.write(response.audio_content)

In [22]:
google_tts = GoogleTTS(tts_client=gtts_client)

In [23]:
# Male voices first (descending 'gender'), then alphabetical by voice name.
# The voice list is fetched exactly once: querying it again after sorting
# would discard the ordering and repeat the API request and its printout.
google_selected_voices = (
    google_tts.get_english_neural_voices()
    .sort_values(by=['gender', 'name'], ascending=[False, True])
    .loc[:, ['name', 'gender']]
    .to_dict(orient='records')
)
print(len(google_selected_voices))

Found 337 available voices for passed "" language code

Found 55 english voices
Distribution of voice types for english voices:
{'Standard': 27, 'Wavenet': 23, 'Neural2': 5}

Found 28 english neural voices
Distribution of gender: {'FEMALE': 16, 'MALE': 12}
Distribution of english variants: {'en-US': 11, 'en-GB': 7, 'en-AU': 6, 'en-IN': 4}
Found 337 available voices for passed "" language code

Found 55 english voices
Distribution of voice types for english voices:
{'Standard': 27, 'Wavenet': 23, 'Neural2': 5}

Found 28 english neural voices
Distribution of gender: {'FEMALE': 16, 'MALE': 12}
Distribution of english variants: {'en-US': 11, 'en-GB': 7, 'en-AU': 6, 'en-IN': 4}
28


### Synthesize

In [24]:
# Synthesize every phrase with every selected Google voice and store the audio
# as <root>/<gender code>/<ascii phrase>/<file name>.
for _, text in tqdm(texts_df.iterrows(), total=len(texts_df), desc='phrases'):
  for voice in tqdm(google_selected_voices, desc='voices', leave=False):
    # 'FEMALE' / 'MALE' -> 'F' / 'M'
    gender_code = voice['gender'][0].upper()
    out_fn = get_audio_out_fn(
        ts, text['ascii'], gender_code, voice['name'], 'google'
    )
    out_fp = os.path.join(out_dp_root, gender_code, text['ascii'], out_fn)

    google_tts.speak_and_save(
        text=text['orig'],
        voice_name=voice['name'],
        out_fp=out_fp,
        verbose=False,
    )

phrases:   0%|          | 0/4 [00:00<?, ?it/s]

voices:   0%|          | 0/28 [00:00<?, ?it/s]

voices:   0%|          | 0/28 [00:00<?, ?it/s]

voices:   0%|          | 0/28 [00:00<?, ?it/s]

voices:   0%|          | 0/28 [00:00<?, ?it/s]

# Check that all audiofiles are saved in the same format

In [25]:
from glob import glob
import soundfile

In [None]:
# The synthesis loops store audio exactly two directory levels below the root
# (<gender code>/<ascii phrase>/), which is what this pattern matches.
# NOTE(review): glob() is called without recursive=True, so each '**' matches
# a single directory level, the same as '*'.
pat = os.path.join(out_dp_root, '**', '**', '*.wav')
audiofiles_fps = glob(pat)
print(f'found: {len(audiofiles_fps)} audiofiles using pattern: "{pat}"')

In [27]:
# Read the header of every saved file and record its sample rate, sample
# subtype and channel count next to the file path.
stats = []
for fp in tqdm(audiofiles_fps):
  audio_info = soundfile.info(fp)
  cur_stats = dict(
      sr=audio_info.samplerate,
      subtype=audio_info.subtype,
      n_channels=audio_info.channels,
      fp=fp,
  )
  stats.append(cur_stats)
# one row per audio file
stats = pd.DataFrame(stats)
print(stats.shape)

  0%|          | 0/432 [00:00<?, ?it/s]

(432, 4)


In [28]:
# Count the files per distinct (sr, subtype, n_channels) combination; the
# file path is dropped first so that only the format columns are grouped.
stats_final = stats.drop(columns='fp').value_counts().rename('n_audiofiles').to_frame()
print(f'number of audiofiles per each audio format:')
stats_final

number of audiofiles per each audio format:


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,n_audiofiles
sr,subtype,n_channels,Unnamed: 3_level_1
16000,PCM_16,1,432


In [29]:
# stats_final has one row per distinct audio format; anything other than
# exactly one row fails the check.
if stats_final.shape[0] != 1:
  raise ValueError('Multiple audio formats are present in the data')
print(f'All audiofiles share the same audio format')

All audiofiles share the same audio format


# EDA

## Compare TTS output for ".", "!", "?" and no punctuation at the end of a phrase

In [121]:
vn = azure_selected_voices[21]['long_name']
vn

'Microsoft Server Speech Text to Speech Voice (en-US, DavisNeural)'

In [122]:
# The same phrase with no final mark, '.', '!' and '?'. The file suffixes
# follow the -D / -E / -Q ending codes used by text2ascii().
for phrase, wav_fp in (
    ('Hello, Alice', 'tmp.wav'),
    ('Hello, Alice.', 'tmp-D.wav'),
    ('Hello, Alice!', 'tmp-E.wav'),
    ('Hello, Alice?', 'tmp-Q.wav'),
):
  azure_tts.speak_and_save(text=phrase, voice_name=vn, out_fp=wav_fp, verbose=True)

Saving audio to: "tmp.wav"
Saving audio to: "tmp-D.wav"
Saving audio to: "tmp-E.wav"
Saving audio to: "tmp-Q.wav"


In [129]:
ipd.Audio('tmp.wav', autoplay=True)

In [130]:
ipd.Audio('tmp-D.wav', autoplay=True)

In [127]:
ipd.Audio('tmp-E.wav', autoplay=True)

In [128]:
ipd.Audio('tmp-Q.wav', autoplay=True)

## Azure

#### Speech Synthesis Output Formats

In [30]:
# One row per member of the SDK's SpeechSynthesisOutputFormat enum.
format_names = pd.DataFrame({'raw': [x.name for x in azure_speech.SpeechSynthesisOutputFormat]})
print(f'format_names.shape: {format_names.shape}')

# Split each format name into a leading type, a sample rate, an unnamed
# middle part and two trailing capitalised chunks ('mono', 'ext').
# NOTE(review): the split is heuristic. In the displayed result
# 'Raw8Khz8BitMonoMULaw' ends up with '8BitMono' in the unnamed group and
# 'MU' / 'Law' in 'mono' / 'ext'; only 'type' and 'sr' are used afterwards.
parsed = format_names['raw'].str.extract(r'(?P<type>\w+?)(?P<sr>\d+\w+?)(\d+\w+)(?P<mono>[A-Z]+\w+)(?P<ext>[A-Z]\w+)$')
format_names = pd.concat([format_names, parsed], axis=1)

format_names

format_names.shape: (37, 1)


Unnamed: 0,raw,type,sr,2,mono,ext
0,Raw8Khz8BitMonoMULaw,Raw,8Khz,8BitMono,MU,Law
1,Riff16Khz16KbpsMonoSiren,Riff,16Khz,16Kbps,Mono,Siren
2,Audio16Khz16KbpsMonoSiren,Audio,16Khz,16Kbps,Mono,Siren
3,Audio16Khz32KBitRateMonoMp3,Audio,16Khz,32KBitRate,Mono,Mp3
4,Audio16Khz128KBitRateMonoMp3,Audio,16Khz,128KBitRate,Mono,Mp3
5,Audio16Khz64KBitRateMonoMp3,Audio,16Khz,64KBitRate,Mono,Mp3
6,Audio24Khz48KBitRateMonoMp3,Audio,24Khz,48KBitRate,Mono,Mp3
7,Audio24Khz96KBitRateMonoMp3,Audio,24Khz,96KBitRate,Mono,Mp3
8,Audio24Khz160KBitRateMonoMp3,Audio,24Khz,160KBitRate,Mono,Mp3
9,Raw16Khz16BitMonoTrueSilk,Raw,16Khz,16BitMono,True,Silk


In [31]:
format_names['sr'].value_counts()

16Khz      11
24Khz      11
8Khz        6
48Khz       5
22050Hz     2
44100Hz     2
Name: sr, dtype: int64

In [32]:
format_names[format_names['sr'] == '16Khz'].sort_values('type')

Unnamed: 0,raw,type,sr,2,mono,ext
2,Audio16Khz16KbpsMonoSiren,Audio,16Khz,16Kbps,Mono,Siren
3,Audio16Khz32KBitRateMonoMp3,Audio,16Khz,32KBitRate,Mono,Mp3
4,Audio16Khz128KBitRateMonoMp3,Audio,16Khz,128KBitRate,Mono,Mp3
5,Audio16Khz64KBitRateMonoMp3,Audio,16Khz,64KBitRate,Mono,Mp3
30,Audio16Khz16Bit32KbpsMonoOpus,Audio,16Khz,16Bit32Kbps,Mono,Opus
17,Ogg16Khz16BitMonoOpus,Ogg,16Khz,16Bit,Mono,Opus
9,Raw16Khz16BitMonoTrueSilk,Raw,16Khz,16BitMono,True,Silk
14,Raw16Khz16BitMonoPcm,Raw,16Khz,16Bit,Mono,Pcm
1,Riff16Khz16KbpsMonoSiren,Riff,16Khz,16Kbps,Mono,Siren
10,Riff16Khz16BitMonoPcm,Riff,16Khz,16Bit,Mono,Pcm


#### All available voices

In [33]:
azure_voices_all_df = azure_tts.get_available_voices(locale='')
print(f'df_azure_voices_all.shape: {azure_voices_all_df.shape}')
azure_voices_all_df.tail()

Found 488 available voices for passed "" locale.
df_azure_voices_all.shape: (488, 8)


Unnamed: 0,long_name,locale,locale_and_name,style_list,voice_type,gender,name,is_neural
483,Microsoft Server Speech Text to Speech Voice (...,zh-TW,zh-TW-HanHanRUS,[],OnlineStandard,Female,HanHanRUS,False
484,Microsoft Server Speech Text to Speech Voice (...,zh-TW,zh-TW-Yating,[],OnlineStandard,Female,Yating,False
485,Microsoft Server Speech Text to Speech Voice (...,zh-TW,zh-TW-Zhiwei,[],OnlineStandard,Male,Zhiwei,False
486,Microsoft Server Speech Text to Speech Voice (...,zu-ZA,zu-ZA-ThandoNeural,[],OnlineNeural,Female,ThandoNeural,True
487,Microsoft Server Speech Text to Speech Voice (...,zu-ZA,zu-ZA-ThembaNeural,[],OnlineNeural,Male,ThembaNeural,True


### Number of voices per English language variants (neural + non-neural)

In [34]:
# keep only the voices whose locale starts with 'en-'
x = azure_voices_all_df[azure_voices_all_df['locale'].str.startswith('en-')]
# locale x is_neural table of voice counts; missing combinations become 0
x = x.groupby(['locale', 'is_neural']).size().unstack().fillna(0).astype('int')
# append a 'total' row holding the per-column sums
x = pd.concat([x, x.sum(axis=0).to_frame().T.rename(index={0: 'total'})], axis=0)
x

is_neural,False,True
en-AU,2,2
en-CA,2,2
en-GB,3,15
en-HK,0,2
en-IE,1,2
en-IN,3,2
en-KE,0,2
en-NG,0,2
en-NZ,0,2
en-PH,0,2


### Non-neural voices that do not end with 'RUS'

In [35]:
pd.crosstab(azure_voices_all_df['is_neural'], azure_voices_all_df['name'].str.endswith('RUS').rename('endswith_rus'))

endswith_rus,False,True
is_neural,Unnamed: 1_level_1,Unnamed: 2_level_1
False,47,30
True,411,0


In [36]:
x = sorted(azure_voices_all_df[
    (~azure_voices_all_df['is_neural']) & 
    (~azure_voices_all_df['name'].str.endswith('RUS'))
]['locale_and_name'].tolist())
print(x)

['ar-EG-Hoda', 'ar-SA-Naayf', 'bg-BG-Ivan', 'cs-CZ-Jakub', 'de-AT-Michael', 'de-CH-Karsten', 'de-DE-Stefan', 'el-GR-Stefanos', 'en-AU-Catherine', 'en-CA-Linda', 'en-GB-George', 'en-GB-Susan', 'en-IE-Sean', 'en-IN-Heera', 'en-IN-Ravi', 'es-ES-Laura', 'es-ES-Pablo', 'es-MX-Raul', 'fr-CA-Caroline', 'fr-CH-Guillaume', 'fr-FR-Julie', 'fr-FR-Paul', 'he-IL-Asaf', 'hi-IN-Hemant', 'hi-IN-Kalpana', 'hr-HR-Matej', 'hu-HU-Szabolcs', 'id-ID-Andika', 'it-IT-Cosimo', 'ja-JP-Ayumi', 'ja-JP-Ichiro', 'ms-MY-Rizwan', 'pt-BR-Daniel', 'ro-RO-Andrei', 'ru-RU-Irina', 'ru-RU-Pavel', 'sk-SK-Filip', 'sl-SI-Lado', 'ta-IN-Valluvar', 'te-IN-Chitra', 'th-TH-Pattara', 'vi-VN-An', 'zh-CN-Kangkang', 'zh-CN-Yaoyao', 'zh-HK-Danny', 'zh-TW-Yating', 'zh-TW-Zhiwei']


In [37]:
azure_voices_all_df[azure_voices_all_df['is_neural']]['name'].str[-6:].value_counts()

Neural    411
Name: name, dtype: int64

## Google

### All available voices

In [38]:
google_voices_df = google_tts.get_available_voices(language_code='')
google_voices_df.head()

Found 337 available voices for passed "" language code


Unnamed: 0,name,gender,natural_sample_rate_hertz,n_language_codes,language_codes,primary_language_code,voice_type
0,ar-XA-Wavenet-A,FEMALE,24000,1,[ar-XA],ar-XA,Wavenet
1,ar-XA-Wavenet-B,MALE,24000,1,[ar-XA],ar-XA,Wavenet
2,ar-XA-Wavenet-C,MALE,24000,1,[ar-XA],ar-XA,Wavenet
3,ar-XA-Wavenet-D,FEMALE,24000,1,[ar-XA],ar-XA,Wavenet
4,bn-IN-Wavenet-A,FEMALE,24000,1,[bn-IN],bn-IN,Wavenet


In [39]:
google_voices_df['voice_type'].value_counts()

Standard    174
Wavenet     156
Neural2       7
Name: voice_type, dtype: int64

In [40]:
google_voices_df['n_language_codes'].value_counts()

1    337
Name: n_language_codes, dtype: int64

In [41]:
google_voices_df['natural_sample_rate_hertz'].value_counts()

24000    337
Name: natural_sample_rate_hertz, dtype: int64

In [42]:
google_voices_df['gender'].value_counts()

FEMALE    191
MALE      146
Name: gender, dtype: int64