## **Text To Speech Model Selection**

### **Criteria of Selection**

1. **English & Arabic Support**
2. **Performance (Manually)**
3. **Online vs Offline**
4. **Response Time**
5. **Cost**

In [None]:
import os
import time

import pyttsx3
import torch
import torchaudio
import vosk
from gtts import gTTS
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
from ibm_watson import TextToSpeechV1
from IPython.display import Audio
from transformers import VitsModel, AutoTokenizer, AutoProcessor, SeamlessM4Tv2Model
from vosk_tts import Model, Synth

In [5]:
# Arabic benchmark passage (three sentences, including dates written with digits) used as the
# Arabic input for the engines compared in this notebook.
text_ar = " كان محمد نجيب أول رئيس لمصر. فقد تولى رئاسة مصر في 18 يونيو 1953 بعد إعلان الجمهورية وإلغاء النظام الملكي. كانت فترة رئاسته قصيرة، حيث شغل المنصب حتى 14 نوفمبر 1954."
# English benchmark passage (a single long sentence) used as the English input for the engines.
text_en = 'Normal Distribution - The return on investment for a stock over the next year can be modeled using a normal distribution if the ROI follows a bell-shaped curve and is influenced by many random factors.'

--------------------------------------------------

### **1. PYTTSX3**

1. **English & Arabic Support:**
   - **Support English Only**
2. **Performance (Manually):**
   - **Good**
3. **Online vs Offline:**
   - **Offline**
4. **Response Time**
   -  **11s**
5. **Cost**
   - **Free**

**English**

In [None]:
# Time pyttsx3 on the English sample. perf_counter() is a monotonic, high-resolution clock meant
# for measuring intervals; time.time() can jump when the system clock is adjusted.
start = time.perf_counter()

engine = pyttsx3.init()
engine.say(text_en)
# The measured interval covers engine start-up plus everything runAndWait() does before returning.
engine.runAndWait()

end = time.perf_counter()

# Elapsed seconds; left as the last expression so the notebook displays it.
exec_time = end - start
exec_time

10.742548942565918

In [None]:
def TTS_pyttsx3(text):
    """Speak ``text`` through a freshly initialised pyttsx3 engine.

    Args:
        text: The text to be spoken.

    Returns:
        None.
    """
    speaker = pyttsx3.init()
    # Queue the utterance, then let the engine process its queue.
    speaker.say(text)
    speaker.runAndWait()

**Arabic**

In [None]:
# List every voice the pyttsx3 engine exposes on this machine, to see which languages are usable.
engine = pyttsx3.init()
installed_voices = engine.getProperty('voices')

for voice in installed_voices:
    print(voice)

<Voice id=HKEY_LOCAL_MACHINE\SOFTWARE\Microsoft\Speech\Voices\Tokens\TTS_MS_EN-US_DAVID_11.0
          name=Microsoft David Desktop - English (United States)
          languages=[]
          gender=None
          age=None>
<Voice id=HKEY_LOCAL_MACHINE\SOFTWARE\Microsoft\Speech\Voices\Tokens\TTS_MS_EN-US_ZIRA_11.0
          name=Microsoft Zira Desktop - English (United States)
          languages=[]
          gender=None
          age=None>


- Note: pyttsx3 cannot speak Arabic here, because the only voices installed on this machine (listed above) are English.

-----------------------------------------------------------------------

### **2. GTTS**

1. **English & Arabic Support**
   - **Support English & Arabic**
2. **Performance (Manually)**
   - **Good**
3. **Online vs Offline**
   - **Online**
4. **Response Time**
   -  **4.5s**
5. **Cost**
   - **Free**

**English**

In [None]:
# Time gTTS on the English sample. perf_counter() is a monotonic, high-resolution clock meant for
# measuring intervals; time.time() can jump when the system clock is adjusted.
start = time.perf_counter()

gtts_obj = gTTS(text=text_en, lang='en', slow=False)
gtts_obj.save("text.mp3")
# NOTE(review): os.system("text.mp3") only opens a player where the shell has a file association
# for .mp3 (e.g. Windows); the call is inside the timed region, so it affects exec_time.
os.system("text.mp3")

end = time.perf_counter()

# Elapsed seconds; left as the last expression so the notebook displays it.
exec_time = end - start
exec_time

7.166924238204956

In [None]:
Audio(filename="text.mp3", rate=44100)

**Arabic**

In [None]:
# Time gTTS on the Arabic sample. perf_counter() is a monotonic, high-resolution clock meant for
# measuring intervals; time.time() can jump when the system clock is adjusted.
start = time.perf_counter()

language = 'ar'
gtts_obj = gTTS(text=text_ar, lang=language, slow=False)
gtts_obj.save("text.mp3")
# NOTE(review): os.system("text.mp3") only opens a player where the shell has a file association
# for .mp3 (e.g. Windows); the call is inside the timed region, so it affects exec_time.
os.system("text.mp3")

end = time.perf_counter()

# Elapsed seconds; left as the last expression so the notebook displays it.
exec_time = end - start
exec_time

4.296618700027466

**Integration**

In [None]:
def TTS_gtts(text, lang):
    """Synthesise ``text`` with gTTS and play the result inline in the notebook.

    Args:
        text: The text to be spoken.
        lang: gTTS language code, e.g. ``'en'`` or ``'ar'``.

    Returns:
        None. The audio is written to ``text.mp3`` in the working directory (overwritten on
        every call) and an auto-playing audio widget is displayed in the cell output.
    """
    # Local import: the notebook's import cell only brings in ``Audio``.
    from IPython.display import display

    gtts_obj = gTTS(text=text, lang=lang, slow=False)
    gtts_obj.save("text.mp3")
    # The previous os.system("text.mp3") only worked where the shell has a file association for
    # .mp3 (Windows); elsewhere the shell just reports an unknown command. The inline player
    # works wherever the notebook front-end runs.
    display(Audio(filename="text.mp3", autoplay=True))

In [None]:
TTS_gtts(text_en, 'en')
TTS_gtts(text_ar, 'ar')

------------------------------------------------------------------------

### **3. Coqui TTS**

- https://github.com/coqui-ai/TTS
- https://huggingface.co/coqui/XTTS-v2

1. **English & Arabic Support:**
   - **Support English & Arabic**
2. **Performance (Manually):**
   - **Very Good**
3. **Online vs Offline:**
   - **Offline**
4. **Response Time**
   -  **High: 111s**
5. **Cost**
   - **Free**

#### **Under Tuning !!!!!!!!**

In [None]:
pip install TTS

In [None]:
import torch
from TTS.api import TTS

In [None]:
# Use the GPU when torch reports one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Instantiate the multilingual XTTS v2 model and move it to `device`. The recorded output below
# shows a licence confirmation prompt and a ~1.87G model download on first use.
# NOTE: the notebook-level name `tts` is rebound to the IBM Watson client in the Watson section
# further down, so the Coqui cells must be run before that section (or this cell re-run).
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)

 > You must confirm the following:
 | > "I have purchased a commercial license from Coqui: licensing@coqui.ai"
 | > "Otherwise, I agree to the terms of the non-commercial CPML: https://coqui.ai/cpml" - [y/n]
 | | > y
 > Downloading model to /root/.local/share/tts/tts_models--multilingual--multi-dataset--xtts_v2


100%|█████████▉| 1.86G/1.87G [00:40<00:00, 39.0MiB/s]
100%|██████████| 1.87G/1.87G [00:41<00:00, 45.4MiB/s]
100%|██████████| 4.37k/4.37k [00:00<00:00, 4.59kiB/s]
 37%|███▋      | 132k/361k [00:00<00:00, 1.19MiB/s]
100%|██████████| 361k/361k [00:00<00:00, 383kiB/s] 
100%|██████████| 32.0/32.0 [00:01<00:00, 28.1iB/s]
 94%|█████████▍| 7.33M/7.75M [00:00<00:00, 16.9MiB/s]

 > Model's license - CPML
 > Check https://coqui.ai/cpml.txt for more info.
 > Using model: xtts


100%|██████████| 7.75M/7.75M [00:15<00:00, 16.9MiB/s]

**English**

In [None]:
# Time Coqui XTTS on the English sample. perf_counter() is a monotonic, high-resolution clock
# meant for measuring intervals; time.time() can jump when the system clock is adjusted.
start = time.perf_counter()

# NOTE(review): the output is named "text.mp3", but Coqui appears to write WAV data regardless of
# the extension (the integration helper below uses "output.wav") — confirm before relying on
# the file type.
tts.tts_to_file(
    text=text_en,
    file_path="text.mp3",
    speaker="Ana Florence",
    language="en",
    split_sentences=True
)
# NOTE(review): only opens a player where the shell has a file association for .mp3.
os.system("text.mp3")

end = time.perf_counter()

# Elapsed seconds; left as the last expression so the notebook displays it.
exec_time = end - start
exec_time

 > Text splitted to sentences.
['Normal Distribution - The return on investment for a stock over the next year can be modeled using a normal distribution if the ROI follows a bell-shaped curve and is influenced by many random factors.']
 > Processing time: 111.50940752029419
 > Real-time factor: 7.433555954090138


111.5654525756836

In [None]:
Audio(filename="text.mp3", rate=44100)

**Arabic**

In [None]:
# Time Coqui XTTS on the Arabic sample. perf_counter() is a monotonic, high-resolution clock
# meant for measuring intervals; time.time() can jump when the system clock is adjusted.
start = time.perf_counter()

# NOTE(review): the output is named "text.mp3", but Coqui appears to write WAV data regardless of
# the extension (the integration helper below uses "output.wav") — confirm before relying on
# the file type.
tts.tts_to_file(
    text=text_ar,
    file_path="text.mp3",
    speaker="Ana Florence",
    language="ar",
    split_sentences=True
)

# NOTE(review): only opens a player where the shell has a file association for .mp3.
os.system("text.mp3")

end = time.perf_counter()

# Elapsed seconds; left as the last expression so the notebook displays it.
exec_time = end - start
exec_time

 > Text splitted to sentences.
['كان محمد نجيب أول رئيس لمصر.', 'فقد تولى رئاسة مصر في 18 يونيو 1953 بعد إعلان الجمهورية وإلغاء النظام الملكي.', 'كانت فترة رئاسته قصيرة، حيث شغل المنصب حتى 14 نوفمبر 1954.']
 > Processing time: 143.11889219284058
 > Real-time factor: 6.215868497786341


143.20614194869995

In [None]:
Audio(filename="text.mp3", rate=44100)

**Trials To Make The Model Faster**

In [None]:
import io
from pydub import AudioSegment
import simpleaudio as sa
from TTS.api import TTS


def stream_tts(text, speaker="Ana Florence", language="ar"):
    """Synthesise ``text`` with the Coqui model into memory and play it without a temp file.

    Args:
        text: The text to be spoken.
        speaker: Name of the built-in XTTS speaker voice to use.
        language: Language code passed to the model (e.g. ``"ar"`` or ``"en"``).

    Returns:
        None. Blocks until playback has finished.

    Notes:
        Relies on the notebook-level ``tts`` being the Coqui ``TTS`` instance, and on
        ``simpleaudio`` having a local audio output device (NOTE(review): this is presumably
        not the case on a hosted runtime such as Colab — confirm).
    """
    # Render the speech into an in-memory buffer instead of a file on disk.
    # NOTE(review): assumes tts_to_file accepts a file-like object for file_path — confirm
    # against the installed TTS version.
    audio_buffer = io.BytesIO()
    tts.tts_to_file(text=text, file_path=audio_buffer, speaker=speaker, language=language)
    audio_buffer.seek(0)

    # The buffer was previously decoded as "mp3". Coqui's writer produces WAV data, so decode it
    # as WAV.
    audio = AudioSegment.from_file(audio_buffer, format="wav")

    # play_buffer expects raw PCM frames. Passing an exported WAV file would feed the container
    # header in as samples, so hand over the decoded sample data directly.
    play_obj = sa.play_buffer(
        audio.raw_data,
        num_channels=audio.channels,
        bytes_per_sample=audio.sample_width,
        sample_rate=audio.frame_rate,
    )
    play_obj.wait_done()

stream_tts(text_ar)


- It didn't work

In [None]:
!git clone https://huggingface.co/coqui/XTTS-v2

Cloning into 'XTTS-v2'...
remote: Enumerating objects: 161, done.[K
remote: Counting objects: 100% (161/161), done.[K
remote: Compressing objects: 100% (86/86), done.[K
remote: Total 161 (delta 69), reused 161 (delta 69), pack-reused 0 (from 0)[K
Receiving objects: 100% (161/161), 2.29 MiB | 9.33 MiB/s, done.
Resolving deltas: 100% (69/69), done.
Filtering content: 100% (4/4), 1.94 GiB | 18.06 MiB/s, done.


In [None]:
import os
import time
import torch
import torchaudio
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts

# Streaming-inference experiment with the XTTS v2 checkpoint cloned by the `git clone` cell above.
# NOTE: this rebinds the notebook-level names `config` and `model`.
print("Loading model...")
config = XttsConfig()
# NOTE(review): "/content/XTTS-v2" is presumably the clone directory on a Colab runtime — adjust
# when running elsewhere.
config.load_json("/content/XTTS-v2/config.json")
model = Xtts.init_from_config(config)
model.load_checkpoint(config, checkpoint_dir="/content/XTTS-v2")

print("Computing speaker latents...")
# "reference.wav" is not created anywhere in the visible notebook; a reference recording of the
# target speaker has to be placed in the working directory first.
gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=["reference.wav"])

print("Inference...")
# t0 marks the start of inference so the latency of the first chunk can be reported below.
t0 = time.time()
chunks = model.inference_stream(
    "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.",
    "en",
    gpt_cond_latent,
    speaker_embedding
)

# Collect the chunks as they are yielded, logging the time to the first one and each chunk's length.
wav_chuncks = []
for i, chunk in enumerate(chunks):
    if i == 0:
        print(f"Time to first chunck: {time.time() - t0}")
    print(f"Received chunk {i} of audio length {chunk.shape[-1]}")
    wav_chuncks.append(chunk)
# Concatenate all chunks and save them as one file; 24000 is the sample rate passed to torchaudio.
wav = torch.cat(wav_chuncks, dim=0)
torchaudio.save("xtts_streaming.wav", wav.squeeze().unsqueeze(0).cpu(), 24000)

Loading model...


In [None]:
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts

# Rebuild the XTTS model from the cloned checkpoint directory, this time passing eval=True when
# loading the checkpoint. NOTE: rebinds the notebook-level names `config` and `model`.
config = XttsConfig()
config.load_json("/content/XTTS-v2/config.json")
model = Xtts.init_from_config(config)
model.load_checkpoint(config, checkpoint_dir="/content/XTTS-v2", eval=True)

# One-shot synthesis of a fixed English sentence, conditioned on a reference speaker clip.
# NOTE(review): the speaker_wav path is not created anywhere in the visible notebook — point it at
# an existing reference recording before running. `outputs` is not used further in this cell.
outputs = model.synthesize(
    "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.",
    config,
    speaker_wav="/data/TTS-public/_refclips/3.wav",
    gpt_cond_len=3,
    language="en",
)

**Integration**

In [None]:
def coque_tts(text, lang):
    """Synthesise ``text`` with the Coqui XTTS model and play the result inline in the notebook.

    Args:
        text: The text to be spoken.
        lang: Language code passed to the model, e.g. ``'en'`` or ``'ar'``.

    Returns:
        None. The audio is written to ``output.wav`` in the working directory (overwritten on
        every call) and an auto-playing audio widget is displayed in the cell output.

    Notes:
        Relies on the notebook-level ``tts`` being the Coqui ``TTS`` instance. A later section
        rebinds ``tts`` to the IBM Watson client, so call this before running that section.
    """
    # Local import: the notebook's import cell only brings in ``Audio``.
    from IPython.display import display

    tts.tts_to_file(
        text=text,
        file_path="output.wav",
        speaker="Ana Florence",
        language=lang,
        split_sentences=True,
    )
    # The previous os.system("output.wav") only worked where the shell has a file association for
    # .wav (Windows); elsewhere the shell just reports an unknown command. The inline player
    # works wherever the notebook front-end runs.
    display(Audio(filename="output.wav", autoplay=True))

In [None]:
coque_tts(text_en, 'en')
coque_tts(text_ar, 'ar')

----------------------------

### **4. VOSK**

- https://alphacephei.com/vosk/lm
- https://alphacephei.com/vosk/models
- https://github.com/alphacep/vosk-tts
- https://huggingface.co/alphacep/vosk-tts-ru-multi
- https://pypi.org/project/vosk-tts/

1. **English & Arabic Support**
   - **Does not support English or Arabic.**
   - **Russian only: vosk-tts**
   - **Investigating whether the Russian vosk-tts library can be adapted for English and Arabic:**
      - #### **Still under research**
2. **Performance (Manually)**
   - **.......**
3. **Online vs Offline**
   - **Offline**
4. **Response Time**
   -  **......**
5. **Cost**
   - **Free**

**English**

In [None]:
model = vosk.Model("C:\\Users\\maham\\.cache\\vosk\\vosk-model-en-us-0.22")

In [None]:
# Attempt to build a vosk-tts synthesiser from the locally cached "vosk-model-en-us-0.22" directory.
# NOTE(review): a filesystem path is passed as `model_name`; check the vosk_tts `Model` signature to
# see whether a path keyword should be used instead.
# NOTE(review): the same directory is loaded with `vosk.Model` in the previous cell, so it is
# presumably a speech-recognition model rather than a TTS model — confirm.
# NOTE: the absolute, user-specific Windows path only exists on the author's machine.
model = Model(model_name="C:\\Users\\maham\\.cache\\vosk\\vosk-model-en-us-0.22")
synth = Synth(model)

**Arabic**

---------------------------------------

### **5. Watson TTS**

1. **English & Arabic Support:**
   - **Support English Only**
2. **Performance (Manually):**
   - **Very Good**
3. **Online vs Offline:**
   - **Online**
4. **Response Time**
   -  **11s**
5. **Cost**
   - **Not Free**

In [None]:
# Watson credentials are read from the environment so that real secrets never have to be typed
# into (and saved with) the notebook. The original placeholders remain as fallbacks, so the
# cell still runs unchanged when the variables are not set.
key = os.environ.get('TEXT_TO_SPEECH_APIKEY', 'key')
url = os.environ.get('TEXT_TO_SPEECH_URL', 'url')

In [None]:
# Build the Watson Text to Speech client from the API key and service URL defined in the cell above.
# NOTE: this rebinds the notebook-level name `tts`, which the Coqui section uses for its model;
# re-run the Coqui model-loading cell before calling the Coqui helpers after this point.
authenticator = IAMAuthenticator(key)
tts = TextToSpeechV1(authenticator=authenticator)
tts.set_service_url(url)

**English**

In [None]:
# Time Watson TTS on the English sample. perf_counter() is a monotonic, high-resolution clock
# meant for measuring intervals; time.time() can jump when the system clock is adjusted.
start = time.perf_counter()

# Make the request before opening the file: opening 'text.mp3' in 'wb' mode first would truncate
# it even when the request then fails, leaving an empty file behind.
res = tts.synthesize(text_en, accept='audio/mp3', voice='en-US_AllisonV3Voice').get_result()
with open('text.mp3', 'wb') as audio_file:
    audio_file.write(res.content)
# NOTE(review): only opens a player where the shell has a file association for .mp3.
os.system('text.mp3')

end = time.perf_counter()

# Elapsed seconds; left as the last expression so the notebook displays it.
exec_time = end - start
exec_time

9.756030797958374

In [None]:
Audio("text.mp3", rate=16000)

**Arabic**
- There is no Arabic voice (model) available.

In [None]:
# Arabic voice names that were tried: 'ar-MS_OmarVoice' , 'ar-MS_LailaVoice'
# Make the request before opening the file: opening 'text.mp3' in 'wb' mode first would truncate
# the previously generated audio even when the request fails (the note above says no Arabic
# voice is available).
res = tts.synthesize(text_ar, accept='audio/mp3', voice='ar-MS_LailaVoice').get_result()
with open('text.mp3', 'wb') as audio_file:
    audio_file.write(res.content)
# NOTE(review): only opens a player where the shell has a file association for .mp3.
os.system('text.mp3')

In [None]:
import json
voices = tts.list_voices().get_result()
print(json.dumps(voices, indent=2))

{
  "voices": [
    {
      "name": "en-US_LisaExpressive",
      "language": "en-US",
      "gender": "female",
      "description": "Lisa: American English female voice. Dnn E2E technology with conversational and expressive capabilities.",
      "customizable": true,
      "supported_features": {
        "custom_pronunciation": true,
        "voice_transformation": false
      },
      "url": "https://api.au-syd.text-to-speech.watson.cloud.ibm.com/instances/690f160a-7989-45b0-8628-7d826ed35c83/v1/voices/en-US_LisaExpressive"
    },
    {
      "name": "en-US_AllisonVoice",
      "language": "en-US",
      "gender": "female",
      "description": "Allison: American English female voice.",
      "customizable": true,
      "supported_features": {
        "custom_pronunciation": true,
        "voice_transformation": true
      },
      "url": "https://api.au-syd.text-to-speech.watson.cloud.ibm.com/instances/690f160a-7989-45b0-8628-7d826ed35c83/v1/voices/en-US_AllisonVoice"
    },
    {


------------------------------

### **6. MMS TTS Arabic**
- https://huggingface.co/facebook/mms-tts-ara

1. **English & Arabic Support**
   - **Arabic Only**
2. **Performance (Manually)**
   - **Very Bad**
3. **Online vs Offline**
   - **Offline**
4. **Response Time**
   -  **17s**
5. **Cost**
   - **Not Free**

In [None]:
# Time MMS-TTS on the Arabic sample. perf_counter() is a monotonic, high-resolution clock meant
# for measuring intervals; time.time() can jump when the system clock is adjusted.
# NOTE: the timed region includes loading the model and tokenizer (from_pretrained), not only the
# synthesis itself. The cell also rebinds the notebook-level name `model`.
start = time.perf_counter()

model = VitsModel.from_pretrained("facebook/mms-tts-ara")
tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-ara")
inputs = tokenizer(text_ar, return_tensors="pt")

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    output = model(**inputs).waveform

end = time.perf_counter()
# Elapsed seconds; left as the last expression so the notebook displays it.
exec_time = end - start
exec_time

Some weights of the model checkpoint at facebook/mms-tts-ara were not used when initializing VitsModel: ['flow.flows.0.wavenet.in_layers.0.weight_g', 'flow.flows.0.wavenet.in_layers.0.weight_v', 'flow.flows.0.wavenet.in_layers.1.weight_g', 'flow.flows.0.wavenet.in_layers.1.weight_v', 'flow.flows.0.wavenet.in_layers.2.weight_g', 'flow.flows.0.wavenet.in_layers.2.weight_v', 'flow.flows.0.wavenet.in_layers.3.weight_g', 'flow.flows.0.wavenet.in_layers.3.weight_v', 'flow.flows.0.wavenet.res_skip_layers.0.weight_g', 'flow.flows.0.wavenet.res_skip_layers.0.weight_v', 'flow.flows.0.wavenet.res_skip_layers.1.weight_g', 'flow.flows.0.wavenet.res_skip_layers.1.weight_v', 'flow.flows.0.wavenet.res_skip_layers.2.weight_g', 'flow.flows.0.wavenet.res_skip_layers.2.weight_v', 'flow.flows.0.wavenet.res_skip_layers.3.weight_g', 'flow.flows.0.wavenet.res_skip_layers.3.weight_v', 'flow.flows.1.wavenet.in_layers.0.weight_g', 'flow.flows.1.wavenet.in_layers.0.weight_v', 'flow.flows.1.wavenet.in_layers.1.wei

11.595572710037231

In [None]:
Audio(output, rate=model.config.sampling_rate)

----------------------------------------

### **7. seamless-m4t-v2-large**
- https://huggingface.co/facebook/seamless-m4t-v2-large

- **SeamlessM4T models support the tasks of:**
    - **Speech-to-speech translation (S2ST)**
    - **Speech-to-text translation (S2TT)**
    - **Text-to-speech translation (T2ST)**
    - **Text-to-text translation (T2TT)**
    - **Automatic speech recognition (ASR)**

- **It is a translation model, so I used it as a TTS model by "translating" text to speech within the same language (source language = target language).**
- **Drawbacks:**
    - **With Arabic input, numbers such as years (e.g. 2024) are read incorrectly and in English.**
    - **Translation quality between English and Arabic is not good enough.**

1. **English & Arabic Support**
   - **English and Arabic**
2. **Performance (Manually)**
   - **Not Good**
3. **Online vs Offline**
   - **Offline**
4. **Response Time**
   -  **Very High: 55s**
5. **Cost**
   - **Free**

In [6]:
# Load the SeamlessM4T v2 (large) processor and model from the Hugging Face Hub. The recorded
# output below shows two weight shards (5.00G and 4.24G) being fetched.
# NOTE: rebinds the notebook-level name `model`, which earlier sections also assign.
processor = AutoProcessor.from_pretrained("facebook/seamless-m4t-v2-large")
model = SeamlessM4Tv2Model.from_pretrained("facebook/seamless-m4t-v2-large")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/1.78k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/19.7k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.17M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.07k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.34k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/2.72k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/211k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.24G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/9.91M [00:00<?, ?B/s]

**English**

In [8]:
# Time SeamlessM4T on the English sample, used as TTS by generating speech in the source language.
# perf_counter() is a monotonic, high-resolution clock meant for measuring intervals; time.time()
# can jump when the system clock is adjusted.
start = time.perf_counter()

text_inputs = processor(text = text_en, src_lang="eng", return_tensors="pt")
# Take the first element of the generate() output, move it to the CPU and flatten it to a NumPy
# array that can be passed to Audio().
audio_array_from_text = model.generate(**text_inputs, tgt_lang="eng")[0].cpu().numpy().squeeze()

end = time.perf_counter()
# Elapsed seconds; left as the last expression so the notebook displays it.
exec_time = end - start
exec_time

55.61756491661072

In [11]:
Audio(audio_array_from_text, rate=16000)

**Arabic**

In [13]:
# Time SeamlessM4T on the Arabic sample, used as TTS by generating speech in the source language.
# perf_counter() is a monotonic, high-resolution clock meant for measuring intervals; time.time()
# can jump when the system clock is adjusted.
start = time.perf_counter()

text_inputs = processor(text = text_ar, src_lang="arb", return_tensors="pt")
# Take the first element of the generate() output, move it to the CPU and flatten it to a NumPy
# array that can be passed to Audio().
audio_array_from_text = model.generate(**text_inputs, tgt_lang="arb")[0].cpu().numpy().squeeze()

end = time.perf_counter()
# Elapsed seconds; left as the last expression so the notebook displays it.
exec_time = end - start
exec_time

62.702211141586304

In [14]:
Audio(audio_array_from_text, rate=16000)

---------------------

### **8. Others**

### **Azure TTS**
- https://learn.microsoft.com/en-us/azure/ai-services/speech-service/get-started-text-to-speech?tabs=windows%2Cterminal&pivots=programming-language-python

In [None]:
from azure.cognitiveservices.speech import SpeechConfig, SpeechSynthesizer

def text_to_speech_azure(text):
    """Speak ``text`` with the Azure Speech service.

    The subscription key and region are the placeholder strings "key" and "region"; replace
    them with real values before running.

    Args:
        text: The text to be spoken.

    Returns:
        None. The synthesis result is obtained but not inspected.
    """
    azure_config = SpeechConfig(subscription="key", region="region")
    # Keep the synthesizer bound to a name for the whole duration of the asynchronous call.
    azure_synth = SpeechSynthesizer(speech_config=azure_config)
    pending = azure_synth.speak_text_async(text)
    result = pending.get()

text_to_speech_azure(text_en)

--------------------------------------

### **Realtime TTS**

- https://github.com/KoljaB/RealtimeTTS

In [None]:
from RealtimeTTS import TextToAudioStream, SystemEngine

# RealtimeTTS smoke test with the English sample.
# NOTE(review): SystemEngine presumably wraps the operating system's built-in voices — confirm
# against the RealtimeTTS documentation. This cell rebinds the notebook-level name `engine`.
engine = SystemEngine()
stream = TextToAudioStream(engine)
stream.feed(text_en)
# NOTE(review): play_async() presumably starts playback in the background and returns at once, so
# the cell may finish before the audio does — confirm.
stream.play_async()

---------------------------------------

### **Conclusion**
- **I think Google Text-to-Speech (gTTS) is the best library for now, taking all of the selection criteria into consideration.**