## Text to Speech Save Audio File & Merge Audio File

Development Environment
<br><br>Microsoft Speech API (SAPI)
* Multiple Audio File
* Merge Audio File

<br>Google Text-to-Speech (gTTS)
* Microsoft Word
* Novel Harry Potter
* Poem 杜甫 <江村>

<br>Google Cloud Text to Speech
<br>Naver CLOVA Voice

### Development Environment

In [None]:
%pip install gTTS
%pip install pydub
%pip install comtypes
%pip install pypiwin32
%pip install docx2txt
%pip install --upgrade google-cloud-texttospeech

In [74]:
import os
import sys
import time
import html
import docx2txt
import urllib.request
import win32com.client
import comtypes.client
from gtts import gTTS
from pydub import AudioSegment
from IPython.display import Audio 
from google.cloud import texttospeech
from nltk.tokenize import sent_tokenize, word_tokenize

### Microsoft Speech API (SAPI)

#### Multiple Audio File

In [None]:
# Smoke test: dispatch the SAPI.SpVoice COM object and have it speak a
# short test phrase.
speaker = win32com.client.Dispatch("SAPI.SpVoice")
speaker.Speak("Hello World")

In [103]:
# Module-level COM objects shared by speak() and save_voice():
# the SAPI voice, and a SAPI file stream that save_voice() uses as the
# voice's output target.
speaker = win32com.client.Dispatch("SAPI.SpVoice")
filestream = win32com.client.Dispatch("SAPI.SpFileStream")

def speak(phrase):
    """Speak ``phrase`` with the module-level SAPI voice ``speaker``.

    Args:
        phrase: Text to synthesize.
    """
    # Use the COM method's declared casing (``Speak``), as the hello-world
    # cell does. Lowercase names only resolve with late-bound dispatch and
    # fail once win32com has generated (case-sensitive) early-bound wrappers.
    speaker.Speak(phrase)
    
def save_voice(phrase):
    """Synthesize ``phrase`` into a file named ``<phrase>.mp3``.

    The module-level ``filestream`` is opened for writing, installed as the
    output stream of the module-level ``speaker``, and closed again once
    synthesis finishes (or fails).

    Args:
        phrase: Text to synthesize; it is also used as the file name stem.
    """
    # 3 is SSFMCreateForWrite in SAPI's SpeechStreamFileMode enumeration.
    # NOTE(review): SAPI file streams write WAV data; the ".mp3" name is kept
    # because the other cells rebuild exactly this file name (and the merge
    # cell reads these files with from_wav).
    filestream.Open(".".join([phrase, "mp3"]), 3, False)
    try:
        speaker.AudioOutputStream = filestream
        speaker.Speak(phrase)
    finally:
        # Always release the file handle, even if synthesis raises.
        filestream.Close()
    # NOTE(review): speaker stays bound to this (now closed) stream; confirm
    # whether later speak() calls need the default audio output restored.

In [104]:
# Phrases to synthesize; save_voice() writes one audio file per phrase.
phrases = ["one", "two", "three"]

for phrase in phrases:
    save_voice(phrase)

In [105]:
# Play every generated clip inline, using the same file name that
# save_voice() produced for the phrase.
for phrase in phrases:
    tts_file = ".".join([phrase, "mp3"])
    display(Audio(tts_file, autoplay=True))

#### Merge Audio File

In [None]:
# Pause inserted between consecutive clips (pydub durations are in ms).
silent_gap = AudioSegment.silent(duration=10)

# Fail with a clear message instead of a NameError on `sound` at export time.
if not phrases:
    raise ValueError("phrases is empty: there is no audio to merge")

# The per-phrase files carry an ".mp3" name but are read as WAV here.
clips = [AudioSegment.from_wav(phrase + ".mp3") for phrase in phrases]

# Start from the first clip, then append "gap + clip" for each remaining one.
sound = clips[0]
for clip in clips[1:]:
    sound += silent_gap
    sound += clip

merge_audio_file = "one_two_three.mp3"
sound.export(merge_audio_file, format="mp3")

In [112]:
# Play the merged file inline in the notebook.
display(Audio(merge_audio_file, autoplay=True))

### Google Text-to-Speech (gTTS)

#### Microsoft Word

In [87]:
# Word documents to read. The path variables now name the files that are
# actually processed; previously they were unused and named other files.
en_file = "english.docx"
fr_file = "francais.docx"

# Extract the plain text of each document.
en_text = docx2txt.process(en_file)
fr_text = docx2txt.process(fr_file)

tts_en = gTTS(en_text, lang='en', slow=False)
tts_fr = gTTS(fr_text, lang='fr', slow=False)
tts_file = 'english_francais.mp3'

# Write both clips back to back into one file: English first, then French.
with open(tts_file, 'wb') as f:
    tts_en.write_to_fp(f)
    tts_fr.write_to_fp(f)

In [88]:
# Play the combined English/French file inline in the notebook.
display(Audio(tts_file, autoplay=True))

#### Novel &lt;Harry Potter and the Sorcerer's Stone&gt;

In [94]:
# Read the chapter text and split it into sentences.
en_file = "harry_potter_and_the_sorcerers_stone_1.txt"
with open(en_file, "r") as source:
    en_text = source.read()
en_list = sent_tokenize(en_text)

tts_file = 'harry_potter_and_the_sorcerers_stone_1.mp3'

# Synthesize one sentence at a time, appending every clip to the same file.
with open(tts_file, 'wb') as audio_out:
    for sentence in en_list:
        tts_en = gTTS(sentence, lang='en', slow=False)
        tts_en.write_to_fp(audio_out)

In [95]:
# Play the synthesized chapter inline in the notebook.
display(Audio(tts_file, autoplay=True))

#### Poem 杜甫 <江村>

In [92]:
# Read the poem; each non-empty line becomes one synthesized clip.
# NOTE(review): the file is opened with the platform default encoding -
# confirm it matches how the text file was saved.
ch_file = "dufu_the_riverside_village.txt"
with open(ch_file, "r") as f:
    ch_text = f.read()

# Drop blank lines (including the empty string a trailing newline leaves
# behind): gTTS refuses to synthesize empty text.
ch_list = [line for line in ch_text.split("\n") if line.strip()]

tts_file = 'dufu_the_riverside_village.mp3'

# Synthesize line by line, appending every clip to the same output file.
with open(tts_file, 'wb') as f:
    for ch in ch_list:
        tts_ch = gTTS(ch, lang='zh-cn', slow=False)
        tts_ch.write_to_fp(f)

In [93]:
# Play the synthesized poem inline in the notebook.
display(Audio(tts_file, autoplay=True))

### Google Cloud Text to Speech

In [None]:
def ssml_to_audio(ssml_text, outfile):
    """Synthesize SSML with Google Cloud Text-to-Speech and save it as MP3.

    Args:
        ssml_text: SSML markup to synthesize.
        outfile: Path of the audio file to write.
    """
    tts_client = texttospeech.TextToSpeechClient()

    # Request pieces: what to say, which voice says it, and the output format.
    request_parts = dict(
        input=texttospeech.SynthesisInput(ssml=ssml_text),
        voice=texttospeech.VoiceSelectionParams(
            language_code="en-US",
            ssml_gender=texttospeech.SsmlVoiceGender.MALE,
        ),
        audio_config=texttospeech.AudioConfig(
            audio_encoding=texttospeech.AudioEncoding.MP3
        ),
    )

    synthesized = tts_client.synthesize_speech(**request_parts)

    # Store the returned audio bytes, then report where they were written.
    with open(outfile, "wb") as audio_out:
        audio_out.write(synthesized.audio_content)
        print("Audio content written to file " + outfile)

def text_to_ssml(inputfile):
    """Wrap plain text in a ``<speak>`` element with a pause after each line.

    Args:
        inputfile: The plain text to convert. Despite its name, this is the
            text itself; the function does not open a file.

    Returns:
        An SSML string: the HTML-escaped text inside ``<speak>...</speak>``,
        with a two-second ``<break>`` inserted after every newline.
    """
    # Escape first so characters such as '<' and '&' in the text are not
    # mistaken for SSML markup ('<' -> '&lt;', '&' -> '&amp;').
    safe_text = html.escape(inputfile)

    # Re-joining the lines with "newline + break" puts a pause after each
    # line break while keeping the newline itself.
    pause = '\n<break time="2s"/>'
    body = pause.join(safe_text.split("\n"))

    return "<speak>" + body + "</speak>"



# Demo: convert a sample text to SSML and synthesize it to "test.mp3".
# Note that text_to_ssml() HTML-escapes its input, so the SSML tags inside
# this sample are passed on as literal text rather than as markup.
text = """Here are <say-as interpret-as="characters">SSML</say-as> samples.
  I can pause <break time="3s"/>.
  I can play a sound"""

ssml = text_to_ssml(text)
ssml_to_audio(ssml, "test.mp3")

### Naver CLOVA Voice

In [None]:
# urllib.parse is used below; import it explicitly instead of relying on
# urllib.request having imported it as a side effect.
import urllib.parse

# API credentials: replace the placeholders with real values before running.
client_id = "your_client_id"
client_secret = "your_client_secret"

# Named voice_name rather than `speaker` so this cell does not overwrite the
# SAPI voice object that speak() and save_voice() rely on.
voice_name = "jinho"
speed = 0

# Form-encoded request body; the text is percent-encoded for transport.
text = urllib.parse.quote("안녕")
data = "speaker=" + voice_name + "&speed=" + str(speed) + "&text=" + text

url = "https://naveropenapi.apigw.ntruss.com/voice/v1/tts"
request = urllib.request.Request(url)

request.add_header("X-NCP-APIGW-API-KEY-ID", client_id)
request.add_header("X-NCP-APIGW-API-KEY", client_secret)

# The context manager closes the HTTP response once the body has been read.
with urllib.request.urlopen(request, data=data.encode('utf-8')) as response:
    status = response.getcode()
    response_body = response.read()

if status == 200:
    # File name: <YYYYMMDD_HHMMSS>_<voice>_<speed>_.mp3
    file_name = time.strftime("%Y%m%d_%H%M%S", time.localtime()) + "_" + \
                voice_name + "_" + str(speed) + "_"

    with open(file_name + ".mp3", 'wb') as f:
        f.write(response_body)
else:
    # Report unexpected statuses instead of silently writing nothing.
    print("Error Code: " + str(status))

### Reference

<b>Official Site</b>
<br>[Google Cloud: Set up Application Default Credentials](https://cloud.google.com/docs/authentication/provide-credentials-adc#how-to)
<br>[Naver CLOVA Voice](https://www.ncloud.com/product/aiService/clovaVoice)

<br><br><b>Stackoverflow</b>
<br>Simon Peverett
<br>[Saving Text to Speech Python](https://stackoverflow.com/questions/49871252/saving-text-to-speech-python)
<br><br>Jiaaro
<br>[Python library to split and join mp3 files](https://stackoverflow.com/questions/2952309/python-library-to-split-and-join-mp3-files)

<br><br><b>Wikidocs</b>
<br>[공학자를 위한 Python 6.3 음성합성](https://wikidocs.net/15213)

<br><br><b>Github</b>
<br>dev-sngwn
<br>[텍스트를 음성 mp3로 간단하게 변환하기 (With Naver Cloud Platform)](https://dev-sngwn.github.io/2020-02-16-tts-step-by-step/)
<br><br>jungwoon
<br>[Google Application Default Credentials 사용하기](https://jungwoon.github.io/google%20cloud/2018/01/11/Google-Application-Default-Credential.html)