In [1]:
from dotenv import load_dotenv
import os 

load_dotenv() 

key = os.getenv('key')
endpoint = os.getenv('endpoint')
region = os.getenv('region')

print(f"Endpoint: {endpoint}")
print(f"Region: {region}")

Endpoint: https://magcogserviceseastus.cognitiveservices.azure.com/
Region: eastus


In [2]:
import azure.cognitiveservices.speech as speechsdk

In [3]:
from IPython.display import Audio, display

## Integrate personal voice in your application
You need to use speech synthesis markup language (SSML) to use personal voice in your application. SSML is an XML-based markup language that provides a standard way to mark up text for the generation of synthetic speech. SSML tags are used to control the pronunciation, volume, pitch, rate, and other attributes of the speech synthesis output.

* The `speakerProfileId` property in SSML is used to specify the speaker profile ID for the personal voice.

* The voice name is specified in the `name` property in SSML. For personal voice, the voice name must be one of the supported base model voice names. To get a list of supported base model voice names, use the `BaseModels_List` operation of the custom voice API.


    The voice names labeled with the `Latest`, such as `DragonLatestNeural` or `PhoenixLatestNeural`, will be updated from time to time; its performance may vary with updates for ongoing improvements. If you would like to use a fixed version, select one labeled with a version number, such as PhoenixV2Neural.


* `DragonLatestNeural` is a base model with superior voice cloning similarity compared to `PhoenixLatestNeural`. `PhoenixLatestNeural` is a base model with more accurate pronunciation and lower latency than DragonLatestNeural.

* For personal voice, you can use the `<lang xml:lang>` element to adjust the speaking language. It's the same as with multilingual voices. See how to use the lang element to speak different languages.

ref: https://learn.microsoft.com/en-us/azure/ai-services/speech-service/personal-voice-how-to-use#integrate-personal-voice-in-your-application 

In [4]:
speaker_profile_id = "78eb9fcd-f9d8-4064-a24e-34d414a40f89"

# xml_ssml = f"""
# <speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xmlns:mstts='http://www.w3.org/2001/mstts' xml:lang='en-US'>
#     <voice name='DragonLatestNeural'> 
#         <mstts:ttsembedding speakerProfileId='{speaker_profile_id}'> 
#             I'm happy to hear that you find me amazing and that I have made your trip planning easier and more fun. 
#             <lang xml:lang='zh-HK'>我很高興聽到你覺得我很了不起，我讓你的旅行計劃更輕鬆、更有趣。</lang>
#             <lang xml:lang='pl-PL'>Cieszę się, że uważasz, że jestem niesamowity i że ułatwiłem Ci planowanie podróży i sprawiłem, że jest to bardziej przyjemne.</lang>
#             <lang xml:lang='ar-jo'>أنا سعيد لسماع أنك تجدني رائعًا وأنني جعلت تخطيط رحلتك أسهل وأكثر متعة.</lang>
#             <lang xml:lang='he-IL'>אני שמח לשמוע שאתה מוצא אותי מדהים ושעשיתי את תכנון הטיול שלך קל ומהנה יותר.</lang>
#         </mstts:ttsembedding> 
#     </voice> 
# </speak>
# """



xml_ssml = f"""
<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xmlns:mstts='http://www.w3.org/2001/mstts' xml:lang='en-US'>
    <voice name='DragonLatestNeural'> 
        <mstts:ttsembedding speakerProfileId='{speaker_profile_id}'> 

            <lang xml:lang='pl-PL'>Cieszę się, że uważasz, że jestem niesamowity i że ułatwiłem Ci planowanie podróży i sprawiłem, że jest to bardziej przyjemne.</lang>

        </mstts:ttsembedding> 
    </voice> 
</speak>
"""
print(xml_ssml)



<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xmlns:mstts='http://www.w3.org/2001/mstts' xml:lang='en-US'>
    <voice name='DragonLatestNeural'> 
        <mstts:ttsembedding speakerProfileId='78eb9fcd-f9d8-4064-a24e-34d414a40f89'> 

            <lang xml:lang='pl-PL'>Cieszę się, że uważasz, że jestem niesamowity i że ułatwiłem Ci planowanie podróży i sprawiłem, że jest to bardziej przyjemne.</lang>

        </mstts:ttsembedding> 
    </voice> 
</speak>



In [5]:
text_en = "I'm happy to hear that you find me amazing and that I have made your trip planning easier and more fun."
text_zh = "你覺得我很了不起，我讓你的旅行計劃更輕鬆、更有趣。"    
ssml = f"""<speak version='1.0' xml:lang='en-US' xmlns='http://www.w3.org/2001/10/synthesis'
xmlns:mstts='http://www.w3.org/2001/mstts'>
<voice name='DragonLatestNeural'>
<mstts:ttsembedding speakerProfileId='{speaker_profile_id}'/>
<lang xml:lang='en-US'> {text_en} </lang>
</voice></speak> 
"""
# ssml = f"""<speak version='1.0' xml:lang='en-US' xmlns='http://www.w3.org/2001/10/synthesis'
# xmlns:mstts='http://www.w3.org/2001/mstts' >
# <voice name='DragonLatestNeural'>
# <mstts:ttsembedding speakerProfileId='{speaker_profile_id}'/> {text_en}
# </voice></speak> 
# """

output_file = "..\\wav\\output.wav"

print(ssml)

<speak version='1.0' xml:lang='en-US' xmlns='http://www.w3.org/2001/10/synthesis'
xmlns:mstts='http://www.w3.org/2001/mstts'>
<voice name='DragonLatestNeural'>
<mstts:ttsembedding speakerProfileId='78eb9fcd-f9d8-4064-a24e-34d414a40f89'/>
<lang xml:lang='en-US'> I'm happy to hear that you find me amazing and that I have made your trip planning easier and more fun. </lang>
</voice></speak> 



In [6]:
# Creates an instance of a speech config with specified subscription key and service region.
speech_config = speechsdk.SpeechConfig(subscription=key, region=region)
# speech_config.set_speech_synthesis_output_format(speechsdk.SpeechSynthesisOutputFormat.Riff24Khz16BitMonoPcm)
# audio_config = speechsdk.audio.AudioOutputConfig(filename="..\\wav\\output.wav")
audio_config = speechsdk.audio.AudioOutputConfig(use_default_speaker=True)

speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_config)

In [7]:
def word_boundary(evt):
    print(f"Word Boundary: Text='{evt.text}', Audio offset={evt.audio_offset / 10000}ms,Duration={evt.duration / 10000}ms,text={evt.text}")

speech_synthesizer.synthesis_word_boundary.connect(word_boundary)



In [8]:
#result = speech_synthesizer.speak_ssml_async(ssml).get()
# result = speech_synthesizer.speak_ssml(ssml)
result = speech_synthesizer.speak_ssml(xml_ssml)


# result = speech_synthesizer.speak_text("hello! ")


In [17]:
if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
    print("Speech synthesized for text [{}], and the audio was saved to [{}]".format(text_en, output_file))
    print("result id: {}".format(result.result_id))
    display(Audio(data=result.audio_data)) #, rate=16000, autoplay=True))
elif result.reason == speechsdk.ResultReason.Canceled:
    cancellation_details = result.cancellation_details
    print("Speech synthesis canceled: {}".format(cancellation_details.reason))
    if cancellation_details.reason == speechsdk.CancellationReason.Error:
        print("Error details: {}".format(cancellation_details.error_details))
        print("result id: {}".format(result.result_id))


Speech synthesized for text [I'm happy to hear that you find me amazing and that I have made your trip planning easier and more fun.], and the audio was saved to [..\wav\output.wav]
result id: 1b3508fc733a4b64bf286383babf9fa6


In [20]:
print(result.reason)

ResultReason.Canceled
