In [37]:
from dotenv import load_dotenv
import os

# Load environment variables from .env file
load_dotenv()

# Access the variables
key = os.getenv('key')
endpoint = os.getenv('endpoint')
region = os.getenv('region')

In [38]:
import requests
import json
import uuid

In [39]:
import sounddevice as sd
from scipy.io import wavfile

In [40]:
from IPython.display import Audio, display

In [41]:
project_id = "test_project"
headers = {
    'Ocp-Apim-Subscription-Key': key
    }



# Create Project

In [42]:
# Define the URL for the PUT request
put_url = f"{endpoint}/customvoice/projects/{project_id}?api-version=2024-02-01-preview"

# Define the payload for the PUT request
put_payload = {
    'description': 'Updated project description',
    'kind': 'PersonalVoice',
    'id': project_id,
    'displayName': 'Updated Test personal voice'
}

# Make the PUT request
put_response = requests.put(put_url, headers=headers, json=put_payload)

# Print the response
print(put_response.status_code)
print(json.dumps(put_response.json(), indent=4))

200
{
    "kind": "PersonalVoice",
    "id": "test_project",
    "displayName": "Updated Test personal voice",
    "description": "Updated project description",
    "createdDateTime": "2025-01-06T03:39:04.310Z"
}


## List Projects

In [43]:
# print(headers)
# Define the URL for the GET request
projects_url = f"{endpoint}/customvoice/projects?api-version=2024-02-01-preview"

# Make the GET request
projects_response = requests.get(projects_url, headers=headers)

# Print the response
print(projects_response.status_code)
print(json.dumps(projects_response.json(), indent=4))

200
{
    "value": [
        {
            "kind": "PersonalVoice",
            "id": "test_project",
            "displayName": "Updated Test personal voice",
            "description": "Updated project description",
            "createdDateTime": "2025-01-06T03:39:04.310Z"
        },
        {
            "kind": "PersonalVoice",
            "id": "14626eba-a659-4491-b80c-e067116e5592",
            "displayName": "Default Trial Project - Kq5dfXs9Ab",
            "description": "The default trial project for all trial resources.",
            "createdDateTime": "2024-03-01T00:47:04.673Z"
        }
    ]
}


## Delete a project

In [None]:
delete_project_url = f"{endpoint}/customvoice/projects/{project_id}?api-version=2024-02-01-preview"
# Make the DELETE request
delete_response = requests.delete(delete_project_url, headers=headers)

# Print the response
print(delete_response.status_code)
print(delete_response.text)

## Get a project


In [44]:
# Define the URL for the GET request to get project details
get_project_url = f"{endpoint}/customvoice/projects/{project_id}?api-version=2024-02-01-preview"

# Make the GET request
get_project_response = requests.get(get_project_url, headers=headers)

# Print the response
print(get_project_response.status_code)
print(json.dumps(get_project_response.json(), indent=4))

200
{
    "kind": "PersonalVoice",
    "id": "test_project",
    "displayName": "Updated Test personal voice",
    "description": "Updated project description",
    "createdDateTime": "2025-01-06T03:39:04.310Z"
}


# Recording Helper


In [48]:
import time

def record_and_save(statement, statement_file_name, sample_rate=44100, max_duration=15):
    print("Speak following script clearly:\n")
    print(statement)
    
    # Start the timer
    start_time = time.time()
    

    input_device = sd.query_devices(kind='input')
    input_channels = input_device['max_input_channels']
    print("input device: ", input_device['name'], ' channels: ', input_channels)
    recording = sd.rec(int(max_duration * sample_rate), samplerate=sample_rate, channels=input_channels, dtype='int16')

    while True:
        try:
            if input() == "":
                sd.stop()
                break
        except KeyboardInterrupt:
            sd.stop()
            break

    # Stop the timer
    end_time = time.time()
    recording_duration = end_time - start_time
    print(f"Recording finished. Duration: {recording_duration:.2f} seconds")

    # Trim the recording to the actual duration
    recording = recording[:int(recording_duration * sample_rate)]

    # Replay the recorded audio
    print("Replaying the recorded message...")
    sd.play(recording, samplerate=sample_rate)
    sd.wait()  # Wait until the playback is finished
    print("Playback finished.")

    # Save the recording to a WAV file
    wavfile.write(statement_file_name, sample_rate, recording)
    print(f"Recording saved to {statement_file_name}")


In [51]:


def play_audio_from_file(file_name):
    # Load the audio file
    samplerate, data= wavfile.read(file_name)
    print(data, samplerate)
    channels = 1 if len(data.shape) == 1 else data.shape[1]
    print(f"number of channels = {channels}")  
    length = data.shape[0] / samplerate
    print(f"length = {length:.2f}s")

    # # Play the audio file
    # sd.play(data, samplerate)
    # sd.wait()  # Wait until the playback is finished
    display(Audio(data=data, rate=samplerate))  

In [52]:
record_and_save("Hello, this is a test recording.", "test_recording.wav")

Speak following script clearly:

Hello, this is a test recording.
input device:  Headset Microphone (Plantronics  channels:  1
Recording finished. Duration: 4.11 seconds
Replaying the recorded message...
Playback finished.
Recording saved to test_recording.wav


In [53]:
play_audio_from_file("test_recording.wav")

[ -6 -19 -33 ...   0   0   0] 44100
number of channels = 1
length = 4.11s


# Add User Consent 

https://learn.microsoft.com/en-us/azure/ai-services/speech-service/personal-voice-create-consent

Consent Statement: 

"I *[state your first and last name]* am aware that recordings of my voice will be used by *[state the name of the company]* to create and use a synthetic version of my voice."


## Collect information

In [54]:
first_name = input("Please enter your first name: ") or "Mutaz"
last_name = input("Please enter your last name: ") or "Abu Ghazaleh"
company_name = input("Please enter the name of the company: ") or "Microsoft"

consent_statement = f"I {first_name} {last_name} am aware that recordings of my voice will be used by {company_name} to create and use a synthetic version of my voice."
print("Consent Statement:\n")
print(consent_statement)

Consent Statement:

I Mutaz Abu Ghazaleh am aware that recordings of my voice will be used by Microsoft to create and use a synthetic version of my voice.


In [73]:
voice_talent_name = f"{first_name} {last_name}"
voice_talent_consent_id = f"{voice_talent_name}_{uuid.uuid4()}".replace(" ","").replace("-","")
speaker_profile_id = f"{voice_talent_consent_id}_profile"

print(voice_talent_name)
print(voice_talent_consent_id)
print(speaker_profile_id)



Mutaz Abu Ghazaleh
MutazAbuGhazaleh_a684e444066043918f4e488cfaf16353
MutazAbuGhazaleh_a684e444066043918f4e488cfaf16353_profile


## Record Consent

In [56]:

# Define the sample rate and file name
sample_rate = 44100  # Sample rate in Hz
max_duration = 30
file_name = "consent_message.wav"


record_and_save(consent_statement, file_name, sample_rate, max_duration)


Speak following script clearly:

I Mutaz Abu Ghazaleh am aware that recordings of my voice will be used by Microsoft to create and use a synthetic version of my voice.
input device:  Headset Microphone (Plantronics  channels:  1
Recording finished. Duration: 13.10 seconds
Replaying the recorded message...
Playback finished.
Recording saved to consent_message.wav


In [57]:
play_audio_from_file(file_name)

[-2 -2 -4 ...  0  0  0] 44100
number of channels = 1
length = 13.10s


## Upload consent audio and create a voice talent

In [74]:

# Define the URL for the POST request
consent_url = f"{endpoint}/customvoice/consents/{voice_talent_consent_id}?api-version=2024-02-01-preview"

# Define the payload for the POST request
consent_payload = {
    'description': f'Consent for {voice_talent_name}',
    'projectId': project_id,
    'voiceTalentName': f'{voice_talent_name}',
    'companyName': company_name,
    'locale': 'en-US'
}

# Define the files for the POST request
files = {
    'audiodata': (file_name, open(file_name, 'rb'), 'audio/wav')
}

# Make the POST request
consent_response = requests.post(consent_url, headers=headers, data=consent_payload, files=files)

# Print the response
print(consent_response.status_code)
print(json.dumps(consent_response.json(), indent=4))

201
{
    "status": "NotStarted",
    "lastActionDateTime": "2025-01-06T12:15:15.317Z",
    "id": "MutazAbuGhazaleh_a684e444066043918f4e488cfaf16353",
    "displayName": "MutazAbuGhazaleh_a684e444066043918f4e488cfaf16353",
    "description": "Consent for Mutaz Abu Ghazaleh",
    "createdDateTime": "2025-01-06T12:15:15.317Z",
    "voiceTalentName": "Mutaz Abu Ghazaleh",
    "companyName": "Microsoft",
    "locale": "en-US",
    "projectId": "test_project"
}


## list voice talents (consents)

In [77]:
# Define the URL for the GET request to list consents
list_consents_url = f"{endpoint}/customvoice/consents?api-version=2024-02-01-preview"

# Make the GET request
list_consents_response = requests.get(list_consents_url, headers=headers)

# Print the response
print(list_consents_response.status_code)
print(json.dumps(list_consents_response.json(), indent=4))

200
{
    "value": [
        {
            "status": "Succeeded",
            "lastActionDateTime": "2025-01-06T12:15:18.547Z",
            "id": "MutazAbuGhazaleh_a684e444066043918f4e488cfaf16353",
            "displayName": "MutazAbuGhazaleh_a684e444066043918f4e488cfaf16353",
            "description": "Consent for Mutaz Abu Ghazaleh",
            "createdDateTime": "2025-01-06T12:15:15.317Z",
            "voiceTalentName": "Mutaz Abu Ghazaleh",
            "companyName": "Microsoft",
            "locale": "en-US",
            "projectId": "test_project"
        }
    ]
}


## delete a consent/voice talent

In [76]:
# Define the URL for the DELETE request to delete consent
voice_to_delete = "MutazAbuGhazaleh_94b138e0-a049-4bbc-8ea4-4b7c4227cd34"
delete_consent_url = f"{endpoint}/customvoice/consents/{voice_to_delete}?api-version=2024-02-01-preview"

# Make the DELETE request
delete_consent_response = requests.delete(delete_consent_url, headers=headers)

# Print the response
print(delete_consent_response.status_code)
print(delete_consent_response.text)

204



# Create a personal voice

https://learn.microsoft.com/en-us/azure/ai-services/speech-service/personal-voice-create-voice

## list personal voices already created

In [78]:
# Define the URL for the GET request to list personal voices
list_personal_voices_url = f"{endpoint}/customvoice/personalvoices?api-version=2024-02-01-preview"

# Make the GET request
list_personal_voices_response = requests.get(list_personal_voices_url, headers=headers)

# Print the response
print(list_personal_voices_response.status_code)
print(json.dumps(list_personal_voices_response.json(), indent=4))

200
{
    "value": []
}


## record voice using sample

In [65]:
training_statements = [
    f"Days turned into weeks, and soon, a small green sprout emerged from the soil.",
    f"The bridge is experiencing moderate traffic northbound, with delays of about 10 minutes.",
    f"Hello, this is your AI assistant! How can I help you today?"
]

for index, statement in enumerate(training_statements):
    print(f"{index}: {statement}")


0: Days turned into weeks, and soon, a small green sprout emerged from the soil.
1: The bridge is experiencing moderate traffic northbound, with delays of about 10 minutes.
2: Hello, this is your AI assistant! How can I help you today?


In [66]:
recorded_files = []
for index, statement in enumerate(training_statements):
    statement_file_name = f"recorded_statement_{index:02}.wav"
    print("statement: ", statement)
    record_and_save(statement, statement_file_name)
    recorded_files.append(statement_file_name)

statement:  Days turned into weeks, and soon, a small green sprout emerged from the soil.
Speak following script clearly:

Days turned into weeks, and soon, a small green sprout emerged from the soil.
input device:  Headset Microphone (Plantronics  channels:  1
Recording finished. Duration: 9.37 seconds
Replaying the recorded message...
Playback finished.
Recording saved to recorded_statement_00.wav
statement:  The bridge is experiencing moderate traffic northbound, with delays of about 10 minutes.
Speak following script clearly:

The bridge is experiencing moderate traffic northbound, with delays of about 10 minutes.
input device:  Headset Microphone (Plantronics  channels:  1
Recording finished. Duration: 8.26 seconds
Replaying the recorded message...
Playback finished.
Recording saved to recorded_statement_01.wav
statement:  Hello, this is your AI assistant! How can I help you today?
Speak following script clearly:

Hello, this is your AI assistant! How can I help you today?
input d

In [101]:
print(recorded_files[0])

recorded_statement_00.wav


## Create personal voice using recordings


In [79]:
# Define the URL for the POST request to create a personal voice
print("we already have:" )
print("project id: ", project_id)
print("consent id: ", voice_talent_consent_id)
print("\nnow we need to create: ")
print("speaker profile id: ", speaker_profile_id)

print("\ncreating personal voice...")


we already have:
project id:  test_project
consent id:  MutazAbuGhazaleh_a684e444066043918f4e488cfaf16353

now we need to create: 
speaker profile id:  MutazAbuGhazaleh_a684e444066043918f4e488cfaf16353_profile

creating personal voice...


In [83]:

create_personal_voice_url = f"{endpoint}/customvoice/personalvoices/{speaker_profile_id}?api-version=2024-02-01-preview"

# Define the payload for the POST request
personal_voice_payload = {
    'projectId': project_id,
    'consentId': voice_talent_consent_id
}

# Define the files for the POST request
personal_voice_files = {
    'audiodata': (recorded_files[0], open(file_name, 'rb'), 'audio/wav')
}

# Make the POST request
create_personal_voice_response = requests.post(
    create_personal_voice_url, 
    headers=headers, 
    data=personal_voice_payload, 
    files=personal_voice_files)

# Print the response
print(create_personal_voice_response.status_code)
print(json.dumps(create_personal_voice_response.json(), indent=4))

201
{
    "projectId": "test_project",
    "consentId": "MutazAbuGhazaleh_a684e444066043918f4e488cfaf16353",
    "speakerProfileId": "a8e79c93-e52b-40b8-b500-4054a00b3ba6",
    "status": "NotStarted",
    "lastActionDateTime": "2025-01-06T12:17:05.053Z",
    "id": "MutazAbuGhazaleh_a684e444066043918f4e488cfaf16353_profile",
    "displayName": "MutazAbuGhazaleh_a684e444066043918f4e488cfaf16353_profile",
    "createdDateTime": "2025-01-06T12:17:05.053Z"
}


## list personal voices 

https://learn.microsoft.com/en-us/rest/api/aiservices/speechapi/personal-voices/list?view=rest-aiservices-speechapi-2024-02-01-preview&tabs=HTTP

In [84]:
# Define the URL for the GET request to list personal voices
list_personal_voices_url = f"{endpoint}/customvoice/personalvoices?api-version=2024-02-01-preview"

# Make the GET request
list_personal_voices_response = requests.get(list_personal_voices_url, headers=headers)

# Print the response
print(list_personal_voices_response.status_code)
print(json.dumps(list_personal_voices_response.json(), indent=4))

200
{
    "value": [
        {
            "projectId": "test_project",
            "consentId": "MutazAbuGhazaleh_a684e444066043918f4e488cfaf16353",
            "speakerProfileId": "a8e79c93-e52b-40b8-b500-4054a00b3ba6",
            "status": "Succeeded",
            "lastActionDateTime": "2025-01-06T12:17:09.010Z",
            "id": "MutazAbuGhazaleh_a684e444066043918f4e488cfaf16353_profile",
            "displayName": "MutazAbuGhazaleh_a684e444066043918f4e488cfaf16353_profile",
            "createdDateTime": "2025-01-06T12:17:05.053Z"
        }
    ]
}


In [91]:
json_response = list_personal_voices_response.json()
speaker_profile_id = json_response['value'][0]['speakerProfileId']

print(f"speaker profile id: {speaker_profile_id}")

speaker profile id: a8e79c93-e52b-40b8-b500-4054a00b3ba6


## Delete voice


In [14]:
# Define the URL for the DELETE request to delete a personal voice


id_to_delete = "profile_MutazAbuGhazalehId"
delete_personal_voice_url = f"{endpoint}/customvoice/personalvoices/{id_to_delete}?api-version=2024-02-01-preview"

# Make the DELETE request
delete_personal_voice_response = requests.delete(delete_personal_voice_url, headers=headers)

# Print the response
print(delete_personal_voice_response.status_code)
print(delete_personal_voice_response.text)

204



# Use Personal Voice


In [109]:
profile_id = f"profile_MutazAbuGhazalehId"

# Define the URL for the GET request to get a specific personal voice
get_personal_voice_url = f"{endpoint}/customvoice/personalvoices/{profile_id}?api-version=2024-02-01-preview"

# Make the GET request
get_personal_voice_response = requests.get(get_personal_voice_url, headers=headers)

# Print the response
print(get_personal_voice_response.status_code)
print(json.dumps(get_personal_voice_response.json(), indent=4))

200
{
    "projectId": "test_project",
    "consentId": "MutazAbuGhazalehId",
    "speakerProfileId": "1a9cd953-bc7b-42ac-8283-ce8903dce317",
    "status": "Succeeded",
    "lastActionDateTime": "2025-01-06T05:20:19.777Z",
    "id": "profile_MutazAbuGhazalehId",
    "displayName": "profile_MutazAbuGhazalehId",
    "createdDateTime": "2025-01-06T05:20:14.823Z"
}


In [111]:
xml_ssml = f"""
<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xmlns:mstts='http://www.w3.org/2001/mstts' xml:lang='en-US'>
    <voice name='DragonLatestNeural'> 
        <mstts:ttsembedding speakerProfileId='{profile_id}'> 
            I'm happy to hear that you find me amazing and that I have made your trip planning easier and more fun. 
            <lang xml:lang='zh-HK'>我很高興聽到你覺得我很了不起，我讓你的旅行計劃更輕鬆、更有趣。</lang>
        </mstts:ttsembedding> 
    </voice> 
</speak>
"""

In [112]:
print(xml_ssml)


<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xmlns:mstts='http://www.w3.org/2001/mstts' xml:lang='en-US'>
    <voice name='DragonLatestNeural'> 
        <mstts:ttsembedding speakerProfileId='profile_MutazAbuGhazalehId'> 
            I'm happy to hear that you find me amazing and that I have made your trip planning easier and more fun. 
            <lang xml:lang='zh-HK'>我很高興聽到你覺得我很了不起，我讓你的旅行計劃更輕鬆、更有趣。</lang>
        </mstts:ttsembedding> 
    </voice> 
</speak>




You can use the SSML via the Speech SDK or REST API.

Real-time speech synthesis: Use the Speech SDK or REST API to convert text to speech.

When you use Speech SDK, don't set Endpoint ID, just like prebuild voice.

When you use REST API, please use prebuilt neural voices endpoint.


https://learn.microsoft.com/en-us/azure/ai-services/speech-service/get-started-text-to-speech

https://learn.microsoft.com/en-us/azure/ai-services/speech-service/rest-text-to-speech 


base voice name is DragonLatestNeural  

speakerprofileid is the name of the personal voice model use lang xml:lang to adjust the language 
