# Old Text-to-Speech (TTS) Systems

In [1]:
import pyttsx3

# Initialize the offline speech engine.
engine = pyttsx3.init()

# Configure voice properties
engine.setProperty('rate', 150)    # Speed
engine.setProperty('volume', 0.9)  # Volume

# The list of installed voices (and its order) depends on the OS and speech
# driver, so index 1 is not guaranteed to exist or to be a female voice.
# Only switch voices when that slot is actually present; otherwise keep the
# driver's default voice instead of crashing with IndexError.
PREFERRED_VOICE_INDEX = 1
voices = engine.getProperty('voices')
if len(voices) > PREFERRED_VOICE_INDEX:
    engine.setProperty('voice', voices[PREFERRED_VOICE_INDEX].id)
else:
    print(f"Only {len(voices)} voice(s) installed; using the default voice.")

# Speak: say() queues the utterance, runAndWait() blocks until it has played.
text = "Hello! I am a computer speaking to you."
engine.say(text)
engine.runAndWait()

# Try different rates, volumes, voices!

# New Text-to-Speech (TTS) Systems

In [2]:
from openai import OpenAI
from pydub import AudioSegment
from pydub.playback import play
import io
import os
from dotenv import load_dotenv

# Pull OPENAI_API_KEY (and any other settings) from a local .env file.
load_dotenv()

openai_api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=openai_api_key)

# Describe the speech request up front, then send it.
speech_request = {
    "model": "tts-1",   # or "tts-1-hd" for higher quality
    "voice": "alloy",   # nova, echo, fable, onyx, shimmer
    "input": "Hello! This is OpenAI's text to speech.",
}
response = client.audio.speech.create(**speech_request)

# Decode the returned bytes in memory (no temp file of our own) and play them.
mp3_buffer = io.BytesIO(response.content)
audio = AudioSegment.from_mp3(mp3_buffer)
play(audio)

Input #0, wav, from '/tmp/tmp1xaus_u0.wav':   0KB sq=    0B f=0/0   
  Duration: 00:00:02.42, bitrate: 384 kb/s
  Stream #0:0: Audio: pcm_s16le ([1][0][0][0] / 0x0001), 24000 Hz, 1 channels, s16, 384 kb/s
   2.23 M-A:  0.000 fd=   0 aq=    0KB vq=    0KB sq=    0B f=0/0   




   2.35 M-A: -0.000 fd=   0 aq=    0KB vq=    0KB sq=    0B f=0/0   

# What are the available voices?

In [3]:
from openai import OpenAI
from pydub import AudioSegment
from pydub.playback import play
import io
import os
from dotenv import load_dotenv

# Pull OPENAI_API_KEY from a local .env file and build the API client.
load_dotenv()

openai_api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=openai_api_key)

# Voices to audition, all reading the same sentence so they can be compared.
voices = ["alloy", "echo", "fable", "nova", "onyx", "shimmer"]
text = "The quick brown fox jumps over the lazy dog. How does my voice sound?"

for voice in voices:
    # One request per voice; only the `voice` argument changes between runs.
    response = client.audio.speech.create(model="tts-1-hd", voice=voice, input=text)

    print(f"Playing: {voice}")

    # Decode the returned bytes in memory and play them before moving on.
    mp3_buffer = io.BytesIO(response.content)
    audio = AudioSegment.from_mp3(mp3_buffer)
    play(audio)

    print(f"Finished: {voice}")

Playing: alloy


Input #0, wav, from '/tmp/tmprv3c_rw_.wav':   0KB sq=    0B f=0/0   
  Duration: 00:00:04.27, bitrate: 384 kb/s
  Stream #0:0: Audio: pcm_s16le ([1][0][0][0] / 0x0001), 24000 Hz, 1 channels, s16, 384 kb/s
   4.23 M-A:  0.000 fd=   0 aq=    0KB vq=    0KB sq=    0B f=0/0   


Finished: alloy
Playing: echo


Input #0, wav, from '/tmp/tmp0e4qjooh.wav':   0KB sq=    0B f=0/0   
  Duration: 00:00:04.30, bitrate: 384 kb/s
  Stream #0:0: Audio: pcm_s16le ([1][0][0][0] / 0x0001), 24000 Hz, 1 channels, s16, 384 kb/s
   4.21 M-A: -0.000 fd=   0 aq=    0KB vq=    0KB sq=    0B f=0/0   


Finished: echo
Playing: fable


Input #0, wav, from '/tmp/tmpxjidni9u.wav':   0KB sq=    0B f=0/0   
  Duration: 00:00:04.34, bitrate: 384 kb/s
  Stream #0:0: Audio: pcm_s16le ([1][0][0][0] / 0x0001), 24000 Hz, 1 channels, s16, 384 kb/s
   4.23 M-A: -0.000 fd=   0 aq=    0KB vq=    0KB sq=    0B f=0/0   


Finished: fable
Playing: nova


Input #0, wav, from '/tmp/tmp60bkxejp.wav':   0KB sq=    0B f=0/0   
  Duration: 00:00:04.10, bitrate: 384 kb/s
  Stream #0:0: Audio: pcm_s16le ([1][0][0][0] / 0x0001), 24000 Hz, 1 channels, s16, 384 kb/s
   4.04 M-A: -0.000 fd=   0 aq=    0KB vq=    0KB sq=    0B f=0/0   


Finished: nova
Playing: onyx


Input #0, wav, from '/tmp/tmpz0dvsq01.wav':   0KB sq=    0B f=0/0   
  Duration: 00:00:04.27, bitrate: 384 kb/s
  Stream #0:0: Audio: pcm_s16le ([1][0][0][0] / 0x0001), 24000 Hz, 1 channels, s16, 384 kb/s
   4.22 M-A: -0.000 fd=   0 aq=    0KB vq=    0KB sq=    0B f=0/0   


Finished: onyx
Playing: shimmer


Input #0, wav, from '/tmp/tmppje5tt9y.wav':   0KB sq=    0B f=0/0   
  Duration: 00:00:04.37, bitrate: 384 kb/s
  Stream #0:0: Audio: pcm_s16le ([1][0][0][0] / 0x0001), 24000 Hz, 1 channels, s16, 384 kb/s
   4.29 M-A:  0.000 fd=   0 aq=    0KB vq=    0KB sq=    0B f=0/0   


Finished: shimmer


   4.32 M-A:  0.000 fd=   0 aq=    0KB vq=    0KB sq=    0B f=0/0   

# ElevenLabs

In [None]:
# Everything this cell uses is imported here so it runs on a fresh kernel:
# `os` and `play` previously leaked in from earlier cells, and `VoiceSettings`
# was never imported at all (NameError on the second request).
import os
from io import BytesIO

from dotenv import load_dotenv
from elevenlabs import ElevenLabs, VoiceSettings
from pydub import AudioSegment
from pydub.playback import play

load_dotenv()

client = ElevenLabs(api_key=os.getenv("ELEVEN_LABS_KEY"))

# Settings shared by both requests below.
RACHEL_VOICE_ID = "21m00Tcm4TlvDq8ikWAM"  # Rachel (pre-made voice)
ELEVEN_MODEL_ID = "eleven_multilingual_v2"
ELEVEN_OUTPUT_FORMAT = "mp3_44100_128"

# Generate with pre-made voice (Rachel)
audio_generator = client.text_to_speech.convert(
    text="ElevenLabs creates the most realistic voices",
    voice_id=RACHEL_VOICE_ID,
    model_id=ELEVEN_MODEL_ID,
    output_format=ELEVEN_OUTPUT_FORMAT,
)

# convert() yields the audio in chunks; join them into one bytes object.
audio_bytes = b"".join(audio_generator)

# Create AudioSegment and play
audio = AudioSegment.from_mp3(BytesIO(audio_bytes))
play(audio)

# Same voice and model, this time with explicit voice settings.
# NOTE(review): see the ElevenLabs voice-settings docs for the exact meaning
# and valid range of each knob before tuning these values.
audio_generator = client.text_to_speech.convert(
    text="Fine-tuned emotional speech",
    voice_id=RACHEL_VOICE_ID,
    model_id=ELEVEN_MODEL_ID,
    output_format=ELEVEN_OUTPUT_FORMAT,
    voice_settings=VoiceSettings(
        stability=0.75,
        similarity_boost=0.8,
        style=0.2,
        use_speaker_boost=True,
    ),
)

# Convert generator to bytes
audio_bytes = b"".join(audio_generator)

# Create AudioSegment and play
audio = AudioSegment.from_mp3(BytesIO(audio_bytes))
play(audio)

Input #0, wav, from '/tmp/tmp1xg8crj0.wav':   0KB sq=    0B f=0/0   
  Duration: 00:00:02.93, bitrate: 705 kb/s
  Stream #0:0: Audio: pcm_s16le ([1][0][0][0] / 0x0001), 44100 Hz, 1 channels, s16, 705 kb/s
   1.83 M-A:  0.000 fd=   0 aq=   52KB vq=    0KB sq=    0B f=0/0   