Install the required libraries. This notebook is inspired by the following article:
https://www.marktechpost.com/2025/04/12/step-by-step-guide-on-converting-text-to-high-quality-audio-using-an-open-source-tts-model-on-hugging-face-including-detailed-audio-file-analysis-and-diagnostic-tools-in-python/

In [None]:
!pip install TTS --trusted-host pypi.org --trusted-host files.pythonhosted.org

In [27]:
from TTS.utils.synthesizer import Synthesizer

In [28]:
from TTS.api import TTS
import contextlib
import wave

In [None]:
!pip list

In [29]:
def text_to_speech(
    text: str,
    output_path: str = "output.wav",
    use_gpu: bool = False,
    model_name: str = "tts_models/en/ljspeech/tacotron2-DDC",
) -> None:
    """
    Converts input text to speech and saves the result to an audio file.

    Parameters:
        text (str): The text to convert. Must contain at least one
            non-whitespace character.
        output_path (str): Output WAV file path.
        use_gpu (bool): Forwarded to ``TTS`` as its ``gpu`` argument.
        model_name (str): Identifier of the model handed to ``TTS``.
            Defaults to the model this function previously hard-coded.

    Raises:
        ValueError: If ``text`` is empty or contains only whitespace.
    """
    # Validate before constructing TTS: building the model is the expensive
    # step (on first use it downloads the model files), so fail fast.
    if not text or not text.strip():
        raise ValueError("text must be a non-empty string")

    tts = TTS(model_name=model_name, progress_bar=True, gpu=use_gpu)

    tts.tts_to_file(text=text, file_path=output_path)
    print(f"Audio file generated successfully: {output_path}")

In [30]:
def analyze_audio(file_path: str) -> dict:
    """
    Analyzes the WAV audio file, prints details about it and returns them.

    Parameters:
        file_path (str): The path to the WAV audio file.

    Returns:
        dict: The values that were printed, keyed by ``"frames"``,
        ``"frame_rate"``, ``"duration"`` (seconds), ``"sample_width"``
        (bytes) and ``"channels"``.
    """
    # Objects returned by wave.open() are context managers themselves, so no
    # contextlib.closing() wrapper is needed to get the file closed.
    with wave.open(file_path, 'rb') as wf:
        frames = wf.getnframes()
        rate = wf.getframerate()
        sample_width = wf.getsampwidth()
        channels = wf.getnchannels()

    # A header declaring a frame rate of 0 would otherwise raise
    # ZeroDivisionError; report a zero duration for such a file instead.
    duration = frames / rate if rate else 0.0

    print("\nAudio Analysis:")
    print(f" - Duration      : {duration:.2f} seconds")
    print(f" - Frame Rate    : {rate} frames per second")
    print(f" - Sample Width  : {sample_width} bytes")
    print(f" - Channels      : {channels}")

    return {
        "frames": frames,
        "frame_rate": rate,
        "duration": duration,
        "sample_width": sample_width,
        "channels": channels,
    }

In [31]:
if __name__ == "__main__":
    # Demo run: synthesize a fixed paragraph to a WAV file, then print that
    # file's basic properties.
    sample_text = (
        "Marktechpost is an AI News Platform providing easy-to-consume, "
        "byte size updates in machine learning, deep learning, and data "
        "science research. Our vision is to showcase the hottest research "
        "trends in AI from around the world using our innovative method of "
        "search and discovery"
    )
    output_file = "output.wav"

    # Write the speech to disk first; the analysis step reads that same file.
    text_to_speech(text=sample_text, output_path=output_file)
    analyze_audio(file_path=output_file)

 > Downloading model to /Users/in22898717/Library/Application Support/tts/tts_models--en--ljspeech--tacotron2-DDC


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉| 113M/113M [03:02<00:00, 565kiB/s]

 > Model's license - apache 2.0
 > Check https://choosealicense.com/licenses/apache-2.0/ for more info.
 > Downloading model to /Users/in22898717/Library/Application Support/tts/vocoder_models--en--ljspeech--hifigan_v2



100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 113M/113M [03:11<00:00, 587kiB/s][A

  0%|▎                                                                                                                                                                                                              | 6.14k/3.80M [00:00<01:30, 42.0kiB/s][A
  1%|█▊                                                                                                                                                                                                              | 33.8k/3.80M [00:00<00:28, 130kiB/s][A
  2%|███▋                                                                                                                                                                                                            | 66.6k/3.80M [00:00<00

 > Model's license - apache 2.0
 > Check https://choosealicense.com/licenses/apache-2.0/ for more info.
 > Using model: Tacotron2
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:False
 | > symmetric_norm:True
 | > mel_fmin:0
 | > mel_fmax:8000.0
 | > pitch_fmin:1.0
 | > pitch_fmax:640.0
 | > spec_gain:1.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:2.718281828459045
 | > hop_length:256
 | > win_length:1024
 > Model's reduction rate `r` is set to: 1
 > Vocoder Model: hifigan
 > Setting up Audio Processor...
 | > sample


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3.80M/3.80M [00:18<00:00, 655kiB/s][A