In [None]:
import os
import time
import random
import tempfile
import numpy as np
import pandas as pd
import librosa
import soundfile as sf
from tqdm import tqdm
from gtts import gTTS
from TTS.api import TTS
import whisper
from jiwer import wer, cer, Compose, ToLowerCase, RemovePunctuation, RemoveMultipleSpaces, Strip
import torch
from transformers import VitsModel, AutoTokenizer

In [None]:
# Initialize TTS and ASR models
# Everything assigned in this cell is a notebook-level global that the TTS
# wrapper functions and the evaluation loop in later cells read directly, so
# this cell must be executed before any of them.
print("Loading TTS + ASR models...")
# Coqui Tacotron2-DDC (LJSpeech) voice; gpu=False presumably keeps inference on CPU — confirm against the installed TTS version.
tts_coqui = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False, gpu=False)  # Coqui TTS
# Whisper "base" checkpoint; used to transcribe every synthesized clip.
asr = whisper.load_model("base")  # Whisper ASR
print("All models loaded successfully!")

print("Loading extra TTS models...")
# Model id shared by the MMS model and its tokenizer so the two always match.
mms_model_id = "facebook/mms-tts-eng"
mms_model = VitsModel.from_pretrained(mms_model_id)           # Meta MMS TTS model (VITS)
mms_tok   = AutoTokenizer.from_pretrained(mms_model_id)       # Corresponding tokenizer
mms_sr    = mms_model.config.sampling_rate                    # Sample rate read from the model config; used when writing MMS audio to disk
print("Extra models loaded!")

Loading TTS + ASR models...
 > tts_models/en/ljspeech/tacotron2-DDC is already downloaded.
 > vocoder_models/en/ljspeech/hifigan_v2 is already downloaded.
 > Using model: Tacotron2
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:False
 | > symmetric_norm:True
 | > mel_fmin:0
 | > mel_fmax:8000.0
 | > pitch_fmin:1.0
 | > pitch_fmax:640.0
 | > spec_gain:1.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:2.718281828459045
 | > hop_length:256
 | > win_length:1024
 > Model's reduction rate `r` is set to: 1
 > Vocoder Model: h

Some weights of the model checkpoint at facebook/mms-tts-eng were not used when initializing VitsModel: ['flow.flows.3.wavenet.res_skip_layers.2.weight_g', 'flow.flows.0.wavenet.in_layers.1.weight_v', 'posterior_encoder.wavenet.res_skip_layers.2.weight_g', 'posterior_encoder.wavenet.res_skip_layers.15.weight_v', 'posterior_encoder.wavenet.in_layers.5.weight_g', 'posterior_encoder.wavenet.res_skip_layers.5.weight_g', 'flow.flows.2.wavenet.res_skip_layers.2.weight_v', 'flow.flows.2.wavenet.res_skip_layers.0.weight_v', 'posterior_encoder.wavenet.res_skip_layers.4.weight_g', 'flow.flows.1.wavenet.in_layers.0.weight_v', 'flow.flows.0.wavenet.res_skip_layers.2.weight_g', 'posterior_encoder.wavenet.in_layers.14.weight_v', 'posterior_encoder.wavenet.in_layers.13.weight_v', 'posterior_encoder.wavenet.in_layers.2.weight_g', 'posterior_encoder.wavenet.in_layers.4.weight_v', 'posterior_encoder.wavenet.in_layers.9.weight_g', 'flow.flows.2.wavenet.in_layers.3.weight_v', 'flow.flows.3.wavenet.in_laye

Extra models loaded!


In [None]:
# Load dataset
# (pandas is already imported in the notebook's import cell, so it is not
# re-imported here; keeping imports in one place keeps Restart & Run All honest.)
df = pd.read_csv("Text_To_Speech_Dataset.csv")

# Fail early with a clear message if the CSV does not have the column the
# evaluation loop reads (row['sentence']).
if "sentence" not in df.columns:
    raise KeyError(
        f"Expected a 'sentence' column in Text_To_Speech_Dataset.csv, found: {list(df.columns)}"
    )

# Use all sentences instead of a random sample.
# Rows with a missing sentence are dropped: they cannot be synthesized, and a
# NaN value would break the string handling in the evaluation loop.
samples = df.dropna(subset=["sentence"]).reset_index(drop=True)

samples.head()


Unnamed: 0,sentence
0,Better person.
1,Let's go outside to play.
2,An uncle is reading a story.
3,A cat hiding in a closet.
4,Cows on a farm.


In [11]:
# Helper function: ASR transcription with librosa
def transcribe_with_librosa(asr, file_path):
    """
    Load an audio file with librosa and transcribe it with a Whisper model.

    The audio is decoded by librosa (resampled to 16 kHz, down-mixed to mono)
    and the resulting array is handed to ``asr.transcribe`` instead of the file
    path. Per the original author's note, this sidesteps file-path encoding
    problems on Windows.

    Args:
        asr: Object exposing ``transcribe(audio, fp16=..., language=...)`` that
            returns a mapping with a ``"text"`` entry (a loaded Whisper model
            in this notebook).
        file_path: Path of the audio file to transcribe.

    Returns:
        The transcription text with leading/trailing whitespace removed.

    Raises:
        RuntimeError: If loading or transcription fails. The original error is
            attached as ``__cause__`` so the real traceback is not lost.
            ``RuntimeError`` is a subclass of ``Exception``, so existing
            ``except Exception`` handlers keep working.
    """
    try:
        # The sample rate returned by librosa is not needed: it is fixed to 16 kHz above.
        audio_data, _ = librosa.load(file_path, sr=16000, mono=True)
        result = asr.transcribe(audio_data, fp16=False, language='en')
        return result["text"].strip()
    except Exception as e:
        raise RuntimeError(f"Librosa transcription failed: {str(e)}") from e


# Core evaluation function for a given TTS model
def evaluate_tts_model(tts_func, model_name, samples, settle_seconds=0.3, min_audio_bytes=1000):
    """
    Run TTS on each sample, transcribe the result, and compute evaluation metrics.

    For every row of ``samples`` the sentence is synthesized to a temporary
    file in the current working directory, transcribed with the notebook-level
    ``asr`` model via ``transcribe_with_librosa``, and the file is deleted
    again. Metrics are computed over the sentences that were synthesized and
    transcribed successfully.

    Args:
        tts_func: Callable ``(text, output_path) -> bool`` that writes audio to
            ``output_path`` and returns a truthy value on success.
        model_name: Display name used in progress/log output and (sanitized)
            in the temporary file name.
        samples: DataFrame with a ``'sentence'`` column.
        settle_seconds: Pause after synthesis before the file is inspected
            (default 0.3, the previously hard-coded value). Pass 0 to skip.
        min_audio_bytes: A file must be larger than this many bytes to be
            treated as valid audio (default 1000, the previous constant).

    Returns:
        A dict with keys ``model``, ``samples_processed``, ``exact_matches``,
        ``exact_match_rate``, ``wer``, ``cer``, ``word_accuracy``,
        ``char_accuracy``, ``ground_truths`` and ``transcriptions``; or an
        empty dict if no sample could be evaluated.
    """
    print(f"\nEvaluating {model_name}...")
    ground_truths, transcriptions = [], []

    # model_name is a display label and may contain characters that are not
    # valid in a file name (e.g. the '/' in "facebook/mms-tts-eng" would
    # silently turn part of the name into a sub-directory that is never
    # cleaned up). Keep only file-name-safe characters.
    safe_name = "".join(ch if ch.isalnum() or ch in "-_." else "_" for ch in model_name)

    for idx, row in tqdm(samples.iterrows(), total=len(samples), desc=model_name):
        sentence = row['sentence']

        # Missing values arrive as float NaN; they cannot be synthesized and
        # would break the string operations below (including the error message).
        if not isinstance(sentence, str) or not sentence.strip():
            print(f"Skipping row {idx}: sentence is missing or not text ({sentence!r})")
            continue

        audio_path = os.path.join(os.getcwd(), f"tts_eval_{safe_name}_{idx}.wav")

        try:
            success = tts_func(sentence, audio_path)
            if not success:
                continue

            if settle_seconds > 0:
                time.sleep(settle_seconds)  # Ensure audio file is fully written

            # If file is valid, transcribe
            if os.path.exists(audio_path) and os.path.getsize(audio_path) > min_audio_bytes:
                transcription = transcribe_with_librosa(asr, audio_path)
                ground_truths.append(sentence.lower())
                transcriptions.append(transcription.lower())
            else:
                print(f"Audio file issue for: {sentence}")
        except Exception as e:
            print(f"Failed for '{sentence[:30]}...': {str(e)}")
        finally:
            # Clean up generated audio file. Removal is best-effort: a file that
            # cannot be deleted must not abort the run, but only filesystem
            # errors are ignored (not KeyboardInterrupt and friends).
            if os.path.exists(audio_path):
                try:
                    os.remove(audio_path)
                except OSError:
                    pass

    # Compute metrics if we have valid data
    results = {}
    if not (ground_truths and transcriptions):
        print(f"{model_name}: no samples could be evaluated (0/{len(samples)}).")
        return results

    # Exact match is judged on the lower-cased text, punctuation included.
    exact_matches = sum(1 for gt, pred in zip(ground_truths, transcriptions) if gt == pred)

    # WER/CER are computed on text with case, punctuation and extra spaces removed.
    normalize = Compose([ToLowerCase(), RemovePunctuation(), RemoveMultipleSpaces(), Strip()])
    norm_gt   = [normalize(gt) for gt in ground_truths]
    norm_pred = [normalize(pred) for pred in transcriptions]

    wer_score = wer(norm_gt, norm_pred)
    cer_score = cer(norm_gt, norm_pred)

    results = {
        'model': model_name,
        'samples_processed': len(ground_truths),
        'exact_matches': exact_matches,
        'exact_match_rate': exact_matches / len(ground_truths) * 100,
        'wer': wer_score,
        'cer': cer_score,
        'word_accuracy': (1 - wer_score) * 100,
        'char_accuracy': (1 - cer_score) * 100,
        'ground_truths': ground_truths,
        'transcriptions': transcriptions
    }

    # Print summary
    print(f"{model_name} Results:")
    print(f"   Samples processed: {len(ground_truths)}/{len(samples)}")
    print(f"   Exact matches: {exact_matches}/{len(ground_truths)} ({results['exact_match_rate']:.1f}%)")
    print(f"   WER: {wer_score:.4f} ({wer_score*100:.2f}%)")
    print(f"   CER: {cer_score:.4f} ({cer_score*100:.2f}%)")
    print(f"   Word Accuracy: {results['word_accuracy']:.2f}%")
    print(f"   Character Accuracy: {results['char_accuracy']:.2f}%")

    return results

In [None]:
# TTS wrapper functions for each model
def coqui_tts_func(text, output_path):
    """
    Generate TTS output using Coqui and save to file.

    Uses the notebook-level ``tts_coqui`` model.

    Args:
        text: Sentence to synthesize.
        output_path: Destination path for the audio file.

    Returns:
        True if synthesis completed without raising, False otherwise.
    """
    try:
        tts_coqui.tts_to_file(text=text, file_path=output_path)
        return True
    except Exception as e:
        # Catch Exception rather than using a bare except so Ctrl-C / kernel
        # interrupt still stops the run, and report the cause like the other
        # TTS wrappers do instead of failing silently.
        print(f"Coqui TTS error: {e}")
        return False

def gtts_tts_func(text, output_path):
    """
    Generate TTS using Google TTS and save to file.

    Args:
        text: Sentence to synthesize (English).
        output_path: Destination path for the audio file. The parent directory
            is created if the path has one.

    Returns:
        True if the file exists afterwards and is larger than 1000 bytes,
        False otherwise (including when gTTS raises).

    NOTE(review): gTTS is documented to write MP3 data; the extension of
    ``output_path`` does not change that. Confirm the downstream loader
    accepts MP3 content in a ``.wav``-named file.
    """
    try:
        out_dir = os.path.dirname(output_path)
        # A bare file name has an empty dirname, and os.makedirs("") raises
        # FileNotFoundError, so only create the directory when there is one.
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        gTTS(text=text, lang="en").save(output_path)
        return os.path.exists(output_path) and os.path.getsize(output_path) > 1000
    except Exception as e:
        print(f"gTTS error: {e}")
        return False

def mms_tts_func(text, output_path):
    """
    Generate TTS using Meta MMS-TTS (VITS) and save to file.

    Uses the notebook-level ``mms_tok`` / ``mms_model`` / ``mms_sr`` objects.
    The waveform is peak-normalized to 0.95 and written as 16-bit PCM WAV.

    Args:
        text: Sentence to synthesize.
        output_path: Destination path for the WAV file. The parent directory is
            created if the path has one.

    Returns:
        True if the file exists afterwards and is larger than 1000 bytes,
        False otherwise (including when synthesis or writing raises).
    """
    try:
        out_dir = os.path.dirname(output_path)
        # A bare file name has an empty dirname, and os.makedirs("") raises
        # FileNotFoundError, so only create the directory when there is one.
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        inputs = mms_tok(text, return_tensors="pt")

        # Inference only: no gradients needed.
        with torch.no_grad():
            wave = mms_model(**inputs).waveform.squeeze().cpu().numpy()

        # Normalize waveform volume (skip for an all-zero signal to avoid dividing by 0)
        peak = np.max(np.abs(wave))
        if peak > 0:
            wave = wave / peak * 0.95
        wave = wave.astype("float32")

        sf.write(output_path, wave, samplerate=mms_sr, format="WAV", subtype="PCM_16")
        return os.path.exists(output_path) and os.path.getsize(output_path) > 1000
    except Exception as e:
        print(f"MMS-TTS error: {e}")
        return False

In [13]:
# Run evaluation for all 3 models
# VITS-based synthesis (MMS-TTS) samples random noise at inference time, so
# without a fixed seed the audio — and therefore WER/CER — changes from run to
# run. Re-seeding before each model makes this cell give the same numbers every
# time it is executed, independent of what ran before it.
EVAL_SEED = 42

# (results key, TTS wrapper, display name passed to evaluate_tts_model)
MODELS_TO_EVALUATE = [
    ("Coqui",   coqui_tts_func, "Coqui TTS"),
    ("gTTS",    gtts_tts_func,  "Google TTS"),
    ("MMS_TTS", mms_tts_func,   "Meta MMS-TTS (facebook/mms-tts-eng)"),
]

results = {}
for result_key, model_tts_func, display_name in MODELS_TO_EVALUATE:
    torch.manual_seed(EVAL_SEED)
    results[result_key] = evaluate_tts_model(model_tts_func, display_name, samples)


Evaluating Coqui TTS...


Coqui TTS:   0%|          | 0/15 [00:00<?, ?it/s]

 > Text splitted to sentences.
['Better person.']
 > Processing time: 2.996001720428467
 > Real-time factor: 0.385694990281689


Coqui TTS:   7%|▋         | 1/15 [00:04<00:57,  4.08s/it]

 > Text splitted to sentences.
["Let's go outside to play."]
 > Processing time: 0.9240012168884277
 > Real-time factor: 0.31826772732426006


Coqui TTS:  13%|█▎        | 2/15 [00:05<00:35,  2.70s/it]

 > Text splitted to sentences.
['An uncle is reading a story.']
 > Processing time: 0.8029990196228027
 > Real-time factor: 0.2917566633054772


Coqui TTS:  20%|██        | 3/15 [00:07<00:26,  2.21s/it]

 > Text splitted to sentences.
['A cat hiding in a closet.']
 > Processing time: 0.7709982395172119
 > Real-time factor: 0.29905205427375675


Coqui TTS:  27%|██▋       | 4/15 [00:09<00:21,  1.96s/it]

 > Text splitted to sentences.
['Cows on a farm.']
 > Processing time: 0.4719984531402588
 > Real-time factor: 0.2691240662945466


Coqui TTS:  33%|███▎      | 5/15 [00:10<00:16,  1.70s/it]

 > Text splitted to sentences.
['A man tries to dry many clothes in a dryer.']
 > Processing time: 1.011997938156128
 > Real-time factor: 0.302594848887267


Coqui TTS:  40%|████      | 6/15 [00:12<00:15,  1.77s/it]

 > Text splitted to sentences.
['The baby is crying and hungry.']
 > Processing time: 0.7269992828369141
 > Real-time factor: 0.28715846565194103


Coqui TTS:  47%|████▋     | 7/15 [00:13<00:13,  1.70s/it]

 > Text splitted to sentences.
['The shower is dirty.']
 > Processing time: 0.466001033782959
 > Real-time factor: 0.2571916999127514


Coqui TTS:  53%|█████▎    | 8/15 [00:14<00:10,  1.55s/it]

 > Text splitted to sentences.
['A girl is dancing to music.']
 > Processing time: 0.7079997062683105
 > Real-time factor: 0.2783772026251114


Coqui TTS:  60%|██████    | 9/15 [00:16<00:09,  1.55s/it]

 > Text splitted to sentences.
['A girl hugging a puppy.']
 > Processing time: 0.6380026340484619
 > Real-time factor: 0.29066029092497075


Coqui TTS:  67%|██████▋   | 10/15 [00:17<00:07,  1.52s/it]

 > Text splitted to sentences.
['Fish swimming in a tank.']
 > Processing time: 0.68499755859375
 > Real-time factor: 0.30092835844342103


Coqui TTS:  73%|███████▎  | 11/15 [00:19<00:06,  1.50s/it]

 > Text splitted to sentences.
['A baby crying in a bed.']
 > Processing time: 0.764000415802002
 > Real-time factor: 0.3031638563279971


Coqui TTS:  80%|████████  | 12/15 [00:21<00:04,  1.54s/it]

 > Text splitted to sentences.
['A boy is running on the grass.']
 > Processing time: 0.7930004596710205
 > Real-time factor: 0.29181675793968626


Coqui TTS:  87%|████████▋ | 13/15 [00:22<00:03,  1.56s/it]

 > Text splitted to sentences.
['A teacher talking to a class.']
 > Processing time: 0.8086016178131104
 > Real-time factor: 0.29503682937481934


Coqui TTS:  93%|█████████▎| 14/15 [00:24<00:01,  1.58s/it]

 > Text splitted to sentences.
['Mom says no.']
 > Processing time: 0.46700000762939453
 > Real-time factor: 0.2577430458607366


Coqui TTS: 100%|██████████| 15/15 [00:25<00:00,  1.70s/it]


Coqui TTS Results:
   Samples processed: 15/15
   Exact matches: 10/15 (66.7%)
   WER: 0.1975 (19.75%)
   CER: 0.2074 (20.74%)
   Word Accuracy: 80.25%
   Character Accuracy: 79.26%

Evaluating Google TTS...


Google TTS: 100%|██████████| 15/15 [00:19<00:00,  1.28s/it]


Google TTS Results:
   Samples processed: 15/15
   Exact matches: 13/15 (86.7%)
   WER: 0.0123 (1.23%)
   CER: 0.0085 (0.85%)
   Word Accuracy: 98.77%
   Character Accuracy: 99.15%

Evaluating Meta MMS-TTS (facebook/mms-tts-eng)...


Meta MMS-TTS (facebook/mms-tts-eng): 100%|██████████| 15/15 [00:18<00:00,  1.23s/it]

Meta MMS-TTS (facebook/mms-tts-eng) Results:
   Samples processed: 15/15
   Exact matches: 6/15 (40.0%)
   WER: 0.2593 (25.93%)
   CER: 0.1562 (15.62%)
   Word Accuracy: 74.07%
   Character Accuracy: 84.38%





In [14]:
# Build one display row per model that produced metrics; models whose
# evaluation returned an empty dict are left out of the table.
summary_rows = [
    {
        "Model": metrics['model'],
        "Samples": metrics['samples_processed'],
        "Exact Match %": f"{metrics['exact_match_rate']:.1f}",
        "WER %": f"{metrics['wer']*100:.2f}",
        "CER %": f"{metrics['cer']*100:.2f}",
        "Word Acc %": f"{metrics['word_accuracy']:.2f}",
        "Char Acc %": f"{metrics['char_accuracy']:.2f}",
    }
    for metrics in results.values()
    if metrics
]

# Percentages are pre-formatted as strings, so the table is for display only.
df_summary = pd.DataFrame(summary_rows)
df_summary

Unnamed: 0,Model,Samples,Exact Match %,WER %,CER %,Word Acc %,Char Acc %
0,Coqui TTS,15,66.7,19.75,20.74,80.25,79.26
1,Google TTS,15,86.7,1.23,0.85,98.77,99.15
2,Meta MMS-TTS (facebook/mms-tts-eng),15,40.0,25.93,15.62,74.07,84.38
