In [52]:
# Languages to process (only "de" and "en" currently). Each code is used as a
# sub-directory name under voices/references/ and voices/tts/ by the loading
# cells below, and as a key in every per-language dictionary.
languages = ["de", "en"] 

In [53]:
# Build up a database of all reference voices and all TTS voices

import os
import librosa
import numpy as np
from tqdm import tqdm

# Common sample rate (Hz). extract_mfcc() passes this to librosa.load(), so
# every file is resampled to the same rate before its MFCCs are computed.
sample_rate = 44100

def extract_mfcc(audio_path, n_mfcc=13):
    """Load an audio file and compute its MFCC features.

    The file is decoded with ``librosa.load`` at the notebook-wide
    ``sample_rate`` so that features from different files are computed at
    the same rate.

    Args:
        audio_path: Path of the audio file to read.
        n_mfcc: Number of MFCC coefficients to compute.

    Returns:
        The array returned by ``librosa.feature.mfcc`` for the loaded signal.
    """
    # librosa.load() hands back the rate it resampled to (the one requested
    # here), so forward that value instead of re-reading the global.
    signal, loaded_rate = librosa.load(audio_path, sr=sample_rate)
    return librosa.feature.mfcc(y=signal, sr=loaded_rate, n_mfcc=n_mfcc)

In [54]:
# Load reference voices
# Result layout: reference_voices[lang][name] ->
#     {"name", "language", "gender", "mfcc"}
reference_voices = {}
for lang in languages:
    reference_voices[lang] = {}
    voices_dir = "voices/references/" + lang + "/"
    # os.listdir() returns entries in arbitrary order. Sort them so the
    # dictionary (and anything later derived from its iteration order) is
    # built in the same order on every machine and every run.
    for file_ in tqdm(sorted(os.listdir(voices_dir))):
        if not file_.endswith(".wav"):
            continue
        # Strip the ".wav" suffix; the stem is "<name>_<gender>[_...]".
        # Only the first two "_"-separated fields are used.
        file = file_[:-4]
        fields = file.split("_")
        name = fields[0]
        gender = fields[1]

        mfcc = extract_mfcc(voices_dir + file_)
        # Keyed by character name: a later file with the same name in the
        # same language replaces the earlier entry.
        reference_voices[lang][name] = {
            "name": name,
            "language": lang,
            "gender": gender,
            "mfcc": mfcc
        }

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 77/77 [00:02<00:00, 37.13it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 77/77 [00:01<00:00, 38.96it/s]


In [55]:
# Load TTS voices
# Result layout: tts_voices[lang][stem] ->
#     {"name", "model", "language", "gender", "speaker", "mfcc"}
tts_voices = {}
for lang in languages:
    tts_voices[lang] = {}
    voices_dir = "voices/tts/" + lang + "/"

    # os.listdir() returns entries in arbitrary order. The matching step
    # iterates this dictionary and keeps the first voice on a distance tie,
    # so sort the listing to make the result reproducible.
    for file_ in tqdm(sorted(os.listdir(voices_dir))):
        if not file_.endswith(".wav"):
            continue
        file = file_[:-4]
        # The stem must consist of exactly four "_"-separated fields:
        # <model>_<language>_<speaker>_<gender>. The language field (mlang)
        # is parsed but not stored; the directory's language is recorded.
        model, mlang, speaker, gender = file.split("_")

        mfcc = extract_mfcc(voices_dir + file_)
        tts_voices[lang][file] = {
            "name": file,
            "model": model,
            "language": lang,
            "gender": gender,
            "speaker": speaker,
            "mfcc": mfcc
        }

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 237/237 [00:05<00:00, 40.59it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 110/110 [00:02<00:00, 47.80it/s]


In [56]:
def pad_or_truncate_mfcc(mfcc, target_shape):
    """Force the second axis of ``mfcc`` to the length given by ``target_shape``.

    Args:
        mfcc: 2-D array; only its second axis is adjusted.
        target_shape: Shape tuple whose element ``[1]`` is the wanted length
            of the second axis. Element ``[0]`` is not consulted.

    Returns:
        ``mfcc`` itself when the length already matches, a slice of its
        leading columns when it is too long, or a zero-padded copy (padding
        appended at the end) when it is too short.
    """
    wanted = target_shape[1]
    current = mfcc.shape[1]

    if current == wanted:
        return mfcc
    if current > wanted:
        # Too long: keep only the leading columns.
        return mfcc[:, :wanted]
    # Too short: append zero columns on the right.
    missing = wanted - current
    return np.pad(mfcc, ((0, 0), (0, missing)), mode='constant')

# Match each character to a TTS voice
# For every reference voice, choose the TTS voice of the same language and
# gender whose MFCC matrix is closest to the reference. The TTS matrix is
# first padded/truncated to the reference's length; distance is the default
# np.linalg.norm of the element-wise difference. The winner is stored under
# reference["tts"].
for lang in languages:
    for c_name in reference_voices[lang]:
        reference = reference_voices[lang][c_name]
        c_gender = reference["gender"]
        c_mfcc = reference["mfcc"]

        best = None
        best_score = float("inf")

        for tts_name in tts_voices[lang]:
            tts = tts_voices[lang][tts_name]

            # Only voices of the same gender are candidates; skip the rest
            # before doing any array work.
            if tts["gender"] != c_gender:
                continue

            tts_mfcc = pad_or_truncate_mfcc(tts["mfcc"], c_mfcc.shape)
            distance = np.linalg.norm(c_mfcc - tts_mfcc)

            # Strict "<" keeps the first voice encountered on a tie.
            if distance < best_score:
                best = tts
                best_score = distance

        if best is None:
            # Previously None was stored silently and the export cell failed
            # later with an opaque "'NoneType' object is not subscriptable".
            # Fail here instead, naming the character that has no match.
            raise ValueError(
                f"No TTS voice with gender {c_gender!r} could be matched to "
                f"character {c_name!r} for language {lang!r}"
            )

        # print(c_name, best["name"], best_score)
        reference["tts"] = best
        

In [57]:
# Build characters database
# characters[name] -> {"name", "gender", "tts": {lang: {"model", "language",
# "speaker"}}}. A character's entry is created the first time its name is
# seen, so "gender" comes from the first language in which it appears.
import json

characters = {}

for lang in languages:
    for c_name, reference in reference_voices[lang].items():
        entry = characters.setdefault(
            c_name,
            {
                "name": c_name,
                "gender": reference["gender"],
                "tts": {},
            },
        )
        matched = reference["tts"]
        # Keep only the fields needed to identify the voice; the MFCC array
        # is not JSON-serialisable and is left out.
        entry["tts"][lang] = {
            "model": matched["model"],
            "language": matched["language"],
            "speaker": matched["speaker"],
        }

with open("../lfffxivtts/resources/characters.json", "w") as f:
    json.dump(characters, f, indent=4)

In [58]:
# Export voices database
# Writes, per language, the list of TTS voice records without their "mfcc"
# arrays, naturally sorted by voice name.
import json

from natsort import natsorted

tts_voices_ = {}

for lang in languages:
    # Copy every record minus its "mfcc" entry; the originals in tts_voices
    # are left untouched.
    without_mfcc = [
        {field: value for field, value in voice.items() if field != "mfcc"}
        for voice in tts_voices[lang].values()
    ]
    tts_voices_[lang] = natsorted(without_mfcc, key=lambda x: x["name"])

with open("../lfffxivtts/resources/voices.json", "w") as f:
    json.dump(tts_voices_, f, indent=4)