In [None]:
from inference import StyleTTS2

import librosa
import IPython.display as ipd
import torch.cuda

# Prefer the GPU when PyTorch reports a usable CUDA device; otherwise run on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\catto\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm


### Load models

In [None]:
# Relative paths to the model configuration (YAML) and the checkpoint file;
# both are passed to the StyleTTS2 constructor later in the notebook.
config_path, models_path = "Models/config.yml", "Models/model.pth"

### Synthesize speech

A few notes:

- You don't need to add language tokens everywhere; espeak can detect and handle the language automatically most of the time.

- Because of my preprocessing method, the model speaks at a slower rate. Set the speed value above 1 to restore a normal speaking rate.

- Reference audio has a huge impact on the result. It is best to choose a clip that is around 10 seconds long and consistent in both tone and speed.

- I would not set randomness, smooth_dur, or t_denoise too high, as they can distort the audio.

- For high‑quality reference audio, I recommend setting denoise to False.

In [None]:
# Reference voices, keyed by the speaker tag used in the input text.
# Each entry provides:
#   path  - reference audio file
#   lang  - default language
#   speed - speaking speed
speakers = dict(
    id_1=dict(path="./reference_audio/vn_2.wav", lang="vi", speed=1.2),
    id_2=dict(path="./reference_audio/vn_4.wav", lang="vi", speed=1.2),
)
# Preview every speaker's reference clip: load it at 24 kHz, trim leading and
# trailing silence, cap it at 30 seconds, and render an inline audio player.
ref_sample_rate = 24000              # Hz, used for both loading and playback
max_samples = ref_sample_rate * 30   # max 30 seconds of reference audio

for speaker in speakers.values():    # iterating values avoids shadowing the builtin `id`
    ref_path = speaker['path']
    print(ref_path)
    # librosa.load resamples to `sr` while loading, so the returned rate always
    # equals ref_sample_rate and no separate librosa.resample call is needed
    # (the previous positional call was also invalid on librosa >= 0.10, where
    # orig_sr/target_sr are keyword-only).
    wave, _ = librosa.load(ref_path, sr=ref_sample_rate)
    # Trim leading/trailing audio more than 30 dB below the peak; the returned
    # sample interval is not needed here.
    audio, _ = librosa.effects.trim(wave, top_db=30)
    audio = audio[:max_samples]      # no-op when the clip is already short enough
    display(ipd.Audio(audio, rate=ref_sample_rate, normalize=True))

./reference_audio/vn_2.wav


./reference_audio/vn_4.wav


In [None]:
# Input script to synthesize. The `[id_1]` / `[id_2]` tags correspond to the keys of `speakers`.
# NOTE(review): `[en-us]{...}` looks like the language-token markup for English spans inside the
# otherwise Vietnamese text — confirm the exact syntax against the StyleTTS2 inference code.
text = '''
[id_1][en-us]{What's up hommie}, dạo này đang học tí [en-us]{English}. Thấy bảo [en-us]{Building a strong vocabulary} khá là quan trọng. [en-us]{Bro} thấy sao?

[id_2][en-us]{That's right}, tôi thấy [en-us]{bro} nên bắt đầu với việc đọc sách và báo tiếng Anh để quen với cách sử dụng từ, cũng như tập trung vào [en-us]{listening exercises} để cải thiện khả năng nghe.

[id_1]Nghe nói rằng [en-us]{speaking practice} là bước quan trọng để giao tiếp tự tin. [en-us]{Bro} có muốn luyện tập với tôi không?

[id_2][en-us]{For sure my hommie} à, cứ cho mình cái hẹn nhé.
'''

In [None]:
# Build the synthesizer from the config and checkpoint, switch it to eval mode,
# and move it to the selected device.
model = StyleTTS2(config_path, models_path).eval().to(device)

# Generation settings passed to model.generate(...).
# STR: speaker tag used when no speaker id is provided in the input text.
default_speaker = "[id_1]"
# BOOL: split the reference audio and calculate the average of the styles.
avg_style = True
# BOOL: stabilize the speaking speed.
stabilize = True
# FLOAT: strength of the denoiser; value range is [0, 1].
denoise = 0.6
# INT: avoid short sentences by merging when a sentence has fewer than n words.
n_merge = 16

  WeightNorm.apply(module, name, dim)


decoder : 54289492
predictor : 16194612
text_encoder : 5612032
style_encoder : 13845440

Total : 89941576


In [None]:
# Collect the positional arguments for generate() in the order it is called with.
generation_args = (text, speakers, avg_style, stabilize, denoise, n_merge, default_speaker)

# Gradients are not needed for synthesis, so run with autograd disabled.
with torch.no_grad():
    r = model.generate(*generation_args)

# Announce and play the synthesized audio inline (24 kHz playback rate).
print('Synthesized:')
display(ipd.Audio(r, rate=24000, normalize=True))

Computing the style for: ./reference_audio/vn_2.wav
Computing the style for: ./reference_audio/vn_4.wav
Generating Audio...
Synthesized:
