In [1]:
from pathlib import Path
import pickle
import torch
import yaml
from langdetect import detect
import gdown
import json
import os
import numpy as np
import librosa
from scipy.io.wavfile import write

from modules.data import DFATokenizer, get_mask_from_batch
from utils.aligner.cleaners import english_cleaners, nonenglish_cleaners, nonenglish_cleaners_no_transliteration
from inference import LitParrot
from utils.vocoder.utils import AttrDict
from utils.vocoder.models import CodeGenerator
from utils.vocoder.dataset import MAX_WAV_VALUE

Decide whether to download the `transliterated` or the `no_transliterated` models. `no_transliterated` means that non-English characters are used as they are, without being transliterated to English characters.

We recommend setting `transliteration` to `False` for non-English languages.

In [2]:
# Selects which family of pretrained models the rest of the notebook uses:
#   False -> ParrotTTS runs on the original non-English characters.
#   True  -> ParrotTTS runs on non-English characters transliterated to
#            English characters.
transliteration: bool = False

Download symbols and checkpoints

In [3]:
# Local destinations of every artefact used by the notebook
symbols_path = "runs/aligner/symbols.pkl"
TTE_checkpoint = "runs/TTE/ckpt/parrot_model-step=11000-val_total_loss_step=0.00.ckpt"
speaker_json_path = "runs/TTE/speakers.json"

vocoder_checkpoint = "runs/vocoder/checkpoints/g_00750000"
store_processed_audio_path = "runs/vocoder/generations_tte"

# Create every directory that is written to below (no-op when it already exists)
for path in ["runs/aligner", "runs/TTE/ckpt", "runs/vocoder/checkpoints", store_processed_audio_path]:
    os.makedirs(path, exist_ok=True)


def download_from_drive(file_id, destination):
    """Download one Google Drive file to ``destination`` and return the saved path.

    Args:
        file_id: Google Drive file identifier.
        destination: Local path the file is written to.

    Returns:
        The path reported by ``gdown.download``.

    Raises:
        RuntimeError: If ``gdown.download`` reports a failure by returning ``None``.
    """
    url = f"https://drive.google.com/uc?id={file_id}"
    saved_path = gdown.download(url, destination, quiet=False)
    if saved_path is None:
        # Fail here rather than with an obscure error when the file is loaded later.
        raise RuntimeError(f"Download of {url} to {destination} failed")
    return saved_path


# The symbols, TTE checkpoint and speaker mapping depend on the transliteration
# setting; all variants are stored at the same local paths, so they are always
# re-downloaded to match the current setting.
if transliteration:
    print("Downloading models trained using transliterated characters")
    tte_file_ids = {
        symbols_path: "1D-RXBf_n1pQlJ5gvsGjjVPkDX1Ps6igp",       # symbols.pkl having just English characters
        TTE_checkpoint: "1YCILO6lRqiB9_Po-vbeJKZ-G_UxOEEgD",     # TTE checkpoint that maps characters to HuBERT units
        speaker_json_path: "1KiTYQGPPXbOgXEdw4ka6YMoyEcpFiy9p",  # speaker ID mapper
    }
else:
    print("Downloading models trained using non-transliterated (as it is) characters")
    tte_file_ids = {
        symbols_path: "1seuLFK_xYyqyYXAsHGjvGlMiqnwHa54G",       # symbols.pkl having both English and original non-English characters
        TTE_checkpoint: "1RUX3ZTHXGW5Ke6r7fwWMykN2bOGpVsss",     # TTE checkpoint that maps characters to HuBERT units
        speaker_json_path: "1k73yYtfHM2SN9x-g2c0Ep8QmamtnceJE",  # speaker ID mapper
    }

for destination, file_id in tte_file_ids.items():
    download_from_drive(file_id, destination)

# Download Hifi-GAN vocoder checkpoint trained on 10 speakers and 5 languages (English, Hindi, Gujarati, Bhojpuri, Kannada)
download_from_drive("1PTk42aTOKn6P7FgBmReXWdGj4hSz0NN8", vocoder_checkpoint)

Downloading models trained using non-transliterated (as it is) characters


Downloading...
From: https://drive.google.com/uc?id=1seuLFK_xYyqyYXAsHGjvGlMiqnwHa54G
To: /media/newhd/Neil/Parrot-TTS/runs/aligner/symbols.pkl
100%|██████████| 1.72k/1.72k [00:00<00:00, 1.02MB/s]
Downloading...
From (original): https://drive.google.com/uc?id=1RUX3ZTHXGW5Ke6r7fwWMykN2bOGpVsss
From (redirected): https://drive.google.com/uc?id=1RUX3ZTHXGW5Ke6r7fwWMykN2bOGpVsss&confirm=t&uuid=d95dfd56-767d-4fe0-b83e-1de80f4aaa14
To: /media/newhd/Neil/Parrot-TTS/runs/TTE/ckpt/parrot_model-step=11000-val_total_loss_step=0.00.ckpt
100%|██████████| 315M/315M [00:56<00:00, 5.57MB/s] 
Downloading...
From: https://drive.google.com/uc?id=1k73yYtfHM2SN9x-g2c0Ep8QmamtnceJE
To: /media/newhd/Neil/Parrot-TTS/runs/TTE/speakers.json
100%|██████████| 112/112 [00:00<00:00, 242kB/s]
Downloading...
From (original): https://drive.google.com/uc?id=1PTk42aTOKn6P7FgBmReXWdGj4hSz0NN8
From (redirected): https://drive.google.com/uc?id=1PTk42aTOKn6P7FgBmReXWdGj4hSz0NN8&confirm=t&uuid=03045088-37fc-4be4-b00d-7da4342

'runs/vocoder/checkpoints/g_00750000'

In [4]:
# Preferred GPU for inference. The index is only used when that GPU actually
# exists; otherwise the first GPU is used, and the CPU when CUDA is unavailable.
PREFERRED_GPU_INDEX = 2

if torch.cuda.is_available():
    gpu_index = PREFERRED_GPU_INDEX if torch.cuda.device_count() > PREFERRED_GPU_INDEX else 0
    device = torch.device(f"cuda:{gpu_index}")
else:
    device = torch.device("cpu")

Load symbols

In [5]:
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load a symbols file that comes from a trusted source.
with open(symbols_path, "rb") as symbols_file:
    symbols = pickle.load(symbols_file)

# Symbol ids start at 1 (presumably 0 is reserved, e.g. for padding - TODO confirm).
idx_to_token = dict(enumerate(symbols, start=1))
# Inverse mapping: symbol -> id
token_to_idx = {token: idx for idx, token in idx_to_token.items()}
print(f"token_to_idx: {token_to_idx}")

token_to_idx: {' ': 1, ',': 2, '.': 3, '?': 4, 'A': 5, 'B': 6, 'C': 7, 'D': 8, 'E': 9, 'H': 10, 'I': 11, 'L': 12, 'M': 13, 'N': 14, 'O': 15, 'P': 16, 'R': 17, 'S': 18, 'T': 19, 'U': 20, 'V': 21, 'Y': 22, 'Z': 23, 'a': 24, 'b': 25, 'c': 26, 'd': 27, 'e': 28, 'f': 29, 'g': 30, 'h': 31, 'i': 32, 'j': 33, 'k': 34, 'l': 35, 'm': 36, 'n': 37, 'o': 38, 'p': 39, 'q': 40, 'r': 41, 's': 42, 't': 43, 'u': 44, 'v': 45, 'w': 46, 'x': 47, 'y': 48, 'z': 49, '°': 50, '·': 51, 'º': 52, '½': 53, 'â': 54, 'ʼ': 55, 'ँ': 56, 'ं': 57, 'ः': 58, 'अ': 59, 'आ': 60, 'इ': 61, 'ई': 62, 'उ': 63, 'ऊ': 64, 'ऋ': 65, 'ऍ': 66, 'ऎ': 67, 'ए': 68, 'ऐ': 69, 'ऑ': 70, 'ऒ': 71, 'ओ': 72, 'औ': 73, 'क': 74, 'ख': 75, 'ग': 76, 'घ': 77, 'ङ': 78, 'च': 79, 'छ': 80, 'ज': 81, 'झ': 82, 'ञ': 83, 'ट': 84, 'ठ': 85, 'ड': 86, 'ढ': 87, 'ण': 88, 'त': 89, 'थ': 90, 'द': 91, 'ध': 92, 'न': 93, 'ऩ': 94, 'प': 95, 'फ': 96, 'ब': 97, 'भ': 98, 'म': 99, 'य': 100, 'र': 101, 'ऱ': 102, 'ल': 103, 'ळ': 104, 'व': 105, 'श': 106, 'ष': 107, 'स': 108, 'ह': 109, 'ऺ'

Write a text to be converted to speech

In [6]:
# Input sentence to be synthesised
text = "इसके लिए संरक्षित खेती, मधुमक्खी पालन इत्यादि पर अगले पंचवर्षीय योजना में महत्त्व देना होगा"

# Choose the text cleaner: English text always uses the English cleaners;
# any other language uses the cleaner matching the `transliteration` setting.
use_englishcleaners = detect(text) == 'en'
if use_englishcleaners:
    cleaners = english_cleaners
elif transliteration:
    cleaners = nonenglish_cleaners
else:
    cleaners = nonenglish_cleaners_no_transliteration
cleaned_text = cleaners(text)

# Keep only characters known to the symbol table and represent each space
# by the 'sil' token.
characters = ['sil' if c == ' ' else c for c in cleaned_text if c in token_to_idx]
print(f"characters: {characters}")

characters: ['इ', 'स', 'क', 'े', 'sil', 'ल', 'ि', 'ए', 'sil', 'स', 'ं', 'र', 'क', '्', 'ष', 'ि', 'त', 'sil', 'ख', 'े', 'त', 'ी', ',', 'sil', 'म', 'ध', 'ु', 'म', 'क', '्', 'ख', 'ी', 'sil', 'प', 'ा', 'ल', 'न', 'sil', 'इ', 'त', '्', 'य', 'ा', 'द', 'ि', 'sil', 'प', 'र', 'sil', 'अ', 'ग', 'ल', 'े', 'sil', 'प', 'ं', 'च', 'व', 'र', '्', 'ष', 'ी', 'य', 'sil', 'य', 'ो', 'ज', 'न', 'ा', 'sil', 'म', 'े', 'ं', 'sil', 'म', 'ह', 'त', '्', 'त', '्', 'व', 'sil', 'द', 'े', 'न', 'ा', 'sil', 'ह', 'ो', 'ग', 'ा']


Run TTE module: Predict HuBERT codes from input characters

In [7]:
# Load config file; the context manager closes the file handle after parsing
with open("utils/TTE/TTE_config.yaml", "r") as config_stream:
    data_config = yaml.load(config_stream, Loader=yaml.FullLoader)

# init the model
# NOTE(review): the torch.load warning recorded below this cell reports
# weights_only=False, so this keyword does not seem to reach torch.load in the
# installed Lightning version - confirm before relying on it for safety.
model = LitParrot.load_from_checkpoint(TTE_checkpoint,weights_only=True)

# Move model to the correct device and switch to inference mode
# (disables dropout and other train-only behaviour)
model = model.to(device)
model.eval()

# Load tokenizer
tokenizer = DFATokenizer(Path(data_config["path"]["alignment_path"]))

if not characters:
    raise ValueError("No characters left after text cleaning; nothing to synthesise")

# Tokenise and pad once; the padded batch is reused for both 'phones' and 'src_mask'
phone_ids = torch.tensor(tokenizer.tokenize(list(characters)), dtype=torch.long)
phones = torch.nn.utils.rnn.pad_sequence([phone_ids], batch_first=True, padding_value=tokenizer.pad_idx)

# Single-element dummy sequence for 'codes' and 'duration'
# (presumably only placeholders required by the batch format - TODO confirm)
placeholder = torch.tensor([1], dtype=torch.long)

# Manually build a batch of size one
data = {
    'ids': 'random',
    'speaker': torch.tensor([0], dtype=torch.long),
    'phones': phones,
    'codes': torch.nn.utils.rnn.pad_sequence(
        [placeholder], batch_first=True, padding_value=data_config["preprocess"]["hubert_codes"]
    ),
    'duration': torch.nn.utils.rnn.pad_sequence([placeholder], batch_first=True),
    'src_mask': get_mask_from_batch(phones, tokenizer.pad_idx)
}

# Infer using TTE model
with torch.no_grad():  # Disable gradient calculations for inference
    # Move batch to the same device as the model
    batch = {key: value.to(device) if isinstance(value, torch.Tensor) else value for key, value in data.items()}

    # Perform inference; the first element of the result is joined into a
    # space-separated string of codes
    codes = ' '.join(map(str, model.infer(batch)[0]))

/home/vishal/anaconda3/envs/parrottts/lib/python3.8/site-packages/lightning/fabric/utilities/cloud_io.py:57: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.


Run vocoder module: generate speech from predicted HuBERT codes

In [8]:
# load config file (parsed straight from the file handle)
config_file = "utils/vocoder/config.json"
with open(config_file) as f:
    json_config = json.load(f)
h = AttrDict(json_config)

# load checkpoint
# NOTE(review): torch.load unpickles the checkpoint; consider weights_only=True
# if the installed torch version supports it and the checkpoint loads with it.
generator = CodeGenerator(h).to(device)
state_dict_g = torch.load(vocoder_checkpoint, map_location=device)
generator.load_state_dict(state_dict_g['generator'])
# Inference only: disable dropout and other train-only behaviour
generator.eval()

# Preprocess codes: the space-separated code string becomes a (1, T) integer tensor
result = {
    'code': torch.tensor(np.array([int(num) for num in codes.split()])).unsqueeze(0).to(device),
    'spkr': torch.tensor([0]).unsqueeze(0).to(device)  # Shape: (1, 1)
}

# Speaker name -> speaker id understood by the vocoder
spkr_to_id = {'bho_f': 0, 'bho_m': 1, 'en_f': 2, 'en_m': 3, 'gu_f': 4, 'gu_m': 5, 'hi_f': 6, 'hi_m': 7, 'kn_f': 8, 'kn_m': 9}
local_spkrs = list(spkr_to_id.values())

def generate(codess):
    """Run the vocoder on one set of inputs and return 16-bit integer samples.

    Args:
        codess: Keyword arguments forwarded to ``generator`` (the notebook
            passes a dict with the keys ``'code'`` and ``'spkr'``).

    Returns:
        A NumPy ``int16`` array holding the squeezed generator output scaled
        by ``MAX_WAV_VALUE``.
    """
    # Inference only: skip building the autograd graph for the forward pass
    with torch.no_grad():
        y_g_hat = generator(**codess)
    # When the generator returns a tuple, the waveform is its first element
    if isinstance(y_g_hat, tuple):
        y_g_hat = y_g_hat[0]
    audio = y_g_hat.squeeze() * MAX_WAV_VALUE
    return audio.detach().cpu().numpy().astype('int16')

print(f"Generating audios at {store_processed_audio_path}")

# Synthesise the same code sequence once per speaker. Iterating over the
# name -> id mapping gives the speaker name directly, without a reverse lookup.
for speaker_name, speaker_id in spkr_to_id.items():
    # Same codes, different speaker id; shape (1, 1) like the initial 'spkr' entry
    speaker_inputs = {**result, 'spkr': torch.tensor([speaker_id]).unsqueeze(0).to(device)}
    audio = generate(speaker_inputs)

    output_file = os.path.join(store_processed_audio_path, f'result_{speaker_name}_gen.wav')
    # Convert to float32 and normalise before writing the WAV file
    audio = librosa.util.normalize(audio.astype(np.float32))
    write(output_file, h.sampling_rate, audio)


  WeightNorm.apply(module, name, dim)
  state_dict_g = torch.load(vocoder_checkpoint, map_location=device)


Generating audios at runs/vocoder/generations_tte
