In [1]:
from pathlib import Path
import pickle
import torch
from torch.utils.data import DataLoader
import yaml
from langdetect import detect
import lightning as L
import gdown
import json
import os
from IPython.display import Audio
import numpy as np
import time
import librosa
from scipy.io.wavfile import write

from modules.data import DFATokenizer, get_mask_from_batch
from utils.aligner.cleaners import english_cleaners, nonenglish_cleaners
from inference import LitParrot
from utils.vocoder.utils import AttrDict
from utils.vocoder.models import CodeGenerator
from utils.vocoder.dataset import MAX_WAV_VALUE

In [12]:
# Locations of the checkpoints and the symbol pickle trained on Limmits data.
# All paths are relative to the notebook's working directory.
symbols_path = "runs/aligner/symbols.pkl"
TTE_checkpoint = "runs/TTE/ckpt/parrot_model-step=11000-val_total_loss_step=0.00.ckpt"
vocoder_checkpoint = "runs/vocoder/checkpoints/g_00750000"
speaker_json_path = "runs/TTE/speakers.json"

# Directory that receives the generated .wav files.
store_processed_audio_path = "runs/vocoder/generations"
# exist_ok=True keeps the cell idempotent and avoids the check-then-create race
# of a separate os.path.exists() test.
os.makedirs(store_processed_audio_path, exist_ok=True)

In [3]:
# Create necessary directories if they don't exist
for path in ["runs/aligner", "runs/TTE/ckpt", "runs/vocoder/checkpoints"]:
    os.makedirs(path, exist_ok=True)

# Google Drive file id -> local destination for every pretrained artifact.
pretrained_files = {
    "1D-RXBf_n1pQlJ5gvsGjjVPkDX1Ps6igp": symbols_path,
    "1PTk42aTOKn6P7FgBmReXWdGj4hSz0NN8": vocoder_checkpoint,
    "1YCILO6lRqiB9_Po-vbeJKZ-G_UxOEEgD": TTE_checkpoint,
    "1KiTYQGPPXbOgXEdw4ka6YMoyEcpFiy9p": speaker_json_path,
}

for file_id, destination in pretrained_files.items():
    # Skip artifacts that are already on disk so "Restart & Run All" does not
    # re-download the checkpoints; delete the file to force a refresh.
    if os.path.exists(destination):
        print(f"Already present, skipping download: {destination}")
        continue

    # Google Drive direct download link
    url = f"https://drive.google.com/uc?id={file_id}"
    # Fail here rather than later with a confusing "file not found" when the
    # download did not produce a file.
    if gdown.download(url, destination, quiet=False) is None:
        raise RuntimeError(f"Download failed for {url} -> {destination}")

In [4]:
# Preferred GPU index. The original expression also tested `'cuda' in 'cuda:2'`,
# which is always true, and selected cuda:2 even on hosts with fewer than three
# GPUs (which then fails as soon as a tensor is moved to the device).
preferred_gpu_index = 2
if torch.cuda.is_available():
    # Fall back to the first GPU when the preferred index does not exist.
    gpu_index = preferred_gpu_index if preferred_gpu_index < torch.cuda.device_count() else 0
    device = torch.device(f"cuda:{gpu_index}")
else:
    device = torch.device("cpu")

Load symbols

In [5]:
# SECURITY NOTE: pickle.load can execute arbitrary code while unpickling.
# The symbols file is downloaded from Google Drive, so only run this cell on a
# file whose origin you trust.
with open(symbols_path, "rb") as symbols_file:
    symbols = pickle.load(symbols_file)

# Token ids start at 1 (index 0 is left unassigned by this mapping).
idx_to_token = dict(enumerate(symbols, start=1))
# Inverse mapping, derived from idx_to_token: token -> id.
token_to_idx = {token: idx for idx, token in idx_to_token.items()}

Enter the text to be converted to speech

In [6]:
text = "Hello my name is Parrot-TTS. This is my resting place. Please feel free to invoke me at your convenience."

# Pick the cleaner from the detected language: English text goes through
# english_cleaners, everything else through nonenglish_cleaners.
use_englishcleaners = detect(text) == 'en'
if use_englishcleaners:
    cleaned_text = english_cleaners(text)
else:
    cleaned_text = nonenglish_cleaners(text)

# Keep only characters that exist in the symbol table, then spell every
# remaining space as the 'sil' token.
characters = ['sil' if ch == ' ' else ch for ch in cleaned_text if ch in token_to_idx]
print(f"characters: {characters}")

characters: ['h', 'e', 'l', 'l', 'o', 'sil', 'm', 'y', 'sil', 'n', 'a', 'm', 'e', 'sil', 'i', 's', 'sil', 'p', 'a', 'r', 'r', 'o', 't', 't', 'e', 'x', 't', 'sil', 't', 'o', 'sil', 's', 'p', 'e', 'e', 'c', 'h', 'sil', 't', 'h', 'i', 's', 'sil', 'i', 's', 'sil', 'm', 'y', 'sil', 'r', 'e', 's', 't', 'i', 'n', 'g', 'sil', 'p', 'l', 'a', 'c', 'e', '.', 'sil', 'p', 'l', 'e', 'a', 's', 'e', 'sil', 'f', 'e', 'e', 'l', 'sil', 'f', 'r', 'e', 'e', 'sil', 't', 'o', 'sil', 'i', 'n', 'v', 'o', 'k', 'e', 'sil', 'm', 'e', 'sil', 'a', 't', 'sil', 'y', 'o', 'u', 'r', 'sil', 'c', 'o', 'n', 'v', 'e', 'n', 'i', 'e', 'n', 'c', 'e', '.']


Run TTE module: Predict HuBERT codes from input characters

In [7]:
# Load config file (context manager so the file handle is closed promptly)
with open("utils/TTE/TTE_config.yaml", "r") as config_fh:
    data_config = yaml.load(config_fh, Loader=yaml.FullLoader)

# init the model
# NOTE(review): the recorded cell output still shows torch.load warning about
# weights_only=False, so this keyword does not appear to reach torch.load in
# the installed Lightning version — confirm against the Lightning release in use.
model = LitParrot.load_from_checkpoint(TTE_checkpoint, weights_only=True)

# Move model to the correct device and switch to inference mode so that
# train-only layers (dropout, batch-norm updates) do not perturb the output.
model = model.to(device)
model.eval()

# Load tokenizer
tokenizer = DFATokenizer(Path(data_config["path"]["alignment_path"]))

# Nothing to synthesize if text cleaning removed every character.
if not characters:
    raise ValueError("No characters left after text cleaning; nothing to synthesize.")

# Tokenize once and reuse the tensor for both 'phones' and 'src_mask'.
# pad_sequence on a single sequence only adds the batch dimension.
phones = torch.nn.utils.rnn.pad_sequence(
    [torch.tensor(tokenizer.tokenize(list(characters)), dtype=torch.long)],
    batch_first=True,
    padding_value=tokenizer.pad_idx,
)

# Build the single-item batch by hand. 'codes' and 'duration' are one-element
# placeholders of shape (1, 1); presumably they are only needed to satisfy the
# batch schema at inference time — TODO confirm against LitParrot.infer.
data = {
    'ids': 'random',
    'speaker': torch.tensor([0], dtype=torch.long),
    'phones': phones,
    'codes': torch.tensor([[1]], dtype=torch.long),
    'duration': torch.tensor([[1]], dtype=torch.long),
    'src_mask': get_mask_from_batch(phones, tokenizer.pad_idx),
}

# Infer using TTE model
with torch.no_grad():  # Disable gradient calculations for inference
    # Move batch to the same device as the model (non-tensor entries pass through)
    batch = {key: value.to(device) if isinstance(value, torch.Tensor) else value for key, value in data.items()}

    # Perform inference: first item of the batch, serialized as a
    # space-separated string of codes for the vocoder cell.
    codes = ' '.join(map(str, model.infer(batch)[0]))

/home/vishal/anaconda3/envs/parrottts/lib/python3.8/site-packages/lightning/fabric/utilities/cloud_io.py:57: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.


Run vocoder module: generate speech from predicted HuBERT codes

In [13]:
# load config file
# json.load reads straight from the handle, so the TTE batch dict `data` is no
# longer overwritten with the raw config string.
config_file = "utils/vocoder/config.json"
with open(config_file) as f:
    json_config = json.load(f)
h = AttrDict(json_config)

# load checkpoint
generator = CodeGenerator(h).to(device)
# weights_only=True restricts unpickling to tensors and plain containers. The
# checkpoint is a downloaded file, and only its 'generator' state dict is read.
state_dict_g = torch.load(vocoder_checkpoint, map_location=device, weights_only=True)
generator.load_state_dict(state_dict_g['generator'])
# Inference mode: disable train-only behavior such as dropout.
generator.eval()

# Preprocess codes: parse the space-separated code string into a (1, T) tensor
result = {
    'code': torch.tensor([int(num) for num in codes.split()], dtype=torch.long).unsqueeze(0).to(device),
    'spkr': torch.tensor([0]).unsqueeze(0).to(device)  # Shape: (1, 1)
}

# Speaker name -> speaker id.
# NOTE(review): speaker_json_path is downloaded but never read; presumably it
# holds this same mapping — confirm and load it instead of hard-coding.
spkr_to_id = {'bho_f': 0, 'bho_m': 1, 'en_f': 2, 'en_m': 3, 'gu_f': 4, 'gu_m': 5, 'hi_f': 6, 'hi_m': 7, 'kn_f': 8, 'kn_m': 9}
local_spkrs = list(spkr_to_id.values())

def generate(codess):
    """Run the vocoder on one set of inputs and return 16-bit PCM samples.

    Args:
        codess: mapping of keyword arguments forwarded to ``generator``
            (the notebook passes ``'code'`` and ``'spkr'`` tensors).

    Returns:
        numpy.ndarray of dtype int16: the generator output with singleton
        dimensions squeezed out, scaled by ``MAX_WAV_VALUE``.
    """
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        y_g_hat = generator(**codess)
    # Some generator variants return a tuple; the waveform is the first item.
    if isinstance(y_g_hat, tuple):
        y_g_hat = y_g_hat[0]
    audio = y_g_hat.squeeze() * MAX_WAV_VALUE
    # Clamp to the int16 range before casting; out-of-range floats would
    # otherwise wrap around and produce loud clicks.
    int16_range = torch.iinfo(torch.int16)
    audio = audio.clamp(int16_range.min, int16_range.max)
    return audio.detach().cpu().numpy().astype('int16')

print(f"Generating audios at {store_processed_audio_path}")

# Reverse lookup (speaker id -> speaker name) built once instead of scanning
# spkr_to_id on every iteration. setdefault keeps the first name for an id,
# matching the original first-match lookup.
id_to_spkr = {}
for spkr_name, spkr_id in spkr_to_id.items():
    id_to_spkr.setdefault(spkr_id, spkr_name)

# Synthesize the same code sequence once per speaker id.
for k in local_spkrs:
    # Fresh input dict per speaker so the shared `result` dict is not mutated.
    spkr_inputs = dict(result, spkr=torch.tensor([k]).unsqueeze(0).to(device))
    audio = generate(spkr_inputs)

    key_found = id_to_spkr.get(k)  # None when the id has no known name
    output_file = os.path.join(store_processed_audio_path, f'result_{key_found}_gen.wav')
    # Peak-normalize as float32; the file is therefore written as a float WAV.
    audio = librosa.util.normalize(audio.astype(np.float32))
    write(output_file, h.sampling_rate, audio)


  state_dict_g = torch.load(vocoder_checkpoint, map_location=device)
