In [2]:
from pathlib import Path
import pickle
import torch
from torch.utils.data import DataLoader
import yaml
from langdetect import detect
import lightning as L
from IPython.display import Audio

from modules import ParrotDataset, Parrot
from utils.aligner.cleaners import english_cleaners, nonenglish_cleaners

In [3]:
# Select the device used for inference.
# GPU index 2 is preferred, but that ordinal only exists when at least three
# GPUs are visible. Requesting it on a smaller machine would only fail later,
# when the model is moved to the device, so fall back to the first GPU and
# finally to the CPU.
preferred_gpu_index = 2
if torch.cuda.is_available() and torch.cuda.device_count() > preferred_gpu_index:
    device = torch.device(f'cuda:{preferred_gpu_index}')
elif torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device("cpu")

Load symbols

In [4]:
# Read the pickled symbol inventory and build both lookup directions.
# Indices start at 1, so index 0 is never assigned to a symbol.
# NOTE(review): pickle executes arbitrary code on load; only use a symbols
# file that comes from a trusted source.
symbols = pickle.loads(Path("runs/aligner/symbols.pkl").read_bytes())
idx_to_token = dict(enumerate(symbols, start=1))
token_to_idx = {token: idx for idx, token in idx_to_token.items()}

Write the text to be converted to speech

In [10]:
text = "Hello my name is Parrot-TTS system. This is my resting place. Please feel free to invoke me at your convenience."

# Choose the cleaner from the detected language: English text goes through
# english_cleaners, everything else through nonenglish_cleaners.
use_englishcleaners = detect(text) == 'en'
cleaned_text = (english_cleaners if use_englishcleaners else nonenglish_cleaners)(text)

# Keep only characters present in the symbol table, then spell each space as
# the 'sil' token.
characters = [
    'sil' if char == ' ' else char
    for char in cleaned_text
    if char in token_to_idx
]
print(f"characters: {characters}")

characters: ['h', 'e', 'l', 'l', 'o', 'sil', 'm', 'y', 'sil', 'n', 'a', 'm', 'e', 'sil', 'i', 's', 'sil', 'p', 'a', 'r', 'r', 'o', 't', 't', 't', 's', 'sil', 's', 'y', 's', 't', 'e', 'm', '.', 'sil', 't', 'h', 'i', 's', 'sil', 'i', 's', 'sil', 'm', 'y', 'sil', 'r', 'e', 's', 't', 'i', 'n', 'g', 'sil', 'p', 'l', 'a', 'c', 'e', '.', 'sil', 'p', 'l', 'e', 'a', 's', 'e', 'sil', 'f', 'e', 'e', 'l', 'sil', 'f', 'r', 'e', 'e', 'sil', 't', 'o', 'sil', 'i', 'n', 'v', 'o', 'k', 'e', 'sil', 'm', 'e', 'sil', 'a', 't', 'sil', 'y', 'o', 'u', 'r', 'sil', 'c', 'o', 'n', 'v', 'e', 'n', 'i', 'e', 'n', 'c', 'e', '.']


Run TTE module: Predict HuBERT codes from input characters

In [6]:
# Load the TTE configuration.
# The file is opened in a context manager so the handle is closed as soon as
# parsing finishes instead of staying open for the life of the kernel.
with open("utils/TTE/TTE_config.yaml", "r") as config_file:
    data_config = yaml.load(config_file, Loader=yaml.FullLoader)

# Model
class LitParrot(L.LightningModule):
    """Lightning wrapper around the ``Parrot`` network; this notebook only uses it for inference."""

    def __init__(self, data_config, src_vocab_size, src_pad_idx):
        """Create the wrapped ``Parrot`` model.

        The three arguments are passed unchanged to ``Parrot``.
        ``save_hyperparameters()`` is called before the model is built.
        """
        super().__init__()
        self.save_hyperparameters()
        self.parrot = Parrot(data_config, src_vocab_size, src_pad_idx)

    def infer(self, batch):
        """Put the module in eval mode and return ``self.parrot.infer(batch)``."""
        self.eval()
        return self.parrot.infer(batch)
    
# Stage a one-entry manifest at "runs/TTE/val.txt" for the dataset to read.
# Only 'characters' carries real input here; the audio/hubert/duration values
# look like placeholders (presumably unused at inference; TODO confirm).
data_dict = {
    'audio': 'random.wav',
    'hubert': '1 2 3',
    'duration': "1 2 3",
    'speaker': 'en_m',
    'characters': " ".join(characters),
}
processed_lines = [data_dict]

# One dict repr per line.
with open("runs/TTE/val.txt", 'w') as f:
    f.writelines(str(entry) + "\n" for entry in processed_lines)

# Build the "val" dataset and wrap it in a loader that yields one sample per
# batch, using the dataset's own collate function.
# (Presumably the "val" split is read from the manifest written above; verify
# against ParrotDataset.)
val_dataset = ParrotDataset("val", data_config=data_config)
val_loader = DataLoader(
    val_dataset, batch_size=1, num_workers=4, collate_fn=val_dataset.collate_fn
)

# Path of the trained TTE checkpoint
checkpoint = "runs/TTE/ckpt/parrot_model-step=11000-val_total_loss_step=0.00.ckpt"

# Restore the Lightning module from the checkpoint and place it on the
# inference device in one step.
# NOTE(review): the saved output of this cell still shows the torch.load
# `weights_only=False` warning; confirm that the installed Lightning version
# actually honours the `weights_only` argument passed here.
model = LitParrot.load_from_checkpoint(checkpoint, weights_only=True).to(device)

# Take the single batch from the loader
batch = next(iter(val_loader))

with torch.no_grad():  # inference only, no gradients needed
    # Tensors must be on the same device as the model; other entries pass
    # through untouched.
    batch = {
        name: (item.to(device) if isinstance(item, torch.Tensor) else item)
        for name, item in batch.items()
    }

    # First element of the inference result, rendered as space-separated codes
    codes = ' '.join(str(code) for code in model.infer(batch)[0])

# Record for the vocoder: predicted codes under 'hubert'
data_dict = {'audio': "random.wav", 'hubert': codes, 'duration': 0}
processed_lines = [data_dict]

# Persist the predictions, one dict repr per line
with open("runs/TTE/predictions.txt", 'w') as f:
    f.writelines(str(entry) + "\n" for entry in processed_lines)

/home/vishal/anaconda3/envs/parrottts/lib/python3.8/site-packages/lightning/fabric/utilities/cloud_io.py:57: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.


Run the vocoder model: synthesize speech from the codes in runs/TTE/predictions.txt

In [7]:
# Run the vocoder inference script as a shell command, passing it the code file
# runs/TTE/predictions.txt and the output directory runs/vocoder/generations_tte.
# NOTE(review): the saved output of this cell ends in a worker traceback at
# `CodeGenerator(h).to(idx)` inside utils/vocoder/inference.py; confirm the
# command completes on the target machine before relying on the generated audio.
!python utils/vocoder/inference.py --checkpoint_file runs/vocoder/checkpoints --vc --input_code_file runs/TTE/predictions.txt --output_dir runs/vocoder/generations_tte

Initializing Inference Process..
id_to_spkr: ['random.wav']
spkr_to_id: {'random.wav': 0}
  WeightNorm.apply(module, name, dim)
  WeightNorm.apply(module, name, dim)
  WeightNorm.apply(module, name, dim)
  WeightNorm.apply(module, name, dim)
  WeightNorm.apply(module, name, dim)
  WeightNorm.apply(module, name, dim)
  WeightNorm.apply(module, name, dim)
  WeightNorm.apply(module, name, dim)
Process ForkPoolWorker-7:
Traceback (most recent call last):
  File "/home/vishal/anaconda3/envs/parrottts/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/home/vishal/anaconda3/envs/parrottts/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/vishal/anaconda3/envs/parrottts/lib/python3.8/multiprocessing/pool.py", line 109, in worker
    initializer(*initargs)
  File "utils/vocoder/inference.py", line 105, in init_worker
    generator = CodeGenerator(h).to(idx)
  File "/home/vishal/anaconda

In [9]:
# The vocoder command is asked to write its output to runs/vocoder/generations_tte.
# Point audio_path at any audio file generated in that directory.
audio_path = 'runs/vocoder/generations_tte/random_en_m_gen.wav'
# Left as the last expression of the cell so the notebook displays the Audio object.
Audio(audio_path)