In [1]:
from pathlib import Path
import pickle
import torch
from torch.utils.data import DataLoader
import yaml
from langdetect import detect
import lightning as L
from IPython.display import Audio

from modules import ParrotDataset, Parrot
from utils.aligner.cleaners import english_cleaners, nonenglish_cleaners

In [2]:
# Inference device: GPU index 2 when it really exists, otherwise the CPU.
# Checking device_count() guards against machines that have CUDA but fewer
# than three GPUs, where 'cuda:2' would be an invalid device ordinal.
device = torch.device(
    'cuda:2'
    if torch.cuda.is_available() and torch.cuda.device_count() > 2
    else 'cpu'
)

Load symbols

In [3]:
# Read the pickled symbol collection from runs/aligner/symbols.pkl.
# NOTE: unpickling can execute arbitrary code — only load a file you trust.
with open("runs/aligner/symbols.pkl", "rb") as symbols_file:
    symbols = pickle.load(symbols_file)

# Two-way lookup tables. Indices start at 1 and follow the iteration order of
# `symbols` (index 0 is presumably reserved, e.g. for padding — TODO confirm).
idx_to_token = dict(enumerate(symbols, start=1))
token_to_idx = {token: index for index, token in idx_to_token.items()}

Enter the text to be converted to speech

In [4]:
text = "Hello my name is neil. I am standing in a queue. I am currently placed number 20 in a queue."

# Pick the cleaner from the detected language: English text goes through
# english_cleaners, everything else through nonenglish_cleaners.
use_englishcleaners = detect(text) == 'en'
if use_englishcleaners:
    cleaned_text = english_cleaners(text)
else:
    cleaned_text = nonenglish_cleaners(text)

# Keep only characters that have an entry in token_to_idx, and spell each
# surviving space as the token 'sil'.
# NOTE(review): a space only reaches the 'sil' mapping if ' ' itself is a key
# of token_to_idx — confirm against the symbol set.
characters = [
    'sil' if c == ' ' else c
    for c in cleaned_text
    if c in token_to_idx
]

Run TTE module: Predict HuBERT codes from input characters

In [None]:
# Load config file
# The context manager closes the YAML file handle as soon as parsing is done
# instead of leaving an open handle for the garbage collector.
# NOTE: FullLoader is kept so the parsed config is unchanged; if the config is
# plain data, yaml.safe_load would be the stricter choice.
with open("utils/TTE/TTE_config.yaml", "r") as config_file:
    data_config = yaml.load(config_file, Loader=yaml.FullLoader)

# Model
class LitParrot(L.LightningModule):
    """Lightning wrapper around a ``Parrot`` model; only inference is defined here."""

    def __init__(self, data_config, src_vocab_size, src_pad_idx):
        """Create the wrapped ``Parrot`` network.

        The three arguments are forwarded unchanged to ``Parrot`` and are
        recorded through ``save_hyperparameters()``.
        """
        super().__init__()
        self.save_hyperparameters()
        self.parrot = Parrot(data_config, src_vocab_size, src_pad_idx)

    def infer(self, batch):
        """Put the module in eval mode and return ``self.parrot.infer(batch)``."""
        self.eval()
        return self.parrot.infer(batch)
    
# Pre-requisite: write a single-entry manifest to "runs/TTE/val.txt".
# Only 'characters' carries the real input; the audio / hubert / duration
# values are dummies (presumably required by the dataset's line format —
# TODO confirm against ParrotDataset).
data_dict = {
    'audio': 'random.wav',
    'hubert': '1 2 3',
    'duration': "1 2 3",
    'speaker': 'en_m',
    'characters': " ".join(characters),
}
processed_lines = [data_dict]

# One str(dict) per line.
with open("runs/TTE/val.txt", 'w') as manifest:
    manifest.writelines(str(entry) + "\n" for entry in processed_lines)

# Dataset for the "val" split plus a loader that yields one item per batch,
# collated with the dataset's own collate_fn.
val_dataset = ParrotDataset("val", data_config=data_config)
val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=1,
    num_workers=4,
    collate_fn=val_dataset.collate_fn,
)

# TTE checkpoint to restore
checkpoint = "runs/TTE/ckpt/parrot_model-step=11000-val_total_loss_step=0.00.ckpt"

# Rebuild LitParrot from the checkpoint, then place it on the chosen device
# (Module.to returns the module itself, so the chained call is equivalent to
# a separate move).
model = LitParrot.load_from_checkpoint(checkpoint, weights_only=True).to(device)

processed_lines = []

# Only the first batch from the loader is consumed.
batch = next(iter(val_loader))

# No gradients are needed for inference.
with torch.no_grad():
    # Tensors follow the model onto `device`; every other value is passed through.
    batch = {
        name: (item.to(device) if isinstance(item, torch.Tensor) else item)
        for name, item in batch.items()
    }

    # First element of the inference result, written as space-separated values.
    codes = ' '.join(str(code) for code in model.infer(batch)[0])

    # Record for the predictions file; 'audio' and 'duration' are fixed placeholders.
    data_dict = {'audio': "random.wav", 'hubert': codes, 'duration': 0}
    processed_lines.append(data_dict)

# Persist the results, one str(dict) per line.
with open("runs/TTE/predictions.txt", 'w') as predictions_file:
    predictions_file.writelines(str(entry) + "\n" for entry in processed_lines)

Run vocoder model: Synthesize speech from the codes in predictions.txt

In [None]:
# Shell out to the vocoder inference script: it is given the checkpoint
# location runs/vocoder/checkpoints, the code file runs/TTE/predictions.txt,
# and runs/vocoder/generations_tte as the output directory.
# NOTE(review): the meaning of --vc is defined in utils/vocoder/inference.py — confirm there.
!python utils/vocoder/inference.py --checkpoint_file runs/vocoder/checkpoints --vc --input_code_file runs/TTE/predictions.txt --output_dir runs/vocoder/generations_tte

In [None]:
# The generated speech is written under runs/vocoder/generations_tte;
# point audio_path at whichever file in that directory you want to hear.
audio_path = "runs/vocoder/generations_tte/random_en_f_gen.wav"
Audio(audio_path)