In [1]:
import ipywidgets as widgets
import IPython
import json
from IPython.core.display import display
import os
import torch
from transformers import AutoTokenizer
from pathlib import Path
import json
import nltk
from nemo_text_processing.text_normalization.normalize import Normalizer
from nltk.tokenize import sent_tokenize, TweetTokenizer
from g2p_en import G2p
import soundfile as sf
import numpy as np
from tqdm import tqdm
import re
from numba import cuda
# Fetch the Punkt data that nltk's sent_tokenize relies on.
nltk.download('punkt')

""" global speaker, talking_speed, text, button_clear, output_clear, button, output """

# All three TorchScript checkpoints sit in the same directory.
_torchscript_dir = Path(".") / "audio_scripts" / "torchscript"
acoustic_model = torch.jit.load(_torchscript_dir / "acoustic_model.pt")
style_predictor = torch.jit.load(_torchscript_dir / "style_predictor.pt")
vocoder = torch.jit.load(_torchscript_dir / "vocoder.pt")
tokenizer = AutoTokenizer.from_pretrained("prajjwal1/bert-tiny")

# Character sets referenced by the disabled clean-up code further down:
# `punctuation` lists characters to replace, `punctuation2` the ones to drop.
punctuation = """\"#$%&'()*+-/:;<=>?@[\\]^_`{|}~"""
punctuation2 = """'"""

def read_txt(txt):
    """Split ``txt`` into sentences, each ending in exactly one appended period.

    Runs of whitespace (including newlines) are collapsed to single spaces
    first. A sentence boundary is a period followed by a space, or the final
    period of the text. Periods inside a token (``3.14``, ``file.txt``) are
    kept rather than being dropped when the pieces are joined back together.

    Args:
        txt: Raw text to split.

    Returns:
        A one-shot iterator over the sentences in reading order. Text without
        a final period still gets one; blank text yields nothing.
    """
    normalized = " ".join(txt.split())
    # Remove the closing terminator so that every piece below receives
    # exactly one "." when it is re-terminated.
    if normalized.endswith("."):
        normalized = normalized[:-1]

    sentences = []
    for piece in normalized.split(". "):
        piece = piece.strip()
        if piece:
            sentences.append(piece + ".")
    return iter(sentences)

EXTERNAL = True

if EXTERNAL:
  %store -r TEXT_VOICE_GEN
  %store -r VISUALIZE
  %store -r SAVE_FOLDER
  %store -r PROJECT_NAME
  %store -r SHOW_OUTPUT
  %store -r SPEAKER
  %store -r TALKING_SPEED
  %store -r SECONDS_SILENCE
  %store -r RECORD_AUDIO_AFTER
else:
  VISUALIZE = True  
  SAVE_FOLDER = "D:\\Deletar\\p_gen"
  PROJECT_NAME = "THE FELLOWSHIP OF THE RING"
  SHOW_OUTPUT = True
  SPEAKER = "Naoto Shirogane"
  TALKING_SPEED = 0.9
  SECONDS_SILENCE = 0.25
  TEXT_VOICE_GEN = """ 
The story takes place in the fictional world where an entity named Sauron, the Dark Lord, seeks to reclaim the One Ring, a powerful ring that contains much of his power, in order to enslave all of Middle-earth.
The story starts with events from The Hobbit, where Bilbo Baggins, a hobbit, stumbles upon the One Ring in a cave and takes 
it back to the Shire, the home of the hobbits, unaware of its significance. 
Bilbo's 111th birthday party is where the story of The Fellowship of the Ring begins, Bilbo gives his ring to his heir, his cousin Frodo Baggins, but only after the urging of his friend Gandalf the Grey, a great Wizard who suspects it to be the One Ring of legend.
"""


""" removed_punctuation = ""

for i in TEXT:
  print(i)
  if i not in punctuation:
    removed_punctuation += i
  elif i in punctuation2:
    removed_punctuation += ""
  else:
    removed_punctuation += " "




TEXT = read_txt(removed_punctuation) """

# Create the output root if needed. makedirs must receive the path itself;
# passing the boolean from os.path.exists() raises TypeError.
saved_folder = os.path.exists(SAVE_FOLDER)
if not saved_folder:
  os.makedirs(SAVE_FOLDER, exist_ok=True)

# Per-project folder under the output root (the loop below appends "\\audio\\...").
project_folder = f"{SAVE_FOLDER}\\{PROJECT_NAME}"

saved_project_folder = os.path.exists(project_folder)
if not saved_project_folder:
  os.makedirs(project_folder, exist_ok=True)

  from IPython.core.display import display
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Matheus\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
def get_sentences(text):
    """Tokenize ``text`` into sentences, folding stray terminators back in.

    ``sent_tokenize`` can emit a lone ".", "?" or "!" as its own sentence.
    Each such token is appended to the sentence before it, e.g.
    ["What is this?", "?"] => ["What is this??"]. Lone terminators that
    come before any real sentence are dropped.

    Args:
        text: Text to split.

    Returns:
        List of sentence strings.
    """
    terminators = (".", "?", "!")
    merged = []
    for piece in sent_tokenize(text):
        if piece not in terminators:
            merged.append(piece)
        elif merged:
            # Attach the stray terminator to the sentence it follows.
            merged[-1] = merged[-1] + piece
    return merged

def split_arpabet(text):
    """Split ``text`` into alternating plain and ``{{...}}`` ARPAbet chunks.

    Every ``{{...}}`` span is preceded in the result by the plain text before
    it (possibly an empty string), and any text after the last span is
    appended. A span ends at the first ``}}``; its content may hold single
    ``}`` characters but may not start with one.

    Args:
        text: Sentence that may contain ``{{PH ON EM ES}}`` markup.

    Returns:
        List of substrings which, concatenated, reproduce ``text``.
    """
    # Each content unit is one non-"}" character optionally followed by a
    # single "}" (taken lazily). This matches the same spans as the pattern
    # "{{(([^}][^}]?|[^}]}?)*)}}" but has only one way to consume any input,
    # so an unclosed "{{..." fails quickly instead of backtracking
    # exponentially.
    pattern = r"\{\{(?:[^}]\}??)*\}\}"
    pieces = []
    cursor = 0
    for match in re.finditer(pattern, text):
        pieces.append(text[cursor:match.start()])
        pieces.append(match.group())
        cursor = match.end()
    if cursor < len(text):
        pieces.append(text[cursor:])
    return pieces

def is_arpabet(text):
    """Return True when ``text`` is wrapped in ``{{`` ... ``}}``.

    Strings shorter than four characters cannot hold both delimiters and
    are rejected.
    """
    return len(text) >= 4 and text.startswith("{{") and text.endswith("}}")

class DeepPoniesTTS():
    """Text-to-speech pipeline around the module-level TorchScript models.

    Holds the acoustic model, style predictor and vocoder, together with the
    text front end (normalizer, word tokenizer, lexicon, g2p fallback) and
    the lookup tables read from ``audio_scripts/assets``.
    """

    def __init__(self):
        self.g2p = G2p()
        self.acoustic_model = acoustic_model
        self.style_predictor = style_predictor
        self.vocoder = vocoder
        self.tokenizer = AutoTokenizer.from_pretrained("prajjwal1/bert-tiny")
        self.normalizer = Normalizer(input_case='cased', lang='en')
        self.speaker2id = self.get_speaker2id()
        self.symbol2id = self.get_symbol2id()
        self.lexicon = self.get_lexicon()
        self.word_tokenizer = TweetTokenizer()
        # Put all three models in inference mode.
        self.acoustic_model.eval()
        self.style_predictor.eval()
        self.vocoder.eval()

    def get_speaker2id(self):
        """Return ``{speaker name: speaker id}`` for every activated speaker.

        Reads ``speakerCategories.json``; items whose ``"activated"`` value
        is falsy are left out.
        """
        with open(Path(".") / "audio_scripts" / "assets" / "speakerCategories.json", "r") as json_file:
            categories = json.load(json_file)
        return {
            item["speaker"]: item["speaker_id"]
            for category in categories.values()
            for item in category["items"]
            if item["activated"]
        }

    def get_symbol2id(self):
        """Return the symbol-to-id table loaded from ``symbol2id.json``."""
        with open(Path(".") / "audio_scripts" / "assets" / "symbol2id.json", "r") as json_file:
            return json.load(json_file)

    def get_lexicon(self):
        """Return ``{word: [phones]}`` parsed from ``lexicon.txt``.

        Each line is split on single spaces: the first field is the word and
        the remaining fields are its phones.
        """
        lexicon = {}
        with open(Path(".") / "audio_scripts" / "assets" / "lexicon.txt", "r") as f:
            for line in f:
                fields = line.rstrip().split(" ")
                lexicon[fields[0].strip()] = fields[1:]
        return lexicon

    def _sentence_to_inputs(self, sentence):
        """Convert one sentence to ``(phone ids, text for the style predictor)``.

        ``{{...}}`` chunks are read as space-separated phones and contribute
        only phone ids. Plain chunks are normalized, tokenized and mapped
        word by word (lexicon first, g2p as fallback); their raw text is
        also collected for the style predictor.
        """
        phone_ids = []
        plain_chunks = []
        for subsentence in split_arpabet(sentence):
            if is_arpabet(subsentence):
                for phone in subsentence.strip()[2:-2].split(" "):
                    if "@" + phone in self.symbol2id:
                        phone_ids.append(self.symbol2id["@" + phone])
                continue

            plain_chunks.append(subsentence)
            normalized = self.normalizer.normalize(subsentence, verbose=False)
            for word in self.word_tokenizer.tokenize(normalized):
                word = word.lower()
                if word in [".", "?", "!"]:
                    phone_ids.append(self.symbol2id[word])
                elif word in [",", ";"]:
                    phone_ids.append(self.symbol2id["@SILENCE"])
                elif word in self.lexicon:
                    for phone in self.lexicon[word]:
                        phone_ids.append(self.symbol2id["@" + phone])
                    phone_ids.append(self.symbol2id["@BLANK"])
                else:
                    for phone in self.g2p(word):
                        # g2p can return tokens that are not phones (spaces,
                        # punctuation). Skip anything without a symbol id,
                        # as the ARPAbet branch above already does, instead
                        # of failing with KeyError.
                        symbol = "@" + phone
                        if symbol in self.symbol2id:
                            phone_ids.append(self.symbol2id[symbol])
                    phone_ids.append(self.symbol2id["@BLANK"])
        return phone_ids, " ".join(plain_chunks)

    def synthesize(self, text: str, speaker_name: str, duration_control: float=1.0, verbose: bool=True) -> np.ndarray:
        """Synthesize ``text`` in the given speaker's voice.

        Args:
            text: Text to speak; may contain ``{{...}}`` ARPAbet spans. A
                period is appended when it does not end in ".", "?" or "!".
            speaker_name: Key into ``self.speaker2id``.
            duration_control: Passed through to the acoustic model.
            verbose: Wrap the sentence loop in a tqdm progress bar.

        Returns:
            1-D array with the waveforms of all sentences concatenated. It is
            empty when ``text`` is blank or contains no sentence to speak.

        Raises:
            KeyError: If ``speaker_name`` is not in ``self.speaker2id``.
        """
        text = text.strip()
        # Look the speaker up before the blank-text shortcut so an unknown
        # name is still reported for blank input.
        speaker_id = self.speaker2id[speaker_name]
        if not text:
            # text[-1] below would raise IndexError on an empty string.
            # NOTE(review): float32 presumably matches the vocoder output - confirm.
            return np.zeros(0, dtype=np.float32)

        speaker_ids = torch.LongTensor([speaker_id])
        if text[-1] not in [".", "?", "!"]:
            text = text + "."

        sentences = get_sentences(text)
        if verbose:
            sentences = tqdm(sentences)

        waves = []
        for sentence in sentences:
            phone_ids, style_text = self._sentence_to_inputs(sentence)
            encoding = self.tokenizer(
                style_text,
                add_special_tokens=True,
                padding=True,
                return_tensors="pt"
            )
            input_ids = encoding["input_ids"]
            attention_mask = encoding["attention_mask"]
            phone_ids = torch.LongTensor([phone_ids])
            with torch.no_grad():
                style = self.style_predictor(input_ids, attention_mask)
                # NOTE(review): the meaning of the constant 1.0 arguments to
                # the acoustic model and the vocoder is not visible here.
                mels = self.acoustic_model(
                    phone_ids,
                    speaker_ids,
                    style,
                    1.0,
                    duration_control
                )
                wave = self.vocoder(mels, speaker_ids, torch.FloatTensor([1.0]))
                waves.append(wave.view(-1))

        if not waves:
            # Text such as "?" leaves no sentence after merging; torch.cat
            # refuses an empty list.
            return np.zeros(0, dtype=np.float32)
        return torch.cat(waves, dim=0).cpu().numpy()
# Shared pipeline instance used by the synthesis loop. Constructing it builds
# the G2p model, tokenizer and normalizer and reads the asset files.
tts = DeepPoniesTTS()

[NeMo I 2023-01-31 12:08:59 tokenize_and_classify:87] Creating ClassifyFst grammars.


In [None]:
# Speaker shortlist kept for reference; not read by the code in this cell.
# NOTE(review): the numbers are presumably speaker ids - confirm.
goods = """
Naoto Shirogane - 7,
Robot - 17,
Sniper - 26,
Nameless Hero - 39
"""

durations = []

# One clip per TEXT_VOICE_GEN entry, each written to its own numbered folder.
for index, name_index in enumerate(TEXT_VOICE_GEN):
    # Only entries positioned after RECORD_AUDIO_AFTER are synthesized.
    if not (index > RECORD_AUDIO_AFTER):
        continue

    if SHOW_OUTPUT:
        print(index)
        print(TEXT_VOICE_GEN[name_index])

    clip_dir = f"{project_folder}\\audio\\{index:07d}"
    if not os.path.exists(clip_dir):
        os.makedirs(clip_dir)

    # A talking speed above 1 shortens durations, hence the reciprocal.
    audio = tts.synthesize(TEXT_VOICE_GEN[name_index], SPEAKER, 1 / TALKING_SPEED)

    """ sf.write(f"{project_folder}\\audio\\{index:07d}\\{index:05d}.wav", audio, 22050, format='flac', subtype='PCM_24') """

    # Pad both ends of the clip with SECONDS_SILENCE of silence at 22050 Hz.
    num_samples = int(22050 * SECONDS_SILENCE)
    silence = np.zeros(num_samples)
    audio = np.concatenate((silence, audio, silence))
    if SHOW_OUTPUT:
        display(IPython.display.Audio(audio, rate=22050))
    sf.write(f"{clip_dir}\\{index:05d}.wav", audio, 22050)

    """ sf.write("D:\\Deletar\\p_gen\\text.wav", audio, 22050, format='flac', subtype='PCM_24') """

# Select GPU 0 and close numba's CUDA context once synthesis is finished.
cuda.select_device(0)
cuda.close()


The story takes place in the fictional world where an entity named Sauron, the Dark Lord, seeks to reclaim the One Ring, a powerful ring that contains much of his power, in order to enslave all of Middle earth.


100%|██████████| 1/1 [00:06<00:00,  6.23s/it]


The story starts with events from The Hobbit, where Bilbo Baggins, a hobbit, stumbles upon the One Ring in a cave and takes it back to the Shire, the home of the hobbits, unaware of its significance.


100%|██████████| 1/1 [00:07<00:00,  7.84s/it]


Bilbos 111th birthday party is where the story of The Fellowship of the Ring begins, Bilbo gives his ring to his heir, his cousin Frodo Baggins, but only after the urging of his friend Gandalf the Grey, a great Wizard who suspects it to be the One Ring of legend.


100%|██████████| 1/1 [00:07<00:00,  7.47s/it]
