Run the next cell

You need a runtime with a T4 GPU to be able to test the model.

In [None]:
# List the GPUs visible to this runtime (fails if no NVIDIA driver/GPU is attached).
!nvidia-smi -L

In [None]:
#@title Install Tacotron and Waveglow (click to see code)

import os
from os.path import exists, join, basename, splitext
!pip install gdown
git_repo_url = 'https://github.com/maxmelichov/tacotron2.git'
project_name = splitext(basename(git_repo_url))[0]
if not exists(project_name):
  # clone and install
  !git clone -q --recursive {git_repo_url}
  !cd {project_name}/waveglow && git checkout 2fd4e63
  !pip install -q librosa unidecode
  !pip install Hebrew
  
import sys
sys.path.append(join(project_name, 'waveglow/'))
sys.path.append(project_name)
import time
import matplotlib
import matplotlib.pylab as plt
import gdown
from hebrew import Hebrew
from hebrew.chars import HebrewChar, ALEPH
from hebrew import GematriaTypes
d = 'https://drive.google.com/uc?id='

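The pre-trained Tacotron2 model will not be shared until the end of the contest. Until then, put the Google Drive file ID of your own model where the next cell says "PUT MODEL HERE".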

In [None]:
# Download the Tacotron2 checkpoint from Google Drive.
force_download_TT2 = True  # re-download even if the file already exists
tacotron2_pretrained_model = 'MLPTTS'  # local file name of the checkpoint
# ↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓ PUT MODEL HERE (Google Drive file id)
tacotron2_model_id = r'None'
# ↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑ PUT MODEL HERE
if not exists(tacotron2_pretrained_model) or force_download_TT2:
  # gdown.download returns the output path on success. Depending on the gdown
  # version a failed download either raises or returns None, so only report
  # success when a path actually came back.
  tacotron2_download = gdown.download(d+tacotron2_model_id, tacotron2_pretrained_model, quiet=False)
  if tacotron2_download is None:
    print("Tacotron2 Model download FAILED - check the Google Drive file id above")
  else:
    print("Tacotron2 Model Downloaded")



In [None]:
# Download the WaveGlow vocoder checkpoint from Google Drive (skipped when the
# file is already present).
waveglow_pretrained_model = 'waveglow.pt'
# Another Drive id left over in the original cell (presumably an alternative
# checkpoint): 1okuUstGoBe_qZ4qUEF8CcwEugHP7GM_b&export
if not exists(waveglow_pretrained_model):
  # gdown.download returns the output path on success. Depending on the gdown
  # version a failed download either raises or returns None, so only report
  # success when a path actually came back.
  waveglow_download = gdown.download(d+r'1rpK8CzAAirq9sWZhe9nlfvxMF1dRgFbF&export=download', waveglow_pretrained_model, quiet=False)
  if waveglow_download is None:
    print("WaveGlow Model download FAILED")
  else:
    print("WaveGlow Model Downloaded")

In [None]:
!python ./tacotron2/waveglow/convert_model.py /content/waveglow.pt /content/waveglow.pt

In [None]:
import locale


def _utf8_preferred_encoding(do_setlocale=True):
    """Drop-in replacement for locale.getpreferredencoding that always says UTF-8.

    Accepts the same optional ``do_setlocale`` argument as the standard
    function, so callers using ``locale.getpreferredencoding(False)`` keep
    working; the argument itself is ignored.
    """
    return "UTF-8"


# Presumably a workaround for runtimes that report a non-UTF-8 preferred
# encoding, which breaks later shell commands - TODO confirm it is still needed.
locale.getpreferredencoding = _utf8_preferred_encoding
locale.getpreferredencoding()

In [None]:
#@title Initialize Tacotron and Waveglow 
%matplotlib inline
import IPython.display as ipd
import numpy as np
import torch

from hparams import create_hparams
from model import Tacotron2
from layers import TacotronSTFT
from audio_processing import griffin_lim
from text import text_to_sequence
from denoiser import Denoiser

# Figure size in pixels; divided by 100 below to get matplotlib inches.
graph_width = 900
graph_height = 360
def plot_data(data, figsize=(int(graph_width/100), int(graph_height/100))):
    """Show every image in ``data`` side by side in a single row.

    Args:
        data: sequence of array-like images accepted by ``Axes.imshow``.
            An empty sequence draws nothing.
        figsize: (width, height) of the figure in inches. The default is
            computed once, when the function is defined, from
            ``graph_width`` and ``graph_height``.
    """
    # The inline backend is already selected at the top of this cell, so the
    # magic does not need to be repeated inside the function.
    if len(data) == 0:
        # plt.subplots cannot create a grid with zero columns.
        return
    # squeeze=False keeps `axes` two-dimensional even for a single panel;
    # otherwise a lone Axes object is returned and indexing it fails.
    fig, axes = plt.subplots(1, len(data), figsize=figsize, squeeze=False)
    for ax, image in zip(axes[0], data):
        ax.imshow(image, aspect='auto', origin='upper',
                  interpolation='none', cmap='inferno')
    fig.canvas.draw()
    plt.show()



# Placeholder: the real implementation is kept secret and will be shared after the contest.
def ARPA(sentence, index=0):
    """Stub standing in for the withheld text-conversion function.

    Both ``sentence`` and ``index`` are accepted but ignored; the stub does
    nothing and always returns ``None``.
    """
    return None
 

 
 



# Inference only: switch autograd off globally for the rest of the notebook.
torch.set_grad_enabled(False)

# Create the hyperparameter object. The Tacotron2 model itself is built from
# it, and the pretrained checkpoint loaded, in a later cell.
hparams = create_hparams()

In [None]:
# Load Tacotron2 (run this cell every time you change the model)
hparams.sampling_rate = 22050 # Don't change this
hparams.max_decoder_steps = 1000 # How long the audio will be before it cuts off (1000 is about 11 seconds)
# Raising gate_threshold makes it more likely that generation keeps going
# until it reaches max_decoder_steps.
# NOTE(review): the previous comment said the model "must be 90% sure" the clip
# is over. In NVIDIA's reference Tacotron2 decoding stops as soon as the
# predicted stop probability exceeds gate_threshold (i.e. 10% here) - confirm
# against this fork's model.py.
hparams.gate_threshold = 0.1
model = Tacotron2(hparams)
# The checkpoint is indexed with 'state_dict' to obtain the weights.
model.load_state_dict(torch.load(tacotron2_pretrained_model)['state_dict'])
# Move the model to the GPU and switch to eval mode; assigning to `_` keeps the
# returned module from being echoed as the cell output.
_ = model.cuda().eval()

In [None]:
# Load WaveGlow
# The vocoder is stored under the checkpoint's 'model' key; move it to the GPU
# and put it in eval mode in one chained call (both calls return the module).
waveglow = torch.load(waveglow_pretrained_model)['model'].cuda().eval()
# Cast each layer listed in waveglow.convinv to float.
for inv_conv in waveglow.convinv:
    inv_conv.float()
# Build the denoiser from the loaded vocoder.
denoiser = Denoiser(waveglow)

In [None]:
# Synthesis settings. Each non-blank line of `text` is synthesized separately.
text = "בָּנַיי"
sigma = 0.8
denoise_strength = 0.1
# Try switching raw_input to True; the results may be better.
raw_input = False # True disables automatic ARPAbet conversion, useful for inputting your own ARPAbet pronunciations or just for testing

# NOTE(review): denoise_strength (and the `denoiser` created when WaveGlow is
# loaded) is never applied to the generated audio below - confirm whether that
# is intentional.

for line in text.split("\n"):
    # Skip empty and whitespace-only lines; there is nothing to synthesize.
    if not line.strip():
        continue
    print(line)
    if raw_input:
        # Raw input is passed through as-is, with a ";" appended if it is missing.
        if line[-1] != ";":
            line = line + ";"
    else:
        line = ARPA(line)
    print(line)
    with torch.no_grad(): # save VRAM by not including gradients
        # Encode the text as symbol ids and add a leading batch dimension.
        sequence = np.array(text_to_sequence(line, ['english_cleaners']))[None, :]
        # torch.autograd.Variable is a deprecated no-op wrapper; tensors are used directly.
        sequence = torch.from_numpy(sequence).cuda().long()
        mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
        plot_data((mel_outputs_postnet.float().data.cpu().numpy()[0],
                   alignments.float().data.cpu().numpy()[0].T))
        # Vocoder: turn the post-net mel spectrogram into a waveform.
        audio = waveglow.infer(mel_outputs_postnet, sigma=sigma)
        print("")
        ipd.display(ipd.Audio(audio[0].data.cpu().numpy(), rate=hparams.sampling_rate))