<a href="https://colab.research.google.com/github/mattjcamil/deepvoice3_pytorch/blob/master/Maltese_TTS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Maltese TTS Demo Based on Deep Voice 3


## Setup

### Install dependencies

In [None]:
import os
from os.path import exists, join, expanduser

# Clone the project repository into the current directory, unless a path with
# that name already exists (so re-running this cell does not clone again).
name = "deepvoice3_pytorch"
if not exists(name):
  ! git clone https://github.com/mattjcamil/$name

In [None]:
# Select the TensorFlow version for this runtime (Colab line magic).
# NOTE(review): this magic is understood to switch major versions only
# ("1.x" / "2.x") — confirm that "1.14" is honoured as intended.
%tensorflow_version 1.14

In [None]:
# Change the working directory to the cloned repository; the cells below use
# paths and imports relative to it.
%cd deepvoice3_pytorch/

In [None]:
# Check out a fixed commit so the notebook runs against a known revision.

!git checkout 707c95e55847dcb20879d5f8402ada9c6df25eba --quiet


In [None]:
# Install dependencies: editable install of the repository with its "bin" extra.
!pip install -q -e ".[bin]"

In [None]:
# %pylab brings the numpy/matplotlib names into the global namespace; the
# plotting helper in this notebook calls them unqualified (figure, subplot, ...).
%pylab inline
# Extra packages used by the notebook itself.
! pip install -q librosa nltk

import torch
import numpy as np
import librosa
import librosa.display
import IPython
from IPython.display import Audio

import nltk
# Fetch the "cmudict" corpus through NLTK's downloader.
! python -m nltk.downloader cmudict

### Download a pre-trained model

In [None]:
# File names of the hyperparameter preset (JSON) and the model checkpoint.
# Change checkpoint_path to use a different model.
preset = "deepvoice3_ljspeech.json"

checkpoint_path = "checkpoint_step000750000_Exp13.pth"


In [None]:
# Download the preset and the checkpoint into the current directory, skipping
# any file that is already present (-L follows redirects, -O keeps the remote name).
if not exists(preset):
  !curl -O -L "https://www.dropbox.com/s/od6tmutsyd5ylj5/deepvoice3_ljspeech.json"
if not exists(checkpoint_path):
  !curl -O -L "https://www.dropbox.com/s/x0v2djnkfriewph/checkpoint_step000750000_Exp13.pth"

## Synthesis

### Set up hyperparameters

In [None]:
import hparams
import json

    
# Overlay the values from the preset JSON file onto the project hyperparameters.
with open(preset) as preset_file:
  hparams.hparams.parse_json(preset_file.read())

# Inject the "en" text frontend into both the synthesis and train modules.
import synthesis
import train
from deepvoice3_pytorch import frontend
synthesis._frontend = train._frontend = frontend.en

# Aliases used by the helper functions below.
fs, hop_length = hparams.hparams.sample_rate, hparams.hparams.hop_size

### Define utility functions

In [None]:
def tts(model, text, p=0, speaker_id=None, fast=True, figures=True):
  """Synthesize `text` with `model` and show an audio player for the result.

  `model`, `text`, `p`, `speaker_id` and `fast` are forwarded positionally to
  `synthesis.tts`. When `figures` is true, the returned alignment and
  spectrogram are plotted with `visualize`. The waveform is always displayed
  as an audio widget at the notebook-level sample rate `fs`.

  Returns None; the output is the displayed widget (and figures).
  """
  import synthesis

  # The fourth returned value (mel) is not used by this helper.
  waveform, alignment, spectrogram, _mel = synthesis.tts(
      model, text, p, speaker_id, fast)

  if figures:
    visualize(alignment, spectrogram)

  player = Audio(waveform, rate=fs)
  IPython.display.display(player)
  
def visualize(alignment, spectrogram):
  """Plot the attention alignment and the spectrogram in one two-row figure.

  Both arguments are transposed (`.T`) before plotting.
  NOTE(review): presumably both are 2-D arrays — confirm against the caller.
  Relies on the pylab names (figure, subplot, imshow, ...) and on the
  notebook-level `fs` and `hop_length` values.
  """
  label_fontsize = 16
  figure(figsize=(16,16))

  # Top panel: alignment shown as an image, with a colorbar.
  subplot(2,1,1)
  imshow(alignment.T, aspect="auto", origin="lower", interpolation=None)
  xlabel("Decoder timestamp", fontsize=label_fontsize)
  ylabel("Encoder timestamp", fontsize=label_fontsize)
  colorbar()

  # Bottom panel: spectrogram on a time x-axis and a linear frequency y-axis.
  subplot(2,1,2)
  librosa.display.specshow(spectrogram.T, sr=fs, 
                           hop_length=hop_length, x_axis="time", y_axis="linear")
  xlabel("Time", fontsize=label_fontsize)
  ylabel("Hz", fontsize=label_fontsize)
  tight_layout()
  colorbar()

### Load the model checkpoint

In [None]:
from train import build_model
from train import restore_parts, load_checkpoint

# Build the model, then restore it from the checkpoint file.
# NOTE(review): the trailing (None, True) arguments are presumably the
# optimizer and a reset-optimizer flag — confirm against train.load_checkpoint.
model = build_model()
model = load_checkpoint(checkpoint_path, model, None, True)

### Preprocess Text

In [None]:
from g2p_cw_rules import g2p_cw_rules
import re

In [None]:
# Enter the Maltese sentence or word to synthesize in the string `text`.
text = "Dik il-ħabta l-uġigħ ta' rasijiet kienu fl-aqwa tagħhom"

In [None]:
# Convert the text to phonemes.
# NOTE: this cell rewrites `text` in place, so re-run the cell that defines
# `text` before running this one a second time.
text = g2p_cw_rules(text)

# Fix "kh" instances from the g2p tool: "kh" followed by a space becomes "h",
# and any remaining "kh" is removed.
text = re.sub('kh', '', re.sub('kh ', 'h ', text))

# Make sure the sentence has sufficient length for the attention mechanism by
# right-padding it with '.' to at least 30 characters. str.ljust returns a new
# string (strings are immutable), so the result has to be assigned back.
text = text.ljust(30, '.')

# Make sure the sentence ends in a full stop.
if not text.endswith('.'):
    text = text + '.'

# Pad 'ʃ' ('x') sounds with spaces; this tends to produce better
# pronunciations more often than not.
text = re.sub('ʃ', ' ʃ ', text)

### Generate speech

In [None]:
# Synthesize the preprocessed text and show the audio player; figures=False
# skips the alignment/spectrogram plots.
tts(model, text, figures=False)