# TTS Attempt

I'm trying to develop my own *simple* and *fast* TTS engine. This is the idea behind it:

![diagram](extras/diagram.png)

## Imports

In [None]:
import json
import os

import simpleaudio as sa, wave
import time
import scipy.io.wavfile as sciwav
import scipy.signal as sgn
import numpy as np

import re
import eng_to_ipa as eti
from separasilabas import silabizer

import tensorflow as tf

## Params

In [None]:
# Read the run configuration (directories, sample rate, clip length, language) from the JSON file
# that sits next to the notebook.
with open('./params.json', encoding='utf-8') as f:
    PARAMS = json.loads(f.read())

# Syllable splitter; it is applied to each word further down when the language is Spanish.
sb = silabizer()

## Load

### load audio

In [None]:
# Using simpleaudio: handy for playback, but getting at the raw samples is more involved.
audiopath = os.path.join(PARAMS["AUDIO_DIR"], 'common_voice_es_34926129.wav')

# The context manager closes the file even when reading the header or the frames raises,
# which a manual close() after the fact would not do.
with wave.open(audiopath, 'rb') as audiowave:
    audioparams = audiowave.getparams()
    # from_wave_read pulls the frames out of the open file, so it has to run inside the `with`.
    audioobj = sa.WaveObject.from_wave_read(audiowave)

print('Audio Data:\n')
print(audioparams)

playobj = audioobj.play()
try:
    time.sleep(15)  # listen for at most 15 seconds
finally:
    # Also stops the audio when the cell is interrupted while sleeping.
    playobj.stop()

In [None]:
# Using scipy: the sample data is easy to get, but scipy cannot play audio,
# so simpleaudio is still used for playback.
audiorate, audiodata = sciwav.read(os.path.join(PARAMS["AUDIO_DIR"], 'common_voice_es_34926129.wav'))

print('Audio Data:\n')
print('audiorate:', audiorate)
print('audiodata:', audiodata)

# Play at the rate the file was actually recorded at; a hard-coded 22050 Hz changes the
# speed and pitch of any file sampled at a different rate.
# Still assumes mono (1 channel), 16-bit (2 bytes per sample) data -- TODO confirm for other inputs.
audioobj = sa.WaveObject(audiodata, 1, 2, audiorate)
playobj = audioobj.play()
try:
    time.sleep(15)  # listen for at most 15 seconds
finally:
    # Also stops the audio when the cell is interrupted while sleeping.
    playobj.stop()

We will then use:
- native [*wave*](https://docs.python.org/3/library/wave.html) to get the audio file params.
- [*simpleaudio*](https://simpleaudio.readthedocs.io/en/latest/tutorial.html) to play the audio
- scipy's [*scipy.io.wavfile*](https://docs.scipy.org/doc/scipy/reference/generated/scipy.io.wavfile.read.html) to read the file data (won't use the audiorate returned) 

### load transcript

In [None]:
# Load the transcript that belongs to the audio clip and lower-case it.
transpath = os.path.join(PARAMS["TRANS_DIR"], 'common_voice_es_34926129.txt')
with open(transpath, encoding='utf-8') as f:
    transtext = f.read().lower()

# Splitting on single spaces always yields one more piece than there are spaces.
wordcount = transtext.count(' ') + 1
charcount = len(transtext)

print('Transcript data:')
print('text:', transtext)
print('wordcount', wordcount)
print('charcount', charcount)

## Processing

### process audio

In [None]:
# Bring the clip to the configured sample rate, then pad or crop it (centered) so that it
# is exactly PARAMS["SAMPLE_RATE"] * PARAMS["SECONDS"] samples long.
SRC_SR = audioparams.framerate
DST_SR = PARAMS["SAMPLE_RATE"]
SRC_LENGTH = audiodata.shape[0]
SRC_SECS = SRC_LENGTH/SRC_SR
DST_LENGTH = int(SRC_SECS*DST_SR) #no "time crop" yet: keep the whole clip while resampling, see below

#resample
if SRC_SR == DST_SR:
    audioproc = audiodata.copy()
else: #lower source rate -> interpolate, higher source rate -> subsample; scipy does both through the Fourier transform
    resampled = sgn.resample(audiodata, DST_LENGTH, domain="time")
    # Fourier resampling can overshoot the int16 range; clip first so the cast cannot wrap around.
    int16_range = np.iinfo(np.int16)
    audioproc = np.clip(resampled, int16_range.min, int16_range.max).astype(np.int16) #requires astype:https://gist.github.com/alexjaw/09af24d58ac99e1e4cafba092e063fe3

#pad / crop
SRC_LENGTH = audioproc.shape[0]
DST_LENGTH = int(PARAMS["SAMPLE_RATE"] * PARAMS["SECONDS"]) #the "time crop" happens now, see above

if SRC_LENGTH < DST_LENGTH: #padding
    dif = DST_LENGTH - SRC_LENGTH
    difsplit = dif // 2
    audioproc = np.pad(audioproc, (difsplit, dif - difsplit)) #the right side takes the extra sample when dif is odd
elif SRC_LENGTH > DST_LENGTH: #cropping
    dif = SRC_LENGTH - DST_LENGTH
    difsplit = dif // 2
    # Keep a centered window of exactly DST_LENGTH samples. The stop index has to be derived from
    # the target length; deriving it from `dif` returns a slice of the wrong size.
    audioproc = audioproc[difsplit:difsplit + DST_LENGTH]

assert audioproc.shape[0] == DST_LENGTH, 'processed audio does not have the target length'

print('processed audio length:', audioproc.shape[0])
print(f'processed audio secs at {DST_SR//1000} kHz:', audioproc.shape[0] / DST_SR)

In [None]:
#relisten to processed audio: it should sound mostly the same as the original
# Play at DST_SR, the rate audioproc was resampled to; a hard-coded 22050 Hz shifts the
# speed and pitch whenever PARAMS["SAMPLE_RATE"] is a different value.
audioobj = sa.WaveObject(audioproc, 1, 2, DST_SR)
playobj = audioobj.play()

### process transcript

For this sample notebook I will split the sentence into three-letter blocks, but it might be more efficient to split it into phonemes (English) or sílabas (Spanish), using [this](https://www.geeksforgeeks.org/convert-english-text-into-the-phonetics-using-python/) and [this](https://github.com/alejandromunozes/separasilabas) respectively.

In [None]:
# Normalise the transcript with four regex passes, applied in this order:
#   1. put spaces around the supported punctuation marks so they become separate tokens
#   2. spell out the ampersand (always as the English 'and', whatever PARAMS["LANGUAGE"] is)
#   3. replace every other symbol with a space
#   4. squeeze runs of spaces into a single space
transproc = transtext
for pattern, replacement in (
    (r'(\.|\,|\¡|\!|\¿|\?)', r' \1 '),
    (r'\&', 'and'),
    (r'[^ \w\.\,\¡\!\¿\?]', r' '),
    (r' {2,}', r' '),
):
    transproc = re.sub(pattern, replacement, transproc)

print('transcript processed: ', transproc)

# Special tokens: beginning, space, end, and null (for vocabulary elements that do not
# appear in the training set).
vocabulary = {'<BEG>', '<SPA>', '<END>', '<NULL>'}
punctuation_marks = ('.', ',', '¡', '!', '¿', '?')

t_feature = ['<BEG>']
for token in transproc.split(' '):
    # Empty or blank pieces can still show up after the regex clean-up; ignore them.
    if not token or token.isspace():
        continue

    if token in punctuation_marks:
        # Punctuation becomes its own '<x>' token and is not followed by a '<SPA>'.
        pieces = [f'<{token}>']
        is_word = False
    elif PARAMS["LANGUAGE"] == 'en':
        # eti returns one list of alternative transcriptions per entry; keep the first alternative.
        pieces = [variants[0] for variants in eti.ipa_list(token)]
        is_word = True
    elif PARAMS["LANGUAGE"] == 'es':
        pieces = list(sb(token))
        is_word = True
    else:
        raise ValueError(f'Language provided on PARAMS, {PARAMS["LANGUAGE"]}, has no support.')

    vocabulary.update(pieces)  # a set, so repeated pieces are absorbed silently
    t_feature.extend(pieces)
    if is_word:
        t_feature.append('<SPA>')
t_feature.append('<END>')

print('transcript featured:', t_feature)

## Feature Build

### audio feature

In [None]:
# Give the clip a leading and a trailing axis and stack along the first axis.
# Only one clip is stacked for now; assuming audioproc is 1-D the result is (1, samples, 1).
audio_fv = np.concatenate([audioproc[np.newaxis, :, np.newaxis]], axis=0)
print(audio_fv.shape)

### transcript feature

In [None]:
# One token sequence per transcript (a single transcript for now), and N = vocabulary size.
trans_fv = [t_feature]
N = len(vocabulary)

n_transcripts, n_tokens = len(trans_fv), len(trans_fv[0])
print('(', n_transcripts, ',', n_tokens, ')')
print(N)

## Autoencoder

### using tensorflow

In [None]:
# Convolutional autoencoder over the audio feature: Conv1D + MaxPool1D stages halve the temporal
# length on the way down, Conv1DTranspose stages with stride 2 double it again on the way up.
inputLayer = tf.keras.layers.Input(audio_fv.shape[1:])

kernel_size = PARAMS["SAMPLE_RATE"]//2  # NOTE(review): not used by any layer below
last_layer = inputLayer
layer_number = 1

# Smallest number of halvings for which the (floor-divided) audio length is no longer above N.
while N < audio_fv.shape[1] // (2**layer_number):
    layer_number += 1

# Filter count of the bottleneck convolution.
corrected_N = audio_fv.shape[1] // (2**(layer_number-1)) +1 #TODO: unsure if this is the best way, or whether to change the MaxPooling so that the shape ends in N

# Encoder: the filter count grows with i; the kernel spans half of the current temporal length.
for i in range(2, layer_number):                                            #TODO: replace this with an array generated in the while loop above
    hiddenLayer = tf.keras.layers.Conv1D(int(corrected_N*i/layer_number), last_layer.get_shape()[1]//2, padding="same", strides=1)(last_layer)
    hiddenLayer = tf.keras.layers.MaxPool1D(2, padding="same")(hiddenLayer)
    last_layer = hiddenLayer
    print(last_layer.get_shape())

# Bottleneck
hiddenLayer = tf.keras.layers.Conv1D(corrected_N, last_layer.get_shape()[1]//2, padding="same",  strides=1)(last_layer)
midLayer = tf.keras.layers.MaxPool1D(2, padding="same")(hiddenLayer)
last_layer = midLayer
print(last_layer.get_shape())

# Decoder: mirrors the encoder's filter counts in reverse order.
for i in range(layer_number-1, 1, -1):                                              #TODO: replace this with an array generated in the while loop above
    hiddenLayer = tf.keras.layers.Conv1DTranspose(int(corrected_N*i/layer_number), last_layer.get_shape()[1]//2, padding="same", strides=2)(last_layer)
    last_layer = hiddenLayer
    print(last_layer.get_shape())

# Feed the output layer from last_layer, not hiddenLayer: when the decoder loop runs zero times,
# hiddenLayer is still the bottleneck convolution *before* its max-pool, so using it would bypass
# the pooling step and produce an output twice as long as intended.
outputLayer = tf.keras.layers.Conv1DTranspose(1, last_layer.get_shape()[1]//2, padding="same", strides=2)(last_layer)
last_layer = outputLayer
print(last_layer.get_shape())

model = tf.keras.Model(inputs = inputLayer, outputs = outputLayer)
model.compile()  # NOTE(review): no loss is passed here; one will be needed before calling fit()

### using pytorch