In [3]:
import tensorflow as tf
import numpy as np
import re

In [4]:
def get_data():
    words = []
    with open("words.txt") as fi:
        line = fi.readline()
        while(len(line)!=0):
            datapoint = line.split(' ')
            phon = datapoint[1].split('.')
            phon[-1] = phon[-1][:-1]
            words.append([datapoint[0], phon])
            line = fi.readline()
            
    return(words)

In [5]:
data = get_data()

In [6]:
data[:5]

[['aback', ['AH', 'B', 'AE', 'K']],
 ['abandon', ['AH', 'B', 'AE', 'N', 'D', 'AH', 'N']],
 ['abandoned', ['AH', 'B', 'AE', 'N', 'D', 'AH', 'N', 'D']],
 ['abandoning', ['AH', 'B', 'AE', 'N', 'D', 'AH', 'N', 'IH', 'NG']],
 ['abandons', ['AH', 'B', 'AE', 'N', 'D', 'AH', 'N', 'Z']]]

In [7]:
Xraw = [k[0] for  k in data]
Yraw = [k[1] for  k in data]

In [8]:
Xraw[:5]

['aback', 'abandon', 'abandoned', 'abandoning', 'abandons']

In [9]:
Xrawlens = np.array([len(x) for x in Xraw])
Xrawlens.max(), Xrawlens.argmax(), Xraw[Xrawlens.argmax()]

(20, 22468, 'uncharacteristically')

In [10]:
Yraw[:5]

[['AH', 'B', 'AE', 'K'],
 ['AH', 'B', 'AE', 'N', 'D', 'AH', 'N'],
 ['AH', 'B', 'AE', 'N', 'D', 'AH', 'N', 'D'],
 ['AH', 'B', 'AE', 'N', 'D', 'AH', 'N', 'IH', 'NG'],
 ['AH', 'B', 'AE', 'N', 'D', 'AH', 'N', 'Z']]

In [11]:
Yrawlens = np.array([len(y) for y in Yraw])
Yrawlens.max(), Yrawlens.argmax(), Yraw[Yrawlens.argmax()], Xraw[Yrawlens.argmax()]

(16,
 4046,
 ['K',
  'AA',
  'M',
  'P',
  'AA',
  'R',
  'T',
  'M',
  'EH',
  'N',
  'T',
  'AH',
  'L',
  'AY',
  'Z',
  'D'],
 'compartmentalized')

In [16]:
def word2tensor(word, wordmaxlen=24):
    word2tensor.dummy_char_vec = np.array([0]*26)
    wordvec_array = []

    for eachcharacter in word:
        tempchar = word2tensor.dummy_char_vec.copy()
        tempchar[ord(eachcharacter) - ord('a')] = 1
        wordvec_array.append(tempchar)
        
    return np.array(wordvec_array)

In [17]:
# def word2tensor(word, wordmaxlen=24):
#     # print(word)
#     word = '`' + word + '{'
#     wordvec_array = []
#     padding = wordmaxlen - len(word)
#     for eachcharacter in word:
#         tempchar = (ord(eachcharacter) - ord('`'))/27.0
#         wordvec_array.append(tempchar)
        
#     for i in range(padding):
#         wordvec_array.append((ord('{') - ord('`'))/27.0)
#     return np.array(wordvec_array)

In [18]:
result = word2tensor("word")

In [19]:
result, result.shape

(array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         1, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
         0, 0, 0, 0],
        [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0]]), (4, 26))

### Converting words to tensor representations and save them for fast loading

In [20]:
words = np.array([word2tensor(w) for w in Xraw])
words.shape

(24167,)

In [21]:
original_shape = words.shape
np.savez_compressed('words2vec.npz', words = words)

In [22]:
# data = np.load('words2vec.npz')
# words = data['words']

### Phonemes: There are 39 phonemes, as shown below:
`
AA     odd     AA D        |   AE      at      AE T      
AH      hut     HH AH T     |   AO      ought   AO T     
AW      cow     K AW        |   AY      hide    HH AY D  
B       be      B IY        |   CH      cheese  CH IY Z  
D       dee     D IY        |   DH      thee    DH IY    
EH      Ed      EH D        |   ER      hurt    HH ER T  
EY      ate     EY T        |   F       fee     F IY     
G       green   G R IY N    |   HH      he      HH IY    
IH      it      IH T        |   IY      eat     IY T     
JH      gee     JH IY       |   K       key     K IY     
L       lee     L IY        |   M       me      M IY     
N       knee    N IY        |   NG      ping    P IH NG  
OW      oat     OW T        |   OY      toy     T OY     
P       pee     P IY        |   R       read    R IY D   
S       sea     S IY        |   SH      she     SH IY    
T       tea     T IY        |   TH      theta   TH EY T AH 
UH      hood    HH UH D     |   UW      two     T UW     
V       vee     V IY        |   W       we      W IY     
Y       yield   Y IY L D    |   Z       zee     Z IY     
ZH      seizure S IY ZH ER  | 
`

In [24]:
phoneme_array = [
    'AA',
    'AE',
    'AH',
    'AO',
    'AW',
    'AY',
    'B',
    'CH',
    'D',
    'DH',
    'EH',
    'ER',
    'EY',
    'F',
    'G',
    'HH',
    'IH',
    'IY',
    'JH',
    'K',
    'L',
    'M',
    'N',
    'NG',
    'OW',
    'OY',
    'P',
    'R',
    'S',
    'SH',
    'T',
    'TH',
    'UH',
    'UW',
    'V',
    'W',
    'Y',
    'Z',
    'ZH'
]

In [32]:
phoneme_map = dict(zip(phoneme_array, range(len(phoneme_array))))
phoneme_map

{'AA': 0,
 'AE': 1,
 'AH': 2,
 'AO': 3,
 'AW': 4,
 'AY': 5,
 'B': 6,
 'CH': 7,
 'D': 8,
 'DH': 9,
 'EH': 10,
 'ER': 11,
 'EY': 12,
 'F': 13,
 'G': 14,
 'HH': 15,
 'IH': 16,
 'IY': 17,
 'JH': 18,
 'K': 19,
 'L': 20,
 'M': 21,
 'N': 22,
 'NG': 23,
 'OW': 24,
 'OY': 25,
 'P': 26,
 'R': 27,
 'S': 28,
 'SH': 29,
 'T': 30,
 'TH': 31,
 'UH': 32,
 'UW': 33,
 'V': 34,
 'W': 35,
 'Y': 36,
 'Z': 37,
 'ZH': 38}

In [33]:
def phonemes2tensor(phonemes, phonemesmaxlen=20):
    phonemes2tensor.dummy_phoneme_vec = np.array([0]*39) # dummy phonetic symbol as stop
    phonemesvec_array = []

    for eachphoneme in phonemes:
        tempphoneme = phonemes2tensor.dummy_phoneme_vec.copy()
        tempphoneme[phoneme_map[eachphoneme]] = 1
        phonemesvec_array.append(tempphoneme)
        
    return np.array(phonemesvec_array)

In [34]:
# def phonemes2tensor(phonemes, phonemesmaxlen=20):
#     # print(word)
#     phonemes = ['\s'] + phonemes + ['\e']
#     phonemesvec_array = []
#     padding = phonemesmaxlen - len(phonemes)
#     for eachphoneme in phonemes:
#         tempphoneme = phoneme_map[eachphoneme]
#         phonemesvec_array.append(tempphoneme/40.0)
        
#     for i in range(padding):
#         phonemesvec_array.append(1.0)
#     return np.array(phonemesvec_array)

In [36]:
result = phonemes2tensor(Yraw[12])
result, result.shape, Yraw[12]

(array([[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [37]:
phonemes = np.array([phonemes2tensor(p) for p in Yraw])
phonemes.shape

(24167,)

In [38]:
original_shape = words.shape
np.savez_compressed('phonemes2vec.npz', phonemes = words)

In [23]:
# data = np.load('words2vec.npz')
# phonemes = data['phonemes']

## Approach

- Create a 2 stacked lstm.
- Keep feeding one-hot character vectors until the word is exhausted
- Take the final hypothesis of this lstm and feed it to a Dense layer : This should be my internal representation

- Loss measure1: Dense layer to convert internal-rep to character count
- Loss measure2: Dense layer to convert internal-rep to phoneme count - parameter for next
- Loss measure3: Dense layer to convert internal-rep to phonemes output

- Loss = rms of loss1, loss2, loss3

In [46]:
indices_train = [2*i+1 for i in range(int(len(words)/2))]
indices_test = [2*i for i in range(int(len(words)/2))]

In [47]:
train_words = words[indices_train]
train_phonemes = phonemes[indices_train]

In [48]:
test_words = words[indices_test]
test_phonemes = phonemes[indices_test]

In [49]:
np.savez_compressed('train_words.npz', train_words=train_words)
np.savez_compressed('train_phonemes.npz', train_phonemes=train_phonemes)

In [50]:
np.savez_compressed('test_words.npz', test_words=test_words)
np.savez_compressed('test_phonemes.npz', test_phonemes=test_phonemes)

In [87]:
x = tf.keras.Input(shape=(26,1), batch_size=1, name="input", dtype=tf.float32)

In [97]:
lstm1 = tf.keras.layers.LSTM(64, return_sequences=True)

In [98]:
H = lstm1(x)

In [100]:
_d1 = tf.keras.layers.Dense(64, activation='relu')(H)
d1 = tf.keras.layers.Dense(1, activation='relu')(_d1)

In [101]:
_d2 = tf.keras.layers.Dense(64, activation='relu')(H)
d2 = tf.keras.layers.Dense(1, activation='relu')(_d2)

In [None]:
def losses(y_pred, y):
    return tf.add((y_pred - y)**2)

#------------ Lets train LSTM to predict word lenth and phoneme lengths properly

In [112]:
wl = [tw.shape[0] for tw in train_words]
pl = [ph.shape[0] for ph in train_phonemes]

In [124]:
y = np.array([list(k) for k in zip(wl, pl)])

In [125]:
y

array([[ 7,  7],
       [10,  9],
       [ 5,  4],
       ...,
       [ 6,  5],
       [ 4,  3],
       [ 7,  5]])

In [126]:
import keras
from keras import backend as K
from keras.models import Sequential, Model
from keras.layers import Input, LSTM, RepeatVector
from keras.layers.core import Flatten, Dense, Dropout, Lambda
from keras.optimizers import SGD, RMSprop, Adam
from keras import objectives


def create_lstm_vae(input_dim, 
    timesteps, 
    batch_size, 
    intermediate_dim, 
    latent_dim,
    epsilon_std=1.):

    """
    Creates an LSTM Variational Autoencoder (VAE). Returns VAE, Encoder, Generator. 

    # Arguments
        input_dim: int.
        timesteps: int, input timestep dimension.
        batch_size: int.
        intermediate_dim: int, output shape of LSTM. 
        latent_dim: int, latent z-layer shape. 
        epsilon_std: float, z-layer sigma.


    # References
        - [Building Autoencoders in Keras](https://blog.keras.io/building-autoencoders-in-keras.html)
        - [Generating sentences from a continuous space](https://arxiv.org/abs/1511.06349)
    """
    x = Input(shape=(timesteps, input_dim,))

    # LSTM encoding
    h = LSTM(intermediate_dim)(x)

    # VAE Z layer
    z_mean = Dense(latent_dim)(h)
    z_log_sigma = Dense(latent_dim)(h)
    
    def sampling(args):
        z_mean, z_log_sigma = args
        epsilon = K.random_normal(shape=(batch_size, latent_dim),
                                  mean=0., stddev=epsilon_std)
        return z_mean + z_log_sigma * epsilon

    # note that "output_shape" isn't necessary with the TensorFlow backend
    # so you could write `Lambda(sampling)([z_mean, z_log_sigma])`
    z = Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_sigma])
    
    # decoded LSTM layer
    decoder_h = LSTM(intermediate_dim, return_sequences=True)
    decoder_mean = LSTM(input_dim, return_sequences=True)

    h_decoded = RepeatVector(timesteps)(z)
    h_decoded = decoder_h(h_decoded)

    # decoded layer
    x_decoded_mean = decoder_mean(h_decoded)
    
    # end-to-end autoencoder
    vae = Model(x, x_decoded_mean)

    # encoder, from inputs to latent space
    encoder = Model(x, z_mean)

    # generator, from latent space to reconstructed inputs
    decoder_input = Input(shape=(latent_dim,))

    _h_decoded = RepeatVector(timesteps)(decoder_input)
    _h_decoded = decoder_h(_h_decoded)

    _x_decoded_mean = decoder_mean(_h_decoded)
    generator = Model(decoder_input, _x_decoded_mean)
    
    def vae_loss(x, x_decoded_mean):
        xent_loss = objectives.mse(x, x_decoded_mean)
        kl_loss = - 0.5 * K.mean(1 + z_log_sigma - K.square(z_mean) - K.exp(z_log_sigma))
        loss = xent_loss + kl_loss
        return loss

    vae.compile(optimizer='rmsprop', loss=vae_loss)
    
    return vae, encoder, generator



Using TensorFlow backend.


In [136]:
vae, enc, gen = create_lstm_vae(26, 
    timesteps=1, 
    batch_size=1, 
    intermediate_dim=32,
    latent_dim=100,
epsilon_std=1.)

In [137]:
for i in range(train_words.shape[0]):
    vae.fit(x=train_words[i], y=train_phonemes[i], epochs=2, batch_size=train_words[i].shape[0])

ValueError: Error when checking input: expected input_7 to have 3 dimensions, but got array with shape (7, 26)