In [1]:
import tensorflow as tf
import numpy as np
import re

In [2]:
def get_data(path="words.txt"):
    """Load the pronunciation lexicon as ``[word, phonemes]`` pairs.

    Each non-blank line is split on a single space; the first field is the
    spelling and the second field is split on ``'.'`` into phoneme symbols
    (e.g. ``aback AH.B.AE.K``).

    Args:
        path: Location of the lexicon file. Defaults to ``"words.txt"``.

    Returns:
        A list of ``[word, [phoneme, ...]]`` entries in file order.
    """
    words = []
    with open(path) as fi:
        for line in fi:
            # Remove only the line terminator. Slicing off the last character
            # unconditionally would truncate the final phoneme whenever the
            # file does not end with a newline.
            line = line.rstrip('\n')
            if not line:
                # Blank lines carry no entry; skip them instead of failing
                # on the missing second field.
                continue
            datapoint = line.split(' ')
            words.append([datapoint[0], datapoint[1].split('.')])
    return words

In [3]:
# Parse the lexicon file into a list of [word, phoneme-list] pairs.
data = get_data()

In [4]:
# Preview the first five parsed [word, phoneme-list] entries.
data[:5]

[['aback', ['AH', 'B', 'AE', 'K']],
 ['abandon', ['AH', 'B', 'AE', 'N', 'D', 'AH', 'N']],
 ['abandoned', ['AH', 'B', 'AE', 'N', 'D', 'AH', 'N', 'D']],
 ['abandoning', ['AH', 'B', 'AE', 'N', 'D', 'AH', 'N', 'IH', 'NG']],
 ['abandons', ['AH', 'B', 'AE', 'N', 'D', 'AH', 'N', 'Z']]]

In [5]:
# Split the parsed pairs into two parallel lists:
# spellings (Xraw) and their phoneme sequences (Yraw).
Xraw = [entry[0] for entry in data]
Yraw = [entry[1] for entry in data]

In [6]:
# Preview the first five spellings.
Xraw[:5]

['aback', 'abandon', 'abandoned', 'abandoning', 'abandons']

In [30]:
# Character count of every word; the padded length reserves two extra
# positions for the start and stop markers added during encoding.
Xrawlens = np.array(list(map(len, Xraw)))
word_maxlen = 2 + Xrawlens.max()
word_maxlen, Xrawlens.argmax(), Xraw[Xrawlens.argmax()]

(22, 22468, 'uncharacteristically')

In [8]:
# Preview the first five phoneme sequences.
Yraw[:5]

[['AH', 'B', 'AE', 'K'],
 ['AH', 'B', 'AE', 'N', 'D', 'AH', 'N'],
 ['AH', 'B', 'AE', 'N', 'D', 'AH', 'N', 'D'],
 ['AH', 'B', 'AE', 'N', 'D', 'AH', 'N', 'IH', 'NG'],
 ['AH', 'B', 'AE', 'N', 'D', 'AH', 'N', 'Z']]

In [32]:
# Phoneme count of every pronunciation; the padded length reserves two extra
# positions for the start and stop markers added during encoding.
Yrawlens = np.array(list(map(len, Yraw)))
phonemes_maxlen = 2 + Yrawlens.max()
phonemes_maxlen, Yrawlens.argmax(), Yraw[Yrawlens.argmax()], Xraw[Yrawlens.argmax()]

(18,
 4046,
 ['K',
  'AA',
  'M',
  'P',
  'AA',
  'R',
  'T',
  'M',
  'EH',
  'N',
  'T',
  'AH',
  'L',
  'AY',
  'Z',
  'D'],
 'compartmentalized')

In [11]:
def word2tensor(word, wordmaxlen=22):
    """One-hot encode a lowercase word framed by start/stop markers.

    The word is wrapped as ``'`' + word + '{'``. Those markers are the ASCII
    neighbours of ``'a'`` and ``'z'``, so ``ord(ch) - ord('`')`` maps the
    start marker to column 0, ``'a'``-``'z'`` to columns 1-26 and the stop
    marker to column 27. Rows after the stop marker are all-zero padding.

    Args:
        word: Text whose characters all lie in the range ``'`'``-``'{'``
            (in practice lowercase ``'a'``-``'z'``).
        wordmaxlen: Number of rows to pad to. A word longer than
            ``wordmaxlen - 2`` is not truncated; the result then has
            ``len(word) + 2`` rows.

    Returns:
        Integer array of shape ``(max(wordmaxlen, len(word) + 2), 28)``.

    Raises:
        ValueError: If a character falls outside the 28-symbol alphabet.
    """
    alphabet_size = 28  # start marker + 26 letters + stop marker
    base = ord('`')
    framed = '`' + word + '{'

    encoded = np.zeros((max(wordmaxlen, len(framed)), alphabet_size), dtype=int)
    for position, char in enumerate(framed):
        column = ord(char) - base
        # Reject out-of-alphabet characters explicitly: a negative offset
        # (e.g. from an uppercase letter) would otherwise be taken by NumPy
        # as a from-the-end index and silently mark the wrong column.
        if not 0 <= column < alphabet_size:
            raise ValueError(
                "unsupported character %r in word %r" % (char, word))
        encoded[position, column] = 1
    return encoded

In [12]:
# Smoke-test the encoder on a short sample word.
result = word2tensor("word")

In [13]:
# Inspect the encoded sample together with its shape.
result, result.shape

(array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 1, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
         0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 1],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

### Convert words to tensor representations and save them for fast loading

In [14]:
# Encode every word, padding to the length measured from the data rather than
# relying on word2tensor's default, so all rows stay the same size if the
# lexicon changes.
words = np.array([word2tensor(w, word_maxlen) for w in Xraw])
words.shape

(24167, 22, 28)

In [15]:
# Record the encoded array's shape, then write the array to a compressed
# .npz archive under the key 'words'.
original_shape = words.shape
np.savez_compressed('words2vec.npz', words = words)

In [16]:
# data = np.load('words2vec.npz')
# words = data['words']

### Phonemes: There are 39 phonemes, as shown below:
```
AA     odd     AA D        |   AE      at      AE T
AH      hut     HH AH T     |   AO      ought   AO T
AW      cow     K AW        |   AY      hide    HH AY D
B       be      B IY        |   CH      cheese  CH IY Z
D       dee     D IY        |   DH      thee    DH IY
EH      Ed      EH D        |   ER      hurt    HH ER T
EY      ate     EY T        |   F       fee     F IY
G       green   G R IY N    |   HH      he      HH IY
IH      it      IH T        |   IY      eat     IY T
JH      gee     JH IY       |   K       key     K IY
L       lee     L IY        |   M       me      M IY
N       knee    N IY        |   NG      ping    P IH NG
OW      oat     OW T        |   OY      toy     T OY
P       pee     P IY        |   R       read    R IY D
S       sea     S IY        |   SH      she     SH IY
T       tea     T IY        |   TH      theta   TH EY T AH
UH      hood    HH UH D     |   UW      two     T UW
V       vee     V IY        |   W       we      W IY
Y       yield   Y IY L D    |   Z       zee     Z IY
ZH      seizure S IY ZH ER  |
```

In [17]:
# Output vocabulary: a start marker, the 39 phonemes, and a stop marker.
# The markers are the two-character strings backslash-s and backslash-e; raw
# literals are used because \s and \e are not valid Python escape sequences.
phoneme_array = [
    r'\s',    'AA',    'AE',    'AH',    'AO',
    'AW',    'AY',    'B',    'CH',    'D',
    'DH',    'EH',    'ER',    'EY',    'F',
    'G',    'HH',    'IH',    'IY',    'JH',
    'K',    'L',    'M',    'N',    'NG',
    'OW',    'OY',    'P',    'R',    'S',
    'SH',    'T',    'TH',    'UH',    'UW',
    'V',    'W',    'Y',    'Z',    'ZH',  r'\e'
]

In [18]:
# Lookup table from phoneme symbol to its position in phoneme_array.
phoneme_map = {symbol: position for position, symbol in enumerate(phoneme_array)}

In [20]:
def phonemes2tensor(phonemes, phonemesmaxlen=18, phoneme_index=None):
    """One-hot encode a phoneme sequence framed by start/stop markers.

    The sequence is wrapped with the start marker (backslash-s) and the stop
    marker (backslash-e). Each symbol sets one column of its row; rows after
    the stop marker are all-zero padding.

    Args:
        phonemes: Sequence of phoneme symbols.
        phonemesmaxlen: Number of rows to pad to. A sequence longer than
            ``phonemesmaxlen - 2`` is not truncated; the result then has
            ``len(phonemes) + 2`` rows.
        phoneme_index: Mapping from symbol (markers included) to column
            index. Defaults to the module-level ``phoneme_map``. The row
            width is ``len(phoneme_index)``, which assumes the indices are
            ``0 .. len - 1``.

    Returns:
        Integer array of shape
        ``(max(phonemesmaxlen, len(phonemes) + 2), len(phoneme_index))``.

    Raises:
        KeyError: If a symbol is missing from the mapping.
    """
    if phoneme_index is None:
        phoneme_index = phoneme_map

    # Raw literals: \s and \e are not valid Python escape sequences.
    framed = [r'\s'] + list(phonemes) + [r'\e']

    encoded = np.zeros(
        (max(phonemesmaxlen, len(framed)), len(phoneme_index)), dtype=int)
    for position, symbol in enumerate(framed):
        encoded[position, phoneme_index[symbol]] = 1
    return encoded

In [21]:
# Smoke-test the phoneme encoder on one entry and show the encoding, its
# shape, and the source phoneme list side by side.
result = phonemes2tensor(Yraw[12])
result, result.shape, Yraw[12]

(array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      

In [22]:
# Encode every pronunciation, padding to the length measured from the data
# rather than relying on phonemes2tensor's default, so all rows stay the same
# size if the lexicon changes.
phonemes = np.array([phonemes2tensor(p, phonemes_maxlen) for p in Yraw])
phonemes.shape

(24167, 18, 41)

In [23]:
# NOTE(review): this records the shape of `words`, not `phonemes` — it looks
# like a copy-paste from the words cell. `original_shape` is not read again in
# the visible cells; confirm whether phonemes.shape was intended.
original_shape = words.shape
# Write the encoded phonemes to a compressed .npz archive under the key
# 'phonemes'.
np.savez_compressed('phonemes2vec.npz', phonemes = phonemes)

In [24]:
# data = np.load('phonemes2vec.npz')
# phonemes = data['phonemes']

## Approach

- Create a 2-layer stacked LSTM.
- Keep feeding one-hot character vectors until the word is exhausted.
- Take the final hypothesis of this LSTM and feed it to a Dense layer: this should be my internal representation.

- Loss measure1: Dense layer to convert internal-rep to character count
- Loss measure2: Dense layer to convert internal-rep to phoneme count - parameter for next
- Loss measure3: Dense layer to convert internal-rep to phonemes output

- Loss = rms of loss1, loss2, loss3

In [25]:
# Alternate samples between the two splits: odd positions go to training,
# even positions to testing. Stepping over the full length keeps the final
# sample when len(words) is odd; halving the length first would drop it.
indices_train = list(range(1, len(words), 2))
indices_test = list(range(0, len(words), 2))

In [26]:
# Select the training rows; the same index list is applied to both arrays so
# each word stays aligned with its pronunciation.
train_words = words[indices_train]
train_phonemes = phonemes[indices_train]

In [27]:
# Select the held-out rows; the same index list is applied to both arrays so
# each word stays aligned with its pronunciation.
test_words = words[indices_test]
test_phonemes = phonemes[indices_test]

In [28]:
# Persist the training split; each archive stores its array under a key that
# matches the variable name.
np.savez_compressed('train_words.npz', train_words=train_words)
np.savez_compressed('train_phonemes.npz', train_phonemes=train_phonemes)

In [29]:
# Persist the held-out split; each archive stores its array under a key that
# matches the variable name.
np.savez_compressed('test_words.npz', test_words=test_words)
np.savez_compressed('test_phonemes.npz', test_phonemes=test_phonemes)

In [39]:
'''Sequence to sequence example in Keras (character-level).
This script demonstrates how to implement a basic character-level
sequence-to-sequence model. We apply it to translating
short English sentences into short French sentences,
character-by-character. Note that it is fairly unusual to
do character-level machine translation, as word-level
models are more common in this domain.
# Summary of the algorithm
- We start with input sequences from a domain (e.g. English sentences)
    and corresponding target sequences from another domain
    (e.g. French sentences).
- An encoder LSTM turns input sequences to 2 state vectors
    (we keep the last LSTM state and discard the outputs).
- A decoder LSTM is trained to turn the target sequences into
    the same sequence but offset by one timestep in the future,
    a training process called "teacher forcing" in this context.
    It uses as initial state the state vectors from the encoder.
    Effectively, the decoder learns to generate `targets[t+1...]`
    given `targets[...t]`, conditioned on the input sequence.
- In inference mode, when we want to decode unknown input sequences, we:
    - Encode the input sequence into state vectors
    - Start with a target sequence of size 1
        (just the start-of-sequence character)
    - Feed the state vectors and 1-char target sequence
        to the decoder to produce predictions for the next character
    - Sample the next character using these predictions
        (we simply use argmax).
    - Append the sampled character to the target sequence
    - Repeat until we generate the end-of-sequence character or we
        hit the character limit.
'''

from __future__ import print_function

from keras.models import Model
from keras.layers import Input, LSTM, Dense
import numpy as np

batch_size = 64  # Batch size for training.
epochs = 100  # Number of epochs to train for.
latent_dim = 256  # Latent dimensionality of the encoding space.
num_samples = train_words.shape[0]  # Number of samples to train on.
# NOTE(review): num_samples is not referenced again in the visible cells.
# No data path is needed here: the tensors were built in the cells above.

# Vectorize the data.
# input_texts = []
# target_texts = []
# input_characters = set()
# target_characters = set()
# with open(data_path, 'r', encoding='utf-8') as f:
#     lines = f.read().split('\n')
# for line in lines[: min(num_samples, len(lines) - 1)]:
#     input_text, target_text = line.split('\t')
#     # We use "tab" as the "start sequence" character
#     # for the targets, and "\n" as "end sequence" character.
#     target_text = '\t' + target_text + '\n'
#     input_texts.append(input_text)
#     target_texts.append(target_text)
#     for char in input_text:
#         if char not in input_characters:
#             input_characters.add(char)
#     for char in target_text:
#         if char not in target_characters:
#             target_characters.add(char)


# Vocabulary sizes and padded sequence lengths. The token counts must agree
# with the one-hot widths produced by word2tensor (28) and phonemes2tensor (41).
num_encoder_tokens = 26 + 2 # 26 chars + 2 start stop
num_decoder_tokens = 39 + 2 # 39 phonemes + 2 start stop
max_encoder_seq_length = word_maxlen
max_decoder_seq_length = phonemes_maxlen

# Summarise the dataset dimensions for the training split.
print('Number of samples:', train_words.shape[0])
print('Number of unique input tokens:', num_encoder_tokens)
print('Number of unique output tokens:', num_decoder_tokens)
print('Max sequence length for inputs:', max_encoder_seq_length)
print('Max sequence length for outputs:', max_decoder_seq_length)

# input_token_index = dict(
#     [(char, i) for i, char in enumerate(input_characters)])
# target_token_index = dict(
#     [(char, i) for i, char in enumerate(target_characters)])

# encoder_input_data = np.zeros(
#     (len(input_texts), max_encoder_seq_length, num_encoder_tokens),
#     dtype='float32')
# decoder_input_data = np.zeros(
#     (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
#     dtype='float32')
# decoder_target_data = np.zeros(
#     (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
#     dtype='float32')

# for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
#     for t, char in enumerate(input_text):
#         encoder_input_data[i, t, input_token_index[char]] = 1.
#     for t, char in enumerate(target_text):
#         # decoder_target_data is ahead of decoder_input_data by one timestep
#         decoder_input_data[i, t, target_token_index[char]] = 1.
#         if t > 0:
#             # decoder_target_data will be ahead by one timestep
#             # and will not include the start character.
#             decoder_target_data[i, t - 1, target_token_index[char]] = 1.

encoder_input_data = train_words
decoder_input_data = train_phonemes
# Teacher forcing: the target at step t is the decoder input at step t + 1,
# and it never contains the start marker. The encoded phonemes are laid out
# as (sample, timestep, one-hot phoneme), so the shift has to be applied
# along axis 1 (time); shifting axis 2 would scramble the one-hot columns.
# The last timestep of every target stays all-zero.
decoder_target_data = np.zeros(train_phonemes.shape)
decoder_target_data[:, :-1, :] = train_phonemes[:, 1:, :]

Number of samples: 12083
Number of unique input tokens: 28
Number of unique output tokens: 41
Max sequence length for inputs: 22
Max sequence length for outputs: 18


In [None]:
# Define an input sequence and process it.
# The first (time) dimension of the input shape is left as None.
encoder_inputs = Input(shape=(None, num_encoder_tokens))
encoder = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None, num_decoder_tokens))
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                     initial_state=encoder_states)
# Project each decoder step onto a softmax over the output vocabulary.
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Run training
# 20% of the supplied samples are held out for validation
# (the log below reports 9666 training / 2417 validation samples).
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.2)
# Save model
model.save('s2s.h5')

Train on 9666 samples, validate on 2417 samples
Epoch 1/100


In [None]:
# Next: inference mode (sampling).
# Here's the drill:
# 1) encode input and retrieve initial decoder state
# 2) run one step of decoder with this initial state
# and a "start of sequence" token as target.
# Output will be the next target token
# 3) Repeat with the current target token and current states

# Define sampling models
encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)

# Reverse-lookup token index to decode sequences back to
# something readable.
# Input side: word2tensor stores a character in column ord(ch) - ord('`'),
# so the column index maps straight back to the character.
reverse_input_char_index = dict(
    (i, chr(i + ord('`'))) for i in range(num_encoder_tokens))
# Output side: invert the phoneme -> column table used by phonemes2tensor.
reverse_target_char_index = dict(
    (i, phoneme) for phoneme, i in phoneme_map.items())


def decode_sequence(input_seq):
    """Greedily decode one encoded word into a phoneme string.

    Args:
        input_seq: One-hot word tensor holding a single sample (a batch of
            size 1), in the layout fed to the encoder.

    Returns:
        The predicted phonemes joined by single spaces. The start and stop
        markers are not included.
    """
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    # Populate the first step of the target sequence with the start marker
    # (raw literal: \s is not a valid Python escape sequence).
    target_seq[0, 0, phoneme_map[r'\s']] = 1.

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    decoded_phonemes = []
    while True:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)

        # Sample a token (greedy: take the most probable one).
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        sampled_phoneme = reverse_target_char_index[sampled_token_index]

        # Exit condition: either find the stop marker
        # or hit the maximum length, counted in phonemes.
        if sampled_phoneme == r'\e':
            break
        decoded_phonemes.append(sampled_phoneme)
        if len(decoded_phonemes) >= max_decoder_seq_length:
            break

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.

        # Update states
        states_value = [h, c]

    return ' '.join(decoded_phonemes)


for seq_index in range(100):
    # Take one sequence (part of the training set)
    # for trying out decoding.
    input_seq = encoder_input_data[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    # Recover the spelling from the one-hot rows: skip all-zero padding rows,
    # then drop the start ('`') and stop ('{') markers.
    input_word = ''.join(
        reverse_input_char_index[int(row.argmax())]
        for row in input_seq[0] if row.any()).strip('`{')
    print('-')
    print('Input sentence:', input_word)
    print('Decoded sentence:', decoded_sentence)