In [2]:
import re
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
import pandas as pd 
import tensorflow as tf

In [3]:
with  open('/Users/chess1812/Documents/GitHub/EPAM_Tasks/Task4(grapheme-to-phoneme)/cmudict.txt') as file:
    lines = []
    for line in file:
        lines.append(line.split())

print(' amount of data', len(lines))

 amount of data 135154


In [4]:
print("some examples:")
for example in lines[0:5]:
    print(example)

some examples:
["'bout", 'B', 'AW1', 'T']
["'cause", 'K', 'AH0', 'Z']
["'course", 'K', 'AO1', 'R', 'S']
["'cuse", 'K', 'Y', 'UW1', 'Z']
["'em", 'AH0', 'M']


In [5]:
graphemes = [ list(line[0]) for line in lines]
phonemes = [ line[1::] for line in lines]

In [6]:
from sklearn.model_selection import train_test_split
texts_train, texts_test, phoneme_train, phoneme_test = train_test_split(graphemes, phonemes, test_size=0.1, random_state=42)

In [7]:
from keras.preprocessing.text import Tokenizer

letter_tokenizer = Tokenizer()             
letter_tokenizer.fit_on_texts(texts_train)         

X_train = letter_tokenizer.texts_to_sequences(texts_train)
X_test = letter_tokenizer.texts_to_sequences(texts_test)

# look at first encoded data point
print("initial  data  example: \n", texts_train[0])
print(" Encoded data  example: \n", X_train[0])

initial  data  example: 
 ['s', 'e', 't', 't', 'l', 'e', 'm', 'e', 'n', 't']
 Encoded data  example: 
 [5, 1, 8, 8, 9, 1, 12, 1, 6, 8]


Add "start" and "end"  symbol

In [8]:
def tagger(decoder_input):
    bos = "<BOS> "
    eos = " <EOS>"
    final_target = decoder_input.copy()
    final_target.append(eos)
    final_target.insert(0, bos)
    return final_target

In [9]:
phoneme_train = [tagger(phoneme) for phoneme in phoneme_train ]
phoneme_test = [tagger(phoneme) for phoneme in phoneme_test ]

In [10]:
phoneme_tokenizer = Tokenizer()             
phoneme_tokenizer.fit_on_texts(phoneme_train)         

y_train = phoneme_tokenizer.texts_to_sequences(phoneme_train)
y_test = phoneme_tokenizer.texts_to_sequences(phoneme_test)

In [11]:
# look at first encoded data point
print("initial  data  example: \n", phoneme_train[0])
print(" Encoded data  example: \n", y_train[0])

initial  data  example: 
 ['<BOS> ', 'S', 'EH1', 'T', 'AH0', 'L', 'M', 'AH0', 'N', 'T', ' <EOS>']
 Encoded data  example: 
 [1, 5, 17, 7, 3, 6, 12, 3, 4, 7, 2]


In [12]:
VOCAB_INPUT_SIZE = len(letter_tokenizer.word_counts) + 1
VOCAB_OUTPUT_SIZE = len(phoneme_tokenizer.word_counts) + 1
VOCAB_SIZE = max(len(letter_tokenizer.word_counts), len(phoneme_tokenizer.word_counts)) + 1
print('number of different characters:', VOCAB_INPUT_SIZE)
print('number of different phonems:', VOCAB_OUTPUT_SIZE)

number of different characters: 35
number of different phonems: 87


In [13]:
INPUT_MAX_LEN = max(len(x) for x in X_train)
OUTPUT_MAX_LEN = max(len(y) for y in y_train)
MAX_LEN =  max(INPUT_MAX_LEN, OUTPUT_MAX_LEN)
print('input max len:', INPUT_MAX_LEN )
print('output max len:', OUTPUT_MAX_LEN)


input max len: 28
output max len: 30


In [14]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

X_train_pad = pad_sequences(X_train, maxlen=INPUT_MAX_LEN, padding="post", truncating="post")
X_test_pad = pad_sequences(X_test, maxlen=INPUT_MAX_LEN, padding="post", truncating="post")

In [15]:
y_train_pad = pad_sequences(y_train, maxlen=OUTPUT_MAX_LEN, padding="post", truncating="post")
y_test_pad = pad_sequences(y_test, maxlen=OUTPUT_MAX_LEN, padding="post", truncating="post")

In [16]:
from tensorflow.keras import layers, Sequential, Input, Model
from tensorflow.keras.layers import LSTM, TimeDistributed, Dense, Embedding
from tensorflow.keras.callbacks import ModelCheckpoint

EMBEDDING_INPUT_SIZE = VOCAB_SIZE
EMBEDDING_OUTPUT_SIZE = VOCAB_SIZE

def seq2seq_model_builder(HIDDEN_DIM=300):
    
    encoder_inputs = Input(shape=(INPUT_MAX_LEN, ), dtype='int32',)
    input_embed_layer =  Embedding(input_dim = VOCAB_INPUT_SIZE,  
                            output_dim = EMBEDDING_INPUT_SIZE, 
                            input_length = INPUT_MAX_LEN)
    
    encoder_embedding = input_embed_layer(encoder_inputs)
    encoder_LSTM = LSTM(HIDDEN_DIM, return_state=True)
    encoder_outputs, state_h, state_c = encoder_LSTM(encoder_embedding)
    print(state_h.shape)
    
    
    decoder_inputs = Input(shape=(OUTPUT_MAX_LEN, ), dtype='int32',)
    output_embed_layer =  Embedding(input_dim = VOCAB_OUTPUT_SIZE,
                            output_dim = EMBEDDING_OUTPUT_SIZE, 
                            input_length = OUTPUT_MAX_LEN)
    
    decoder_embedding = output_embed_layer(decoder_inputs)
    

    decoder_LSTM = LSTM(HIDDEN_DIM, return_state=True, return_sequences=True)
    decoder_outputs, _, _ = decoder_LSTM(decoder_embedding,) #initial_state=[state_h, state_c])
    
    print(decoder_embedding.shape)
    # dense_layer = Dense(VOCAB_SIZE, activation='softmax')
    outputs = TimeDistributed(Dense(VOCAB_OUTPUT_SIZE, activation='softmax'))(decoder_outputs)
    model = Model([encoder_inputs, decoder_inputs], outputs)
    
    return model

In [17]:
model = seq2seq_model_builder(HIDDEN_DIM=16)

model.compile(optimizer='adam', 
              loss='crossentropy', 
              metrics=['accuracy'])
model.summary()

(None, 16)
(None, 30, 87)
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 30)]         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 30, 87)       7569        input_2[0][0]                    
__________________________________________________________________________________________________
lstm_1 (LSTM)                   [(None, 30, 16), (No 6656        embedding_1[0][0]                
__________________________________________________________________________________________________
input_1 (InputLayer)            [(None, 28)]         0                                            
____________________________________________________________________

In [18]:
def decoder_output_creater(decoder_input_data, num_samples, max_len, vocab_size):
  
    decoder_output_data = np.zeros((num_samples, max_len, vocab_size), dtype="float32")

    for i, seqs in enumerate(decoder_input_data):
        for j, seq in enumerate(seqs):
            if j > 0:
                decoder_output_data[i][j][seq] = 1
  
    return decoder_output_data

train_data = decoder_output_creater(X_train_pad, len(X_train_pad), INPUT_MAX_LEN, VOCAB_INPUT_SIZE )
test_data = decoder_output_creater(X_test_pad, len(X_test_pad), INPUT_MAX_LEN, VOCAB_INPUT_SIZE)

y_train_data = decoder_output_creater(y_train_pad, len(y_train_pad), OUTPUT_MAX_LEN, VOCAB_OUTPUT_SIZE )
y_test_data = decoder_output_creater(y_train_pad, len(y_train_pad), OUTPUT_MAX_LEN, VOCAB_OUTPUT_SIZE )
print(train_data.shape)

(121638, 28, 35)


In [19]:
#model(decoder_output_data)
model.fit(np.array(train_data), np.array(y_train_pad),epochs=5,
                              batch_size=128, validation_data = [np.array(X_test_pad), np.array(y_test_data)])

Epoch 1/5


AssertionError: in user code:

    /Applications/anaconda3/anaconda3/lib/python3.6/site-packages/tensorflow/python/keras/engine/training.py:571 train_function  *
        outputs = self.distribute_strategy.run(
    /Applications/anaconda3/anaconda3/lib/python3.6/site-packages/tensorflow/python/distribute/distribute_lib.py:951 run  **
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    /Applications/anaconda3/anaconda3/lib/python3.6/site-packages/tensorflow/python/distribute/distribute_lib.py:2290 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    /Applications/anaconda3/anaconda3/lib/python3.6/site-packages/tensorflow/python/distribute/distribute_lib.py:2649 _call_for_each_replica
        return fn(*args, **kwargs)
    /Applications/anaconda3/anaconda3/lib/python3.6/site-packages/tensorflow/python/keras/engine/training.py:531 train_step  **
        y_pred = self(x, training=True)
    /Applications/anaconda3/anaconda3/lib/python3.6/site-packages/tensorflow/python/keras/engine/base_layer.py:927 __call__
        outputs = call_fn(cast_inputs, *args, **kwargs)
    /Applications/anaconda3/anaconda3/lib/python3.6/site-packages/tensorflow/python/keras/engine/network.py:719 call
        convert_kwargs_to_constants=base_layer_utils.call_context().saving)
    /Applications/anaconda3/anaconda3/lib/python3.6/site-packages/tensorflow/python/keras/engine/network.py:899 _run_internal_graph
        assert str(id(x)) in tensor_dict, 'Could not compute output ' + str(x)

    AssertionError: Could not compute output Tensor("time_distributed/Identity:0", shape=(None, 30, 87), dtype=float32)


In [20]:
np.array(X_train_pad).shape

(121638, 28)