In [1]:
import sys,os
import io
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, LSTM, Bidirectional, Embedding

Using TensorFlow backend.


In [0]:
!pip install -q pyyaml h5py 

In [0]:
from keras.callbacks import LambdaCallback, ModelCheckpoint, EarlyStopping

In [4]:
corpus = "en_US_3.txt"
with io.open(corpus, encoding='utf-8') as f:
    text = f.read().lower().replace('\n', ' \n ')
print('Corpus length in characters:', len(text))

text_in_words = [w for w in text.split(' ') if w.strip() != '' or w == '\n']
print('Corpus length in words:', len(text_in_words))

Corpus length in characters: 4229397
Corpus length in words: 818620


In [0]:
MIN_WORD_FREQUENCY = 2

In [6]:
# Calculate word frequency
word_freq = {}
for word in text_in_words:
    word_freq[word] = word_freq.get(word, 0) + 1

ignored_words = set()
for k, v in word_freq.items():
    if word_freq[k] < MIN_WORD_FREQUENCY:
        ignored_words.add(k)

words = set(text_in_words)
print('Unique words before ignoring:', len(words))
print('Ignoring words with frequency <', MIN_WORD_FREQUENCY)
words = sorted(set(words) - ignored_words)
print('Unique words after ignoring:', len(words))

word_indices = dict((c, i) for i, c in enumerate(words))
indices_word = dict((i, c) for i, c in enumerate(words))

Unique words before ignoring: 86222
Ignoring words with frequency < 2
Unique words after ignoring: 27195


In [0]:
SEQUENCE_LEN = 10

In [8]:
# cut the text in semi-redundant sequences of SEQUENCE_LEN words
STEP = 1
sentences = []
next_words = []
ignored = 0
for i in range(0, len(text_in_words) - SEQUENCE_LEN, STEP):
    # Only add sequences where no word is in ignored_words
    if len(set(text_in_words[i: i+SEQUENCE_LEN+1]).intersection(ignored_words)) == 0:
        sentences.append(text_in_words[i: i + SEQUENCE_LEN])
        next_words.append(text_in_words[i + SEQUENCE_LEN])
    else:
        ignored = ignored+1
print('Ignored sequences:', ignored)
print('Remaining sequences:', len(sentences))

Ignored sequences: 437310
Remaining sequences: 381300


In [0]:
def shuffle_and_split_training_set(sentences_original, next_original, percentage_test=2):
    # shuffle at unison
    print('Shuffling sentences')

    tmp_sentences = []
    tmp_next_word = []
    for i in np.random.permutation(len(sentences_original)):
        tmp_sentences.append(sentences_original[i])
        tmp_next_word.append(next_original[i])

    cut_index = int(len(sentences_original) * (1.-(percentage_test/100.)))
    x_train, x_test = tmp_sentences[:cut_index], tmp_sentences[cut_index:]
    y_train, y_test = tmp_next_word[:cut_index], tmp_next_word[cut_index:]

    print("Size of training set = %d" % len(x_train))
    print("Size of test set = %d" % len(y_test))
    return x_train, y_train,x_test, y_test

In [10]:
sentences, next_words, sentences_test, next_words_test = shuffle_and_split_training_set(sentences, next_words)

Shuffling sentences
Size of training set = 373674
Size of test set = 7626


In [0]:
def define_model(dropout = 0.3):
    model = Sequential()
    model.add(Bidirectional(LSTM(128), input_shape=(SEQUENCE_LEN, len(words))))
    if dropout > 0:
        model.add(Dropout(dropout))
    model.add(Dense(len(words)))
    print(len(words))
    model.add(Activation('softmax'))
    return model

In [0]:
def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [0]:
def generator(sentence_list, next_word_list, batch_size):
    index = 0
    while True:
        x = np.zeros((batch_size, SEQUENCE_LEN, len(words)), dtype=np.bool)
        y = np.zeros((batch_size, len(words)), dtype=np.bool)
        for i in range(batch_size):
            for t, w in enumerate(sentence_list[index]):
                x[i, t, word_indices[w]] = 1
            y[i, word_indices[next_word_list[index]]] = 1

            index = index + 1
            if index == len(sentence_list):
                index = 0
        yield x, y

In [0]:
examples = "examples.txt"
examples_file = open(examples, "w+")

In [0]:
def on_epoch_end(epoch, logs):
    # Function invoked at end of each epoch. Prints generated text.
    examples_file.write('\n----- Generating text after Epoch: %d\n' % epoch)

    # Randomly pick a seed sequence
    seed_index = np.random.randint(len(sentences+sentences_test))
    seed = (sentences+sentences_test)[seed_index]

    for diversity in [0.3, 0.4, 0.5, 0.6, 0.7]:
        sentence = seed
        examples_file.write('----- Diversity:' + str(diversity) + '\n')
        examples_file.write('----- Generating with seed:\n"' + ' '.join(sentence) + '"\n')
        examples_file.write(' '.join(sentence))

        for i in range(50):
            x_pred = np.zeros((1, SEQUENCE_LEN, len(words)))
            for t, word in enumerate(sentence):
                x_pred[0, t, word_indices[word]] = 1

            preds = model.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_word = indices_word[next_index]

            sentence = sentence[1:]
            sentence.append(next_word)

            examples_file.write(" "+next_word)
        examples_file.write('\n')
    examples_file.write('='*80 + '\n')
    examples_file.flush()

In [0]:
file_path = "/content/checkpoints/LSTM_LYRICS-epoch{epoch:03d}-words%d-sequence%d-minfreq%d-loss{loss:.4f}-acc{categorical_accuracy:.4f}-val_loss{val_loss:.4f}-val_acc{val_categorical_accuracy:.4f}" % (
    len(words),
    SEQUENCE_LEN,
    MIN_WORD_FREQUENCY
)
checkpoint_dir = os.path.dirname(file_path)
checkpoint = ModelCheckpoint(file_path, monitor='categorical_accuracy', save_best_only=True)
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)
early_stopping = EarlyStopping(monitor='categorical_accuracy', patience=4)
callbacks_list = [checkpoint, print_callback, early_stopping]

In [17]:
model = define_model()





Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
27195


In [0]:
BATCH_SIZE = 500

In [19]:
model.compile(loss='categorical_crossentropy', optimizer="adam", metrics=['categorical_accuracy'])





In [20]:
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_1 (Bidirection (None, 256)               27979776  
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 27195)             6989115   
_________________________________________________________________
activation_1 (Activation)    (None, 27195)             0         
Total params: 34,968,891
Trainable params: 34,968,891
Non-trainable params: 0
_________________________________________________________________
None


In [21]:
model.fit_generator(generator(sentences, next_words, BATCH_SIZE),
    steps_per_epoch=int(len(sentences)/BATCH_SIZE) + 1,
    epochs=20,
    callbacks=callbacks_list,
    validation_data=generator(sentences_test, next_words_test, BATCH_SIZE),
                    validation_steps=int(len(sentences_test)/BATCH_SIZE) + 1)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where



Epoch 1/20





Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f715df7ec18>

In [22]:
!ls {checkpoint_dir}

LSTM_LYRICS-epoch001-words27195-sequence10-minfreq2-loss7.1826-acc0.0679-val_loss7.0450-val_acc0.0711
LSTM_LYRICS-epoch002-words27195-sequence10-minfreq2-loss6.8263-acc0.0810-val_loss6.6899-val_acc0.1119
LSTM_LYRICS-epoch003-words27195-sequence10-minfreq2-loss6.3676-acc0.1308-val_loss6.4339-val_acc0.1385
LSTM_LYRICS-epoch004-words27195-sequence10-minfreq2-loss6.0412-acc0.1518-val_loss6.3334-val_acc0.1475
LSTM_LYRICS-epoch005-words27195-sequence10-minfreq2-loss5.7663-acc0.1674-val_loss6.3246-val_acc0.1484
LSTM_LYRICS-epoch006-words27195-sequence10-minfreq2-loss5.4892-acc0.1824-val_loss6.3670-val_acc0.1455
LSTM_LYRICS-epoch007-words27195-sequence10-minfreq2-loss5.2116-acc0.1979-val_loss6.5029-val_acc0.1408
LSTM_LYRICS-epoch008-words27195-sequence10-minfreq2-loss4.9408-acc0.2142-val_loss6.6048-val_acc0.1369
LSTM_LYRICS-epoch009-words27195-sequence10-minfreq2-loss4.6822-acc0.2325-val_loss6.7198-val_acc0.1338
LSTM_LYRICS-epoch010-words27195-sequence10-minfreq2-loss4.4299-acc0.2532-val_loss6

In [0]:
model.load_weights("/content/checkpoints/LSTM_LYRICS-epoch005-words27195-sequence10-minfreq2-loss5.7663-acc0.1674-val_loss6.3246-val_acc0.1484")

In [24]:
loss,acc = model.evaluate(generator(sentences_test,  next_words_test,BATCH_SIZE), verbose=2,steps=int(len(sentences_test)/BATCH_SIZE) + 1)
print("Restored model, accuracy: {:5.2f}%".format(100*acc))

Restored model, accuracy: 14.88%


In [25]:
words_number = 50 # number of words to generate
seed_sentences = "Desk put together, room all set up. Oh boy, oh" #seed sentence to start the generating.
seed_sentences = seed_sentences.lower()
#initiate sentences
generated = ''
sentence = []

#we shate the seed accordingly to the neural netwrok needs:
for i in range (SEQUENCE_LEN):
    sentence.append("a")

seed = seed_sentences.split()

for i in range(len(seed)):
    sentence[SEQUENCE_LEN-i-1]=seed[len(seed)-i-1]

generated += ' '.join(sentence)

#the, we generate the text
for i in range(words_number):
    #create the vector
    x = np.zeros((1, SEQUENCE_LEN, len(words)))
    for t, word in enumerate(sentence):
        x[0, t, word_indices[word]] = 1.

    #calculate next word
    preds = model.predict(x, verbose=0)[0]
    next_index = sample(preds, 0.33)
    next_word = indices_word[next_index]

    #add the next word to the text
    generated += " " + next_word
    # shift the sentence by one, and and the next word at its end
    sentence = sentence[1:] + [next_word]

#print the whole text
print(generated)

desk put together, room all set up. oh boy, oh i don't love it 
 i can't wait to see you at the best of the world 
 i love to see you at the world but i think i was like a great service in the rest of the world and the best day is in the way to
