In [111]:
import numpy as np

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.layers import Embedding, Input, LSTM, Dense
from keras.models import Model
from keras.optimizers import Adam

In [2]:
# Passed to the Tokenizer as num_words and used to cap NUM_WORDS / embedding rows.
MAX_VOCAB_SIZE = 3000
# Width of the word vectors; also selects which glove.6B.<dim>d.txt file is read.
EMBEDDING_DIM = 50
# Fraction of the samples that model.fit holds out for validation.
VALIDATION_SPLIT = 0.2

### Get input data

In [3]:
# Build two parallel corpora from the poem file: every non-blank line becomes
# an input ('<sos> ' + line) and a target (line + ' <eos>'), i.e. the target is
# the input shifted one token to the left.
input_lines = []
target_lines = []

# `with` guarantees the file handle is closed once the loop finishes (the bare
# open() in a for-loop leaves it open until garbage collection).
with open('data/robert_frost.txt') as poem_file:
    for line in poem_file:
        temp_line = line.strip()

        # skip blank lines (stanza separators)
        if not temp_line:
            continue

        input_lines.append('<sos> ' + temp_line)
        target_lines.append(temp_line + ' <eos>')

len(input_lines), len(target_lines)

(1436, 1436)

### Tokenizing input data

In [4]:
# Combine inputs and targets so the tokenizer sees both '<sos>' and '<eos>'.
all_text = list(input_lines)
all_text.extend(target_lines)
len(all_text)

2872

In [5]:
# filters='' turns off the Tokenizer's character filtering, so the '<sos>' / '<eos>'
# markers and punctuation attached to words are kept as part of the tokens.
tokenizer_ = Tokenizer(num_words=MAX_VOCAB_SIZE, filters='')
tokenizer_.fit_on_texts(all_text)

In [6]:
## Total number of unique words
# word_index lists every word seen during fitting, so its size can exceed
# MAX_VOCAB_SIZE (the num_words limit is not applied to this dictionary).
word2ind = tokenizer_.word_index
len(word2ind)

3056

In [7]:
# Convert both corpora to lists of word indices (inputs first, then targets).
input_sequences, target_sequences = (
    tokenizer_.texts_to_sequences(lines)
    for lines in (input_lines, target_lines)
)

In [8]:
# Sanity check: first target line next to its index sequence.
target_lines[0], target_sequences[0]

('Two roads diverged in a yellow wood, <eos>',
 [104, 537, 538, 9, 7, 539, 540, 2])

In [9]:
# Number of vocabulary slots the model uses: the vocabulary size plus one
# (the padded sequences use index 0 as filler), capped at MAX_VOCAB_SIZE.
NUM_WORDS = min(MAX_VOCAB_SIZE, len(word2ind) + 1)
NUM_WORDS

3000

### Padding data

In [10]:
## longest input sequence, in tokens
MAX_SEQUENCE_LEN = max(map(len, input_sequences))
MAX_SEQUENCE_LEN

12

In [11]:
# padding='post' appends the zero filler after the real tokens, so every row has
# exactly MAX_SEQUENCE_LEN entries and inputs/targets stay aligned step by step.
padded_input_sequences = pad_sequences(input_sequences, maxlen=MAX_SEQUENCE_LEN, padding='post')
padded_target_sequences = pad_sequences(target_sequences, maxlen=MAX_SEQUENCE_LEN, padding='post')

In [12]:
# Inspect the first padded target row (trailing zeros are the padding).
padded_target_sequences[0]

array([104, 537, 538,   9,   7, 539, 540,   2,   0,   0,   0,   0],
      dtype=int32)

In [13]:
### Need to one-hot the targets because cannot use sparse-categorical-crossentropy for list of outputs
# Shape: (number of lines, MAX_SEQUENCE_LEN, NUM_WORDS). float32 halves the memory
# of this large tensor compared with NumPy's default float64.
one_hot_targets = np.zeros(
    (len(padded_target_sequences), MAX_SEQUENCE_LEN, NUM_WORDS),
    dtype='float32',
)

# Mark real tokens only. Index 0 is the filler written by pad_sequences, not a
# word: padded time steps keep an all-zero target row, so they add nothing to the
# categorical cross-entropy and the model is not trained to predict padding.
rows, steps = np.nonzero(padded_target_sequences)
one_hot_targets[rows, steps, padded_target_sequences[rows, steps]] = 1

In [14]:
# Expect (number of lines, MAX_SEQUENCE_LEN, NUM_WORDS).
one_hot_targets.shape

(1436, 12, 3000)

### Prepare embeddings

In [15]:
# Load the pretrained GloVe vectors into a {word: vector} dictionary.
# Each line of the file is "<word> <v1> <v2> ... <vN>" separated by single spaces.
word2vec = {}

# - `with` closes the (large) file as soon as it has been read.
# - The GloVe files are UTF-8; an explicit encoding avoids platform-dependent decoding.
# - float32 keeps the published precision of the vectors (float16 keeps only
#   about three significant digits).
with open('glove.6B/glove.6B.{0}d.txt'.format(EMBEDDING_DIM), encoding='utf-8') as glove_file:
    for line in glove_file:
        temp = line.strip().split(' ')
        word = temp[0]
        word2vec[word] = np.asarray(temp[1:], dtype='float32')

print(len(word2vec))

400000


In [16]:
# word2vec['is']

In [17]:
# Row `idx` of the matrix receives the GloVe vector of the word whose tokenizer
# index is `idx`; words without a GloVe entry keep an all-zero row.
embedding_matrix = np.zeros((NUM_WORDS, EMBEDDING_DIM))
for word, idx in word2ind.items():
    # indices at or beyond the vocabulary cap have no row in the matrix
    if not idx < MAX_VOCAB_SIZE:
        continue

    vector = word2vec.get(word)
    if vector is None:
        continue

    embedding_matrix[idx] = vector

print(embedding_matrix.shape)

(3000, 50)


In [18]:
# Independent copy of the embedding matrix.
# NOTE(review): not referenced by any other cell visible here - confirm it is still needed.
embedding_matrix_2 = embedding_matrix.copy()

### Model

In [20]:
## create the embedding_layer
# Initialised from the GloVe-based embedding_matrix. `trainable` is not set here,
# so the Keras default applies. The same layer object is reused by the sampling model.
embedding_layer = Embedding(input_dim=NUM_WORDS, output_dim=EMBEDDING_DIM, weights=[embedding_matrix])




In [21]:
# Number of LSTM units; also the width of the h / c state vectors fed to the model.
LATENT_DIM = 15

In [22]:
# Token-index input: MAX_SEQUENCE_LEN indices per sample.
input_ = Input(shape=(MAX_SEQUENCE_LEN, ))
# Initial hidden and cell state of the LSTM, exposed as model inputs so that the
# generation loop can pass the state returned by one step into the next step.
initial_h = Input(shape=(LATENT_DIM, ))
initial_c = Input(shape=(LATENT_DIM, ))




In [23]:
# Training model: token indices -> embeddings -> LSTM -> per-step softmax.
x = embedding_layer(input_)

# return_sequences=True gives one output per time step; return_state=True also
# returns the final h and c, which are discarded here (`_`) but used when sampling.
lstm1 = LSTM(LATENT_DIM, return_sequences=True, return_state=True)
x, _, _ = lstm1(x, initial_state = [initial_h, initial_c])

# Softmax over the NUM_WORDS vocabulary slots at every time step.
dense = Dense(NUM_WORDS, activation='softmax')
output = dense(x)

model = Model([input_,initial_h,initial_c], output)

# categorical_crossentropy pairs with the one-hot encoded targets (one_hot_targets).
# NOTE(review): newer Keras releases call this Adam argument `learning_rate` -
# confirm against the installed version before upgrading.
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(lr=0.01),
              metrics=['accuracy'])







In [26]:
# All-zero initial state, one row per sample; the same array is passed for both
# the hidden state and the cell state.
z = np.zeros((len(padded_input_sequences), LATENT_DIM))

# model.fit(padded_input_sequences,
model.fit([padded_input_sequences, z, z],
          one_hot_targets,
          epochs=20,
          batch_size=5,
          validation_split=VALIDATION_SPLIT)

Train on 1148 samples, validate on 288 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x14c0632d0>

### Making the predictions

In [117]:
# Single-step model used for generation. It reuses the embedding_layer, lstm1 and
# dense objects from the training model, but takes one token at a time and also
# outputs the updated h and c so they can be fed back in on the next step.
input_2 = Input(shape=(1, ))
x = embedding_layer(input_2)
x, h, c = lstm1(x, initial_state = [initial_h, initial_c])
output2 = dense(x)

sampling_model = Model([input_2,initial_h,initial_c], [output2, h, c])

In [109]:
# Reverse lookup: tokenizer index -> word.
ind2word = {index: word for word, index in word2ind.items()}
len(ind2word)

3056

In [118]:
def generate_poem():
    """Sample one line of text from the trained model, one token at a time.

    Generation starts from the '<sos>' token with an all-zero LSTM state. At each
    step `sampling_model` returns the next-token distribution plus the updated
    state; a token is drawn at random from that distribution and fed back in
    together with the state. The loop ends when '<eos>' is drawn or after
    MAX_SEQUENCE_LEN steps.

    Returns:
        str: the sampled words joined by single spaces (empty if '<eos>' is
        drawn on the first step).
    """
    sos = word2ind['<sos>']
    eos = word2ind['<eos>']

    input_p = np.array([[sos]])
    h = np.zeros((1, LATENT_DIM))
    c = np.zeros((1, LATENT_DIM))

    generated_output = []

    for _ in range(MAX_SEQUENCE_LEN):
        p, h, c = sampling_model.predict([input_p, h, c])

        # probabilities for the single time step; a float64 copy keeps the
        # renormalisation below accurate
        probs = p[0, 0].astype('float64')

        # if '<sos>' is the most likely next token, print a warning
        if np.argmax(probs) == sos:
            print('wtf')

        # Index 0 is the padding slot (no word maps to it) and '<sos>' must never
        # appear inside a generated line, so neither may be sampled.
        probs[0] = 0
        probs[sos] = 0
        probs /= probs.sum()

        # randomly choose an index from the distribution so that every call
        # can produce a different line
        index = int(np.random.choice(len(probs), p=probs))

        if index == eos:
            break

        generated_output.append(ind2word.get(index, '<WTF> %s'% index))
        input_p[0, 0] = index

    return ' '.join(generated_output)

In [129]:
# Sampling is random, so each run of this cell prints a different line.
print(generate_poem())

(i stole the sockets of spray;
