In [1]:
from tensorflow.keras.datasets import reuters
import tensorflow as tf

import numpy as np

In [2]:
(train_data, train_labels), (test_data, test_labels) = reuters.load_data()
word_indexes = reuters.get_word_index()

word_indexes["OOV"] = 0
word_indexes["START"] = -1
word_indexes["PAD"] = -2
index_words = {v: k for k, v in word_indexes.items()}

In [3]:
# vocab size
vocab_size = len(word_indexes)
print(vocab_size)
print(len(train_data))

30982
8982


In [4]:
window_size = 6

windows = [
    window
    for sent in train_data
    for window in zip(*[iter(sent[i:]) for i in range(window_size)])
]

test_windows = [
    window
    for sent in test_data
    for window in zip(*[iter(sent[i:]) for i in range(window_size)])
]

print(len(windows))
print(windows[:10])

1262329
[(1, 27595, 28842, 8, 43, 10), (27595, 28842, 8, 43, 10, 447), (28842, 8, 43, 10, 447, 5), (8, 43, 10, 447, 5, 25), (43, 10, 447, 5, 25, 207), (10, 447, 5, 25, 207, 270), (447, 5, 25, 207, 270, 5), (5, 25, 207, 270, 5, 3095), (25, 207, 270, 5, 3095, 111), (207, 270, 5, 3095, 111, 16)]


In [5]:
window_train = [tuple(window[:window_size - 1]) for window in windows]
window_train_labels = [window[window_size - 1] for window in windows]

window_test = [tuple(window[:window_size - 1]) for window in test_windows]
window_test_labels = [window[window_size - 1:][0] for window in test_windows]

SHUFFLE_BUFFER_SIZE = 500
BATCH_SIZE = 32
train_dataset = tf.data.Dataset.from_tensor_slices((window_train, window_train_labels))\
                    .shuffle(SHUFFLE_BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
test_dataset = tf.data.Dataset.from_tensor_slices((window_test, window_test_labels))\
                    .shuffle(SHUFFLE_BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

print(window_train[0])
print(window_train_labels[0])

print(window_test[0])
print(window_test_labels[0])

(1, 27595, 28842, 8, 43)
10
(1, 4, 1378, 2025, 9)
697


In [6]:
print([index_words[i-3] for i in window_train[0]])
print(index_words[window_train_labels[0]])
print([index_words[i-3] for i in window_test[0]])
print(index_words[window_test_labels[0]])

['PAD', 'mcgrath', 'rentcorp', 'said', 'as']
for
['PAD', 'the', 'great', 'atlantic', 'and']
how


In [11]:
embedding_size = 32

input_layer = tf.keras.layers.Input(shape=(window_size-1,))
embedding = tf.keras.layers.Embedding(vocab_size + 1, embedding_size)(input_layer)
flattened = tf.keras.layers.Flatten()(embedding)
dense_connected = tf.keras.layers.Dense(10, activation='relu')(flattened)
concat = tf.keras.layers.Concatenate()([flattened, dense_connected])
prediction = tf.keras.layers.Dense(vocab_size, activation='sigmoid')(concat)

model = tf.keras.models.Model(inputs=[input_layer], outputs=[prediction])

model.compile(
    'adam',
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none'),
    metrics=['accuracy'])

model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, 5)]          0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 5, 32)        991456      input_3[0][0]                    
__________________________________________________________________________________________________
flatten_2 (Flatten)             (None, 160)          0           embedding_2[0][0]                
__________________________________________________________________________________________________
dense_4 (Dense)                 (None, 10)           1610        flatten_2[0][0]                  
____________________________________________________________________________________________

In [12]:
model.fit(train_dataset, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f2d1c6b2e48>

In [13]:
model.evaluate(test_dataset)



[array([9.58605  , 9.581945 , 9.588014 , 9.593565 , 9.5833435, 9.587584 ,
        9.58732  , 9.587653 , 9.588189 , 9.590981 , 9.592483 , 9.58372  ,
        9.581615 , 9.579176 , 9.590049 , 9.592733 , 9.58757  , 9.581211 ,
        9.595586 , 9.593242 , 9.592864 , 9.580015 , 9.591894 , 9.583114 ,
        9.588044 , 9.586556 , 9.586377 , 9.596849 , 9.582156 , 9.585856 ,
        9.59498  , 9.588201 ], dtype=float32), 0.07950914]