In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow.compat.v2 as tf2
import tensorflow_datasets as tfds
import datetime

In [2]:
(train_data, test_data), info = tfds.load(
    'imdb_reviews/subwords8k', 
    split = (tfds.Split.TRAIN, tfds.Split.TEST), 
    with_info=True, as_supervised=True)



In [3]:
encoder = info.features['text'].encoder
train_batches = train_data.shuffle(1000).padded_batch(10)
test_batches = test_data.shuffle(1000).padded_batch(10)
encoder.subwords[:20]

['the_',
 ', ',
 '. ',
 'a_',
 'and_',
 'of_',
 'to_',
 's_',
 'is_',
 'br',
 'in_',
 'I_',
 'that_',
 'this_',
 'it_',
 ' /><',
 ' />',
 'was_',
 'The_',
 'as_']

In [4]:
max_input_len = encoder.vocab_size
encoded_dims = 16
inputs = layers.Input(shape=(max_input_len))
embed = layers.Embedding( encoder.vocab_size, encoded_dims)(inputs)
print(embed.shape)
avgpool = layers.GlobalAveragePooling1D()(embed) 
print(avgpool.shape)
dense1 = layers.Dense(16, activation='relu')(avgpool)
print( dense1.shape)
dense2 = layers.Dense(1)(dense1)
print( dense2.shape)


(None, 8185, 16)
(None, 16)
(None, 16)
(None, 1)


In [5]:
model = keras.Model(inputs=inputs, outputs=dense2)
print(model.summary())
model.compile( optimizer='adam', loss=keras.losses.BinaryCrossentropy(from_logits=True),
             metrics=['accuracy'])

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 8185)]            0         
_________________________________________________________________
embedding (Embedding)        (None, 8185, 16)          130960    
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 16)                272       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 131,249
Trainable params: 131,249
Non-trainable params: 0
_________________________________________________________________
None


In [6]:
log_dir = "/home/manju/code/ML/src/manju_tensorflow_examples/logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + 'word_embedding_basic'
tb_callback = keras.callbacks.TensorBoard(log_dir=log_dir)
#model.fit( train_batches, epochs = 10, validation_data=test_batches, validation_steps = 20, callbacks=[tb_callback])
model.fit( train_batches, epochs = 10, validation_data=test_batches, validation_steps = 20)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fc4b470e9b0>

In [10]:
test_loss, test_acc = model.evaluate(test_batches)
print (test_loss)
print (test_acc)

0.47871050238609314
0.8570799827575684


In [7]:
e = model.layers[1]
weights = e.get_weights()[0]

In [8]:
import io

encoder = info.features['text'].encoder

#out_v = io.open('vecs.tsv', 'w', encoding='utf-8')
#out_m = io.open('meta.tsv', 'w', encoding='utf-8')

#for num, word in enumerate(encoder.subwords):
#  vec = weights[num+1] # skip 0, it's padding.
#  out_m.write(word + "\n")
#  out_v.write('\t'.join([str(x) for x in vec]) + "\n")
#out_v.close()
#out_m.close()