In [None]:
%config IPCompleter.greedy=True

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow_datasets as tfds
import numpy as np

# Load the IMDB reviews dataset as supervised (text, label) pairs, together
# with the dataset metadata object.
imdb, info = tfds.load("imdb_reviews", with_info=True, as_supervised=True)
train_data, test_data = imdb['train'], imdb['test']

# NOTE: the names `traning_sentences` (sic) and `testing_sequences` are kept
# as-is because later cells refer to them; both hold raw review text.
traning_sentences = []
training_labels = []

testing_sequences = []
testing_labels = []

# The text tensor's .numpy() value is a bytes object. Decode it to real text:
# calling str() on bytes yields the repr "b'...'" (leading b', escaped
# characters), which would pollute the vocabulary built from these strings.
for text, label in train_data:
  traning_sentences.append(text.numpy().decode('utf8'))
  training_labels.append(label.numpy())

for text, label in test_data:
  testing_sequences.append(text.numpy().decode('utf8'))
  testing_labels.append(label.numpy())

# Keras expects array-like labels, so convert the Python lists to NumPy arrays.
training_labels_final = np.array(training_labels)
testing_labels_final = np.array(testing_labels)

In [None]:
# Display the second collected test review (index 1) as a quick sanity check.
# Despite its name, `testing_sequences` holds the raw review strings appended
# in the loading cell, not tokenised sequences.
testing_sequences[1]

In [None]:
# Hyperparameters shared by the tokenizer, the padding step and the model:
#   vocab_size    - passed to Tokenizer(num_words=...) and as the Embedding input size
#   embedding_dim - width of each embedding vector
#   max_length    - length every sequence is padded/truncated to
#   oov_token     - placeholder the tokenizer uses for out-of-vocabulary words
#   trunc_type    - which end of an over-long sequence pad_sequences cuts off
vocab_size, embedding_dim, max_length = 10000, 16, 120
oov_token, trunc_type = "<OOV>", 'post'


# Fit the vocabulary on the training text only, so words that appear only in
# the test set are mapped to the OOV token.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token)
tokenizer.fit_on_texts(traning_sentences)
word_index = tokenizer.word_index

# Convert the training text to integer sequences and pad/truncate each one to
# max_length.
sequences = tokenizer.texts_to_sequences(traning_sentences)
padded = pad_sequences(sequences, maxlen=max_length, truncating=trunc_type)

# Apply the same truncation side to the test set. Without `truncating`,
# pad_sequences falls back to its default ('pre'), so long test reviews would
# keep a different end of the text than the training reviews do.
test_sequences = tokenizer.texts_to_sequences(testing_sequences)
test_padded = pad_sequences(test_sequences, maxlen=max_length, truncating=trunc_type)

# Small sentiment classifier: embed each token, average the embeddings over
# the sequence, then feed a tiny dense head ending in a single sigmoid unit.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(6, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train for 4 epochs, evaluating on the padded test set after each epoch.
model.fit(
    padded,
    training_labels_final,
    epochs=4,
    validation_data=(test_padded, testing_labels_final),
)

In [None]:
import io

# Export the learned word embeddings as two tab-separated files:
#   vecs.tsv - one embedding vector per line
#   meta.tsv - the word for the vector on the same line number
weights = model.layers[0].get_weights()[0]  # first weight array of the first (Embedding) layer
reverse_word_index = {index: word for word, index in word_index.items()}

# Tokenizer word indices start at 1, so row 0 of the matrix is skipped. The
# upper bound is capped by the number of known words so a vocabulary smaller
# than vocab_size does not raise KeyError on the reverse lookup.
last_word_num = min(vocab_size, len(reverse_word_index) + 1)

# `with` guarantees both files are flushed and closed even if a write fails.
with io.open('vecs.tsv', 'w', encoding='utf-8') as out_v, \
        io.open('meta.tsv', 'w', encoding='utf-8') as out_m:
    for word_num in range(1, last_word_num):
        word = reverse_word_index[word_num]
        embeddings = weights[word_num]
        out_m.write(word + '\n')
        out_v.write('\t'.join([str(x) for x in embeddings]) + '\n')

In [None]:
def convert_text_to_padded_sequences(text):
    """Turn one raw text string into a padded integer sequence batch.

    The text is wrapped in a single-element list, tokenised with the
    notebook-level ``tokenizer``, then padded/truncated to ``max_length``
    using ``trunc_type``.

    Args:
        text: The raw text to convert.

    Returns:
        The result of ``pad_sequences`` for the single tokenised text.
    """
    return pad_sequences(
        tokenizer.texts_to_sequences([text]),
        maxlen=max_length,
        truncating=trunc_type,
    )

# Score two hand-written reviews with the trained model. The final layer is a
# sigmoid, so each prediction lies between 0 and 1.
# NOTE(review): presumably values near 1 mean "positive" - confirm against the
# dataset's label convention.
test_data1, test_data2 = (
    convert_text_to_padded_sequences(review)
    for review in ('This movie is so boring', 'The film is wonderful')
)
predict1, predict2 = model.predict(test_data1), model.predict(test_data2)
print(predict1, predict2, sep='\n')