<a href="https://colab.research.google.com/github/mariomeissner/nlp_class/blob/master/keras_text_classification_task.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import keras
from keras.preprocessing.sequence import pad_sequences
import numpy as np

In [0]:
imdb = keras.datasets.imdb

# Load both splits, keeping only the 10000 most frequent words
train_split, test_split = imdb.load_data(num_words=10000)
train_data, train_labels = train_split
test_data, test_labels = test_split

# Insert a new axis at position 1 of each label array
train_labels = train_labels[:, np.newaxis]
test_labels = test_labels[:, np.newaxis]

In [0]:
# Report how many reviews and how many labels were loaded
n_reviews, n_labels = len(train_data), len(train_labels)
print("Training entries: {}, labels: {}".format(n_reviews, n_labels))

In [0]:
# Peek at the first training review as it is stored in the dataset
first_review = train_data[0]
print(first_review)

In [0]:
# Reviews differ in length: compare the first two
tuple(len(train_data[i]) for i in (0, 1))

In [0]:
# Lookup table from word to integer index
word_index = imdb.get_word_index()

# The same table inverted: integer index -> word
reverse_word_index = {index: word for word, index in word_index.items()}

def decode_review(text):
  """Turn a sequence of integer word ids into a space-separated string.

  Each id is shifted down by 3 before the lookup because the first 3 indices
  are reserved characters. Ids with no entry in ``reverse_word_index`` are
  rendered as '?'.
  """
  words = []
  for token_id in text:
    words.append(reverse_word_index.get(token_id - 3, '?'))
  return ' '.join(words)

In [0]:
# Show the first training review as words instead of integer ids
decode_review(text=train_data[0])

In [0]:
# TODO: Convert every review sequence into a 10,000-sized vector of 1's and 0's
# (presumably a 1 at each word index that appears in the review -- confirm with
# the task description). Store the results in the two names below.

encoded_train = encoded_test = None

In [0]:
# Input size for the model: the vocabulary count used for the movie reviews
# (10,000 words).
vocab_size = 10000

# TODO: Build a keras model and assign it to `model`.
# NOTE(review): while `model` is still None, the `model.summary()` call below
# raises AttributeError, so this cell only runs once the TODO is completed.
model = None

# Called on whatever `model` holds; on a Keras model this shows its summary.
model.summary()

In [0]:
# Training configuration: optimizer, loss function and the metric to track
compile_kwargs = {
    'optimizer': 'adam',
    'loss': 'binary_crossentropy',
    'metrics': ['accuracy'],
}
model.compile(**compile_kwargs)

In [0]:
# TODO: Fit the model and keep the outcome in `history`
# (presumably the object returned by the fit call -- confirm):
#   - Use the last 5000 train samples for validation.
#   - Run 7 epochs, with a batch size of 256.
# NOTE(review): later cells read `history.history`, so `history` must not stay None.

history = None

In [0]:
# Score the model on the encoded test reviews and show the returned values
results = model.evaluate(x=encoded_test, y=test_labels)
print(results)

In [0]:
# Take the `history` attribute of the object stored by the fit cell; the plotting
# cells below index it by string keys such as 'loss' and 'val_loss'.
history_dict = history.history
# Last expression of the cell: displays which keys were recorded.
history_dict.keys()

In [0]:
import matplotlib.pyplot as plt

# The accuracy metric is logged under 'acc' by older Keras releases and under
# 'accuracy' by newer ones (when compiled with metrics=['accuracy']). Pick
# whichever key is actually present instead of hard-coding one, so this cell
# does not fail with a KeyError depending on the installed version.
acc_key = 'acc' if 'acc' in history_dict else 'accuracy'

acc = history_dict[acc_key]
val_acc = history_dict['val_' + acc_key]
loss = history_dict['loss']
val_loss = history_dict['val_loss']

# One x position per recorded epoch, numbered from 1
epochs = range(1, len(acc) + 1)

# "bo" is for "blue dot"
plt.plot(epochs, loss, 'bo', label='Training loss')
# b is for "solid blue line"
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()

In [0]:
# Clear the current figure before drawing the accuracy curves
plt.clf()

# Draw on the current axes: dots for training, solid line for validation
ax = plt.gca()
ax.plot(epochs, acc, 'bo', label='Training acc')
ax.plot(epochs, val_acc, 'b', label='Validation acc')
ax.set_title('Training and validation accuracy')
ax.set_xlabel('Epochs')
ax.set_ylabel('Accuracy')
ax.legend()

plt.show()