<a href="https://colab.research.google.com/github/mrtzcardo/IMBD-Review-Binary-Classification/blob/main/IMBD_Review_Binary_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
'''IMDB dataset: a set of 50,000 highly polarized reviews from the
Internet Movie Database.
The reviews are split into a training set and a test set, each
consisting of 50% negative and 50% positive reviews.
The reviews (sequences of words) have been turned into
sequences of integers, where each integer stands for a specific word in a dictionary.'''

In [None]:
from keras.datasets import imdb
from keras import models
from keras import layers
import numpy as np
import matplotlib.pyplot as plt

Pad the lists so that they all have the same length, turn them into an integer
tensor of shape (samples, word_indices), and then use as the first layer in
the network a layer capable of handling such integer tensors (the Embedding
layer).

One-hot encode the lists to turn them into vectors of 0s and 1s. This would
mean, for instance, turning the sequence [3, 5] into a 10,000-dimensional vector
that would be all 0s except for indices 3 and 5, which would be 1s. The first layer in the network could then be a Dense layer, capable of handling floating-point vector data.

In [None]:
def vectorize_sequences(sequences, dimension=10000):
  """Multi-hot encode integer sequences into a 2-D matrix of 0s and 1s.

  Args:
    sequences: a sized iterable of integer index lists, one per sample.
    dimension: width of each encoded row; every index must be smaller
      than this value.

  Returns:
    A NumPy array of shape (len(sequences), dimension) in which row ``r``
    holds 1.0 at every index listed in ``sequences[r]`` and 0.0 elsewhere.
    Repeated indices within a sequence still yield a single 1.0.
  """
  n_samples = len(sequences)
  encoded = np.zeros((n_samples, dimension))  # start from an all-zero matrix
  for row, word_ids in enumerate(sequences):
    # Fancy indexing switches on every listed column of this row at once.
    encoded[row, word_ids] = 1.
  return encoded

In [None]:

# num_words=10000 keeps only the 10,000 most frequently occurring words in
# the training data. Rare words are discarded, which keeps the vector data
# at a manageable size.
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=10000)

In [None]:
# Peek at the first training sample: the integer-encoded review on one line,
# followed by its label on the next.
print(train_data[0], train_labels[0], sep='\n')

In [None]:
# Because the vocabulary is restricted to the 10,000 most frequent words,
# no word index will exceed 10,000. The largest index found is the cell output.
max(max(sequence) for sequence in train_data)

In [None]:
# For kicks, decode one of the reviews back to English words.

# word_index maps each word to its integer index.
word_index = imdb.get_word_index()
# Invert it so that an integer index can be looked up as a word.
reverse_word_index = {value: key for (key, value) in word_index.items()}
# The encoded reviews are offset by 3 (per the Keras IMDB documentation,
# indices 0, 1 and 2 are reserved for "padding", "start of sequence" and
# "unknown"), so subtract 3 before the lookup. Anything unmapped becomes '?'.
decoded_review = ' '.join(
    reverse_word_index.get(i - 3, '?') for i in train_data[0])
# Make the decoded text the cell output; previously it was computed but
# never shown.
decoded_review

In [None]:
# Encode the training and test reviews as fixed-width vectors of 0s and 1s,
# then show the first encoded training sample.
x_train, x_test = map(vectorize_sequences, (train_data, test_data))
print(x_train[0])

In [None]:
# Convert the labels to float32 arrays, then show the first training label.
y_train = np.array(train_labels, dtype='float32')
y_test = np.array(test_labels, dtype='float32')
print(y_train[0])

In [None]:
# Three-layer fully connected network: two 16-unit ReLU layers over the
# 10,000-dimensional input, then a single sigmoid unit as the output.
model = models.Sequential([
    layers.Dense(16, activation='relu', input_shape=(10000,)),
    layers.Dense(16, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])

In [None]:
# Train with RMSprop on mean squared error and track accuracy under the
# name 'acc'. Alternative loss to try: loss='binary_crossentropy'.
model.compile(
    optimizer='rmsprop',
    loss='mse',
    metrics=['acc'],
)

In [None]:
# Hold out the first 10,000 training samples (and their labels) for
# validation; the remainder is what the model is actually fitted on.
x_val, partial_x_train = x_train[:10000], x_train[10000:]
y_val, partial_y_train = y_train[:10000], y_train[10000:]

In [None]:
# Fit for 20 epochs in mini-batches of 512, evaluating on the held-out
# validation samples after each epoch. The returned object is kept in
# `history` for the plots further down.
history = model.fit(
    partial_x_train,
    partial_y_train,
    validation_data=(x_val, y_val),
    batch_size=512,
    epochs=20,
)

In [None]:
# history.history is a dict of the values recorded during fit(); print its
# keys first, then its full contents.
history_dict = history.history
print(history_dict.keys(), history_dict.items(), sep='\n')

In [None]:
# Plot training vs. validation loss per epoch.
history_dict = history.history
loss_values = history_dict['loss']
val_loss_values = history_dict['val_loss']
# Derive the x-axis from the recorded history instead of hard-coding 20, so
# the plot still works if the number of epochs passed to fit() changes.
epochs = range(1, len(loss_values) + 1)
plt.plot(epochs, loss_values, 'bo', label='Training loss')        # blue dots
plt.plot(epochs, val_loss_values, 'b', label='Validation loss')   # solid blue line
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
# Plot training vs. validation accuracy per epoch.
plt.clf()  # clear the current figure before drawing the accuracy curves
# The metric is recorded as 'acc' or 'accuracy' depending on the name given
# to compile(), so accept either key.
acc_key = 'acc' if 'acc' in history_dict else 'accuracy'
acc_values = history_dict[acc_key]
val_acc_values = history_dict['val_' + acc_key]
# Derive the x-axis from the recorded history rather than a fixed epoch count.
epochs = range(1, len(acc_values) + 1)
plt.plot(epochs, acc_values, 'bo', label='Training acc')        # blue dots
plt.plot(epochs, val_acc_values, 'b', label='Validation acc')   # solid blue line
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')  # was mislabelled 'Loss'
plt.legend()
plt.show()

In [None]:
# Score the trained model on the test inputs and labels; the returned
# value is kept in `results`.
results = model.evaluate(x=x_test, y=y_test)

In [None]:
# Show the value returned by model.evaluate() on the test set
# (presumably the loss followed by the 'acc' metric; confirm against the output).
print(results)

In [None]:
# Run the model on the test inputs; the predictions are the cell output.
# The final layer is a sigmoid, so each prediction lies between 0 and 1.
model.predict(x_test)