In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
from keras.datasets import imdb
from keras import layers
from keras import models
from keras import optimizers
from keras import losses
from keras import metrics

## `num_words` means we will only keep the top 10,000 most frequently occurring words in the training data. Rare words will be discarded. This allows us to work with vector data of manageable size.

In [None]:
# Load the IMDB reviews and labels as (train, test) pairs, keeping only the
# 10000 most frequent words of the vocabulary (num_words).
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words = 10000)

In [None]:
# Inspect the test labels returned by imdb.load_data.
test_labels

## Since we have restricted ourselves to the top 10,000 most frequent words, no word index will exceed 10,000

In [None]:
# Largest word index over all training reviews; a generator expression avoids
# building a temporary list of per-review maxima.
max(max(review) for review in train_data)

## Decoding the reviews back from integers

In [None]:
# Fetch the vocabulary mapping; its keys are words and its values are the
# integer indices used in the encoded reviews (it is inverted further below).
word_index = imdb.get_word_index()

In [None]:
# Preview a handful of (word -> index) entries instead of dumping the whole
# vocabulary dictionary into the cell output.
dict(list(word_index.items())[:10])

In [None]:
# Invert the vocabulary so that an integer index maps back to its word.
reversed_word_index = dict(zip(word_index.values(), word_index.keys()))

In [None]:
# Preview only the ten lowest indices and their words instead of dumping the
# whole inverted vocabulary into the cell output.
{idx: reversed_word_index[idx] for idx in sorted(reversed_word_index)[:10]}

In [None]:
# Decode the first training review back into text. The stored indices are
# shifted by 3 because 0, 1 and 2 are reserved for "padding", "start of
# sequence" and "unknown"; anything without a vocabulary entry becomes '?'.
first_review_words = (reversed_word_index.get(token - 3, '?') for token in train_data[0])
decoded_review = ' '.join(first_review_words)

In [None]:
# Show the decoded text of the first training review.
decoded_review

## Preparing the data

### We cannot feed lists of integers into a neural network. We have to turn our lists into tensors. There are two ways of doing that:

**1.** We could pad our lists so that they all have the same length, and turn them into an integer tensor of shape (samples, word_indices), then use as first layer in our network a layer capable of handling such integer tensors (the Embedding layer, which we will cover in detail later in the book).

**2.** We could one-hot-encode our lists to turn them into vectors of 0s and 1s. Concretely, this would mean for instance turning the sequence [3, 5] into a 10,000-dimensional vector that would be all-zeros except for indices 3 and 5, which would be ones. Then we could use as first layer in our network a Dense layer, capable of handling floating point vector data.
We will go with the latter solution. 

Let us vectorize our data, which we will do manually for maximum clarity:

## Encoding the training data integer sequences into a binary matrix (one-hot encoding)

In [None]:
def vectorize_sequences(sequences, dimension=10000, dtype=np.float64):
    """Multi-hot encode integer sequences into a dense 2-D matrix.

    Row ``i`` of the result holds a 1 in every column whose index occurs in
    ``sequences[i]`` and a 0 everywhere else. An index that is repeated
    within a sequence still produces a single 1, so word counts and word
    order are not preserved.

    Parameters
    ----------
    sequences : sized iterable of integer sequences
        One sequence of column indices per sample.
    dimension : int, optional
        Number of columns in the result (the vocabulary size).
    dtype : data-type, optional
        Element type of the result. The default, ``np.float64``, matches
        what ``np.zeros`` produces when no dtype is given; pass
        ``np.float32`` (or ``'float32'``) to halve the memory footprint of
        a large matrix.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(sequences), dimension)`` containing 0s and 1s.

    Raises
    ------
    IndexError
        If a sequence contains an index that is out of bounds for
        ``dimension`` (negative indices follow NumPy's wrap-around rule).
    """
    results = np.zeros((len(sequences), dimension), dtype=dtype)
    for row, indices in enumerate(sequences):
        # Indexing with the whole sequence sets every listed column of this
        # row in a single assignment.
        results[row, indices] = 1

    return results

In [None]:
# Vectorize the train and test datasets. Each call allocates a dense
# (number of reviews, 10000) matrix, so this cell is memory-hungry.
x_train = vectorize_sequences(train_data)
x_test = vectorize_sequences(test_data)

In [None]:
# Shape of the encoded training matrix: (number of reviews, dimension).
x_train.shape

In [None]:
# First encoded review: a single row of 0s and 1s.
x_train[0]

In [None]:
# Display the encoded training matrix.
x_train

## Vectorizing our labels and converting them to float32

In [None]:
# Copy the labels into float32 NumPy arrays so they match the floating-point
# output of the network.
y_train = np.array(train_labels, dtype='float32')
y_test = np.array(test_labels, dtype='float32')

In [None]:
# Training labels as loaded, for comparison with the float32 copy.
train_labels

In [None]:
# The same training labels after conversion to float32.
y_train

# Architecture of the Model

## There are two key architecture decisions to be made about such stack of dense layers:

How many layers to use.

How many "hidden units" to choose for each layer.

In the next chapter, you will learn formal principles to guide you in making these choices

For the time being, you will have to trust us with the following architecture choice: two intermediate layers with 16 hidden units each, and a third layer which will output the scalar prediction regarding the sentiment of the current review. The intermediate layers will use relu as their "activation function", and the final layer will use a sigmoid activation so as to output a probability (a score between 0 and 1, indicating how likely the sample is to have the target "1", i.e. how likely the review is to be positive). A relu (rectified linear unit) is a function meant to zero-out negative values, while a sigmoid "squashes" arbitrary values into the [0, 1] interval, thus outputting something that can be interpreted as a probability.

## Creating the Model Definition

In [None]:
# Two 16-unit relu hidden layers followed by a single sigmoid output unit;
# the input is a 10000-dimensional encoded review.
model = models.Sequential([
    layers.Dense(16, activation='relu', input_shape=(10000,)),
    layers.Dense(16, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])

# Compiling the model

## Lastly, we need to pick a loss function and an optimizer. 

Since we are facing a binary classification problem and the output of our network is a probability (we end our network with a single-unit layer with a sigmoid activation), it is best to use the binary_crossentropy loss. It isn't the only viable choice: you could use, for instance, mean_squared_error. 

But crossentropy is usually the best choice when you are dealing with models that output probabilities. Crossentropy is a quantity from the field of Information Theory, that measures the "distance" between probability distributions, or in our case, between the ground-truth distribution and our predictions.

In [None]:
# String shortcuts select the default RMSprop optimizer, the binary
# cross-entropy loss and the accuracy metric.
model.compile(
    optimizer="rmsprop",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

### Option 1 - Configuring the optimizer

In [None]:
# Alternative (left disabled): pass an optimizer instance to set the learning rate.
# Recent Keras versions spell the argument `learning_rate`; `lr` is the deprecated name.
# model.compile(optimizer=optimizers.RMSprop(learning_rate=0.01),
#              loss='binary_crossentropy',
#              metrics=['accuracy'])

### Option 2 - Using Custom losses and metrics

In [None]:
# Alternative (left disabled): pass loss and metric function objects instead of strings.
# Recent Keras versions spell the argument `learning_rate`; `lr` is the deprecated name.
# model.compile(optimizer=optimizers.RMSprop(learning_rate=0.01),
#              loss=losses.binary_crossentropy,
#              metrics=[metrics.binary_accuracy])

# Creating Validation Set

In order to monitor during training the accuracy of the model on data that it has never seen before, we will create a "validation set" by setting apart 10,000 samples from the original training data:

In [None]:
# Hold out the first 10000 samples for validation; train on the remainder.
x_val, partial_x_train = x_train[:10000], x_train[10000:]
y_val, partial_y_train = y_train[:10000], y_train[10000:]

In [None]:
# Confirm the size of the validation label vector.
y_val.shape

# Training Model

In [None]:
# Train for 20 epochs in mini-batches of 512, monitoring the held-out
# validation split; the returned History object is plotted below.
history = model.fit(
    partial_x_train,
    partial_y_train,
    epochs=20,
    batch_size=512,
    validation_data=(x_val, y_val),
)

In [None]:
from keras import backend

# Reset Keras' global state (layer name counters, cached graph/session state)
# now that the first training run is finished. This replaces the previous
# numba `cuda.select_device(0); cuda.close()` sequence: tearing down the CUDA
# context from outside the deep-learning backend can leave the GPU unusable
# for the retraining cell later in this notebook, fails on CPU-only machines,
# and required numba as an extra dependency. The `history` object collected
# above is unaffected.
backend.clear_session()

In [None]:
# The recorded training metrics; the plotting helpers below read its
# 'loss' / 'val_loss' / accuracy entries.
history_dict = history.history

In [None]:
# List which metrics were recorded during training.
history_dict.keys()

In [None]:
def plot_loss(history, epochs, plt):
    """Plot the training loss, and the validation loss when recorded, per epoch.

    Parameters
    ----------
    history : object
        Object exposing a ``history`` dict (such as the value returned by
        ``model.fit``) with a ``'loss'`` entry and, optionally, a
        ``'val_loss'`` entry.
    epochs : int
        Number of epochs to draw; the x axis runs from 1 to ``epochs`` and
        must match the number of recorded values.
    plt : module or object
        Plotting interface providing ``plot``, ``title``, ``xlabel``,
        ``ylabel`` and ``legend`` (normally ``matplotlib.pyplot``).
    """
    history_dict = history.history
    loss_values = history_dict.get('loss')
    val_loss_values = history_dict.get('val_loss')

    epochs_to_plot = range(1, epochs + 1)
    plt.plot(epochs_to_plot, loss_values, 'bo', label='Training Loss')
    # A run without validation data (such as the final retraining in this
    # notebook) has no 'val_loss' entry; skip that curve instead of handing
    # None to plt.plot.
    if val_loss_values is not None:
        plt.plot(epochs_to_plot, val_loss_values, 'b', label='Validation Loss')
    plt.title("Training and Validation Loss")
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()

In [None]:
# 20 matches the epochs=20 used for the validation-monitored fit above.
plot_loss(history, 20, plt)

In [None]:
def plot_accuracy(history, epochs, plt):
    """Plot the training accuracy, and validation accuracy when recorded, per epoch.

    The accuracy metric is looked up under ``'accuracy'`` first, then under
    ``'acc'`` and ``'binary_accuracy'``, since the key depends on the Keras
    version and on how the metric was passed to ``compile``. The matching
    validation entry is the same key prefixed with ``'val_'``.

    Parameters
    ----------
    history : object
        Object exposing a ``history`` dict (such as the value returned by
        ``model.fit``).
    epochs : int
        Number of epochs to draw; the x axis runs from 1 to ``epochs`` and
        must match the number of recorded values.
    plt : module or object
        Plotting interface providing ``plot``, ``title``, ``xlabel``,
        ``ylabel`` and ``legend`` (normally ``matplotlib.pyplot``).

    Raises
    ------
    ValueError
        If none of the known accuracy keys is present in the history.
    """
    history_dict = history.history

    acc_values = None
    val_acc_values = None
    for key in ('accuracy', 'acc', 'binary_accuracy'):
        if key in history_dict:
            acc_values = history_dict[key]
            val_acc_values = history_dict.get('val_' + key)
            break
    if acc_values is None:
        raise ValueError(
            "no accuracy metric found in history; available keys: %s"
            % sorted(history_dict)
        )

    epochs_to_plot = range(1, epochs + 1)
    plt.plot(epochs_to_plot, acc_values, 'bo', label='Training Accuracy')
    # A run without validation data records no validation accuracy; skip that
    # curve instead of handing None to plt.plot.
    if val_acc_values is not None:
        plt.plot(epochs_to_plot, val_acc_values, 'b', label='Validation Accuracy')
    plt.title("Training and Validation Accuracy")
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()

In [None]:
# 20 matches the epochs=20 used for the validation-monitored fit above.
plot_accuracy(history, 20, plt)

## As we can see, the model is overfitting, so let us train a fresh model for 4 epochs and calculate the final result on the test data set

In [None]:
# Rebuild the same three-layer network so that no weights carry over from the
# 20-epoch run above.
model = models.Sequential([
    layers.Dense(16, activation='relu', input_shape=(10000,)),
    layers.Dense(16, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])

model.compile(
    optimizer="rmsprop",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

# Note: this run uses the complete training set (no validation split is held
# out) and stops after 4 epochs.
history = model.fit(x_train, y_train, epochs=4, batch_size=512)

## Get result

In [None]:
# Evaluate the retrained model on the test set. With the compile settings used
# above this presumably yields [loss, accuracy] (confirm against the Keras docs).
results = model.evaluate(x_test, y_test)

In [None]:
# Display the evaluation results.
results

In [None]:
## About 87% accuracy on the test set

## Generate Predictions on new data

In [None]:
# The sigmoid output is one score between 0 and 1 per test review: how likely
# the review is to be positive.
model.predict(x_test)