**// Importing necessary libraries**

In [1]:
from keras.datasets import imdb
from keras import models
from keras import layers
from keras import optimizers
from keras import losses
from keras import metrics

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

Using TensorFlow backend.


In [2]:
# Load the IMDB reviews as integer sequences, restricting the vocabulary to the
# 10,000 most frequent words.
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(
    num_words = 10000
)

Decoding review
----

In [4]:
# word_index maps word -> integer index; invert it so an index can be looked up.
word_index = imdb.get_word_index()
reverse_word_index = {index: word for word, index in word_index.items()}

# Indices stored in the reviews are offset by 3 relative to word_index, so
# subtract 3 before the lookup; anything without a match is rendered as '?'.
decoded_review = ' '.join(
    reverse_word_index.get(token - 3, '?') for token in train_data[0]
)

**Explanation of decoding:** 

    When you call imdb.load_data without overriding its defaults, the word indices stored in the data are shifted by 3. That is why you use i-3. get_word_index maps every word to its frequency rank, and those ranks start AT ONE. The shift by 3 exists because the indices 0, 1 and 2 are reserved for padding, start of sequence and unknown word respectively. Hence, an index i in train_data[0] corresponds to the word stored under (i-3) in reverse_word_index; reserved indices have no entry there and are decoded as '?'.

### Encoding the integer sequences into a binary matrix

In [39]:
def vectorize_sequences(sequences, dimension = 10000):
    """Multi-hot encode integer sequences into a 2-D matrix.

    Args:
        sequences: Sized iterable of integer index sequences, one per sample.
        dimension: Number of columns of the output matrix; every index in a
            sequence is used as a column position, so it must be smaller
            than ``dimension``.

    Returns:
        A NumPy array of shape ``(len(sequences), dimension)`` that holds 1.0
        in row ``r`` at every column listed in the ``r``-th sequence and 0.0
        everywhere else. Repeated indices still yield a single 1.0.
    """
    n_rows = len(sequences)
    encoded = np.zeros((n_rows, dimension))
    for row_idx, word_indices in enumerate(sequences):
        # Fancy indexing: switch on all listed columns of this row at once.
        encoded[row_idx, word_indices] = 1.
    return encoded

# Turn both review splits into multi-hot matrices.
x_train, x_test = (vectorize_sequences(split) for split in (train_data, test_data))

# Labels become float32 vectors.
y_train = np.array(train_labels, dtype = 'float32')
y_test = np.array(test_labels, dtype = 'float32')

---

Explanation of function:
 - Create a matrix where each row represents one sequence from train_data. 
 - Size of the row is 10000 because we are restricted to 10000 most frequent words.
 - Make the default values zero. 
 - For each sequence, put a 1 in every column whose index appears in that sequence (for example columns 1, 14 and 16). 
 - Do this for each sentence in the train_data and you will get one-hot-encoding.

In [48]:
# Fancy-indexing demo: read columns 2 and 4 of row 0 from a fresh all-zeros
# matrix with one row per training sample.
results = np.zeros(shape = (len(train_data), 10000))
results[0, [2, 4]]

array([0., 0.])

In [45]:
# Peek at the first encoded review only. Iterating over a one-element slice
# avoids walking the whole dataset just to print a single entry, and prints
# nothing if the dataset is empty.
for i, sequence in enumerate(train_data[:1]):
    print(i, sequence)

0 [1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]


---

### The model definition

In [4]:
# Two hidden layers of 16 ReLU units followed by a single sigmoid output unit.
# The input width of 10000 matches the default `dimension` of vectorize_sequences.
model = models.Sequential([
    layers.Dense(16, activation = 'relu', input_shape = (10000,)),
    layers.Dense(16, activation = 'relu'),
    layers.Dense(1, activation = 'sigmoid'),
])

### Using custom losses and metrics

In [5]:
# Pass the optimizer, loss and metric as objects/functions rather than strings
# so that their parameters (here the RMSprop learning rate) can be set explicitly.
model.compile(
    optimizer = optimizers.RMSprop(lr = 0.01),
    loss = losses.binary_crossentropy,
    metrics = [metrics.binary_accuracy],
)

### Setting aside a validation set

In [6]:
# Hold out the first 10,000 training samples for validation and train on the
# remainder. (This 10,000 is a sample count, unrelated to the vocabulary size.)
x_val, partial_x_train = x_train[:10000], x_train[10000:]
y_val, partial_y_train = y_train[:10000], y_train[10000:]

### Training your model 

In [None]:
# Here he used model.compile() with its defaults

In [7]:
# Train for 20 epochs in mini-batches of 512 samples, reporting loss and
# accuracy on the held-out validation split alongside the training metrics.
history = model.fit(
    x = partial_x_train,
    y = partial_y_train,
    batch_size = 512,
    epochs = 20,
    validation_data = (x_val, y_val),
)

Train on 15000 samples, validate on 10000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


### Plotting the training and validation loss

When you press the H key in markdown mode you get something interesting

In [8]:
# The object returned by fit() keeps the logged per-epoch values in its
# `history` attribute; list which series are available.
history_dict = history.history
history_dict.keys()

dict_keys(['val_loss', 'val_binary_accuracy', 'loss', 'binary_accuracy'])

In [None]:
# Pull the per-epoch series out of the training history.
loss_values = history_dict['loss']
val_loss_values = history_dict['val_loss']
binary_accuracy_values = history_dict['binary_accuracy']

# One x position per recorded epoch, numbered from 1.
epochs = range(1, len(binary_accuracy_values) + 1)

# Draw both curves on a single explicit Axes: blue dots for the training loss,
# a solid blue line for the validation loss.
fig, ax = plt.subplots()
ax.plot(epochs, loss_values, 'bo', label = "Training loss")
ax.plot(epochs, val_loss_values, 'b', label = "Validation loss")
ax.set_title('Training and validation loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.legend()

plt.show()