# Analyzing IMDB Data in Keras

In [None]:
# Imports
import numpy as np
import keras
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.preprocessing.text import Tokenizer
import matplotlib.pyplot as plt
%matplotlib inline

np.random.seed(42)

## 1. Loading the data
This dataset comes preloaded with Keras, so one simple command will get us training and testing data. There is a parameter for how many words we want to look at. We've set it at 1000, but feel free to experiment.

In [2]:
# Loading the data (it's preloaded in Keras)
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=1000)

print(x_train.shape)
print(x_test.shape)

(25000,)
(25000,)


## 2. Examining the data
Notice that the data has been already pre-processed, where all the words have numbers, and the reviews come in as a vector with the words that the review contains. For example, if the word 'the' is the first one in our dictionary, and a review contains the word 'the', then there is a 1 in the corresponding vector.

The output comes as a vector of 1's and 0's, where 1 is a positive sentiment for the review, and 0 is negative.

In [3]:
print(x_train[0])
print(y_train[0])

[1, 14, 22, 16, 43, 530, 973, 2, 2, 65, 458, 2, 66, 2, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 2, 2, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2, 19, 14, 22, 4, 2, 2, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 2, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2, 2, 16, 480, 66, 2, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 2, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 2, 15, 256, 4, 2, 7, 2, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 2, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2, 56, 26, 141, 6, 194, 2, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 2, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 2, 88, 12, 16, 283, 5, 16, 2, 113, 103, 32, 15, 16, 2, 19, 178, 32]
1


## 3. One-hot encoding the output
Here, we'll turn the input vectors into (0,1)-vectors. For example, if the pre-processed vector contains the number 14, then in the processed vector, the 14th entry will be 1.

In [4]:
# One-hot encoding the output into vector mode, each of length 1000
tokenizer = Tokenizer(num_words=1000)
x_train = tokenizer.sequences_to_matrix(x_train, mode='binary')
x_test = tokenizer.sequences_to_matrix(x_test, mode='binary')
print(x_train.shape)

(25000, 1000)


And we'll also one-hot encode the output.

In [5]:
# One-hot encoding the output
num_classes = 2
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)
print(y_train.shape)
print(y_test.shape)
print(x_train,y_train)

(25000, 2)
(25000, 2)
[[0. 1. 1. ... 0. 0. 0.]
 [0. 1. 1. ... 0. 0. 0.]
 [0. 1. 1. ... 0. 0. 0.]
 ...
 [0. 1. 1. ... 0. 0. 0.]
 [0. 1. 1. ... 0. 0. 0.]
 [0. 1. 1. ... 0. 0. 0.]] [[0. 1.]
 [1. 0.]
 [1. 0.]
 ...
 [1. 0.]
 [0. 1.]
 [1. 0.]]


## 4. Building the  model architecture
Build a model here using sequential. Feel free to experiment with different layers and sizes! Also, experiment adding dropout to reduce overfitting.

In [None]:
tests = {}

In [None]:
# TODO: 3 layers, 2 hidden, baselayer @16
model16 = Sequential()
model16.add(Dense(16, activation='relu',  input_shape=(1000,)))
model16.add(Dropout(.1))
model16.add(Dense(8, activation='relu'))
model16.add(Dropout(.1))
model16.add(Dense(4, activation='relu'))
model16.add(Dropout(.1))
model16.add(Dense(2, activation='softmax'))

# TODO: Compile the model using a loss function and an optimizer.
model16.compile(loss = 'categorical_crossentropy', optimizer='nadam', metrics=['accuracy'])
model16.summary()
tests.update({'model16': model16})

In [None]:
# TODO: 3 layers, 2 hidden, baselayer @128
model128 = Sequential()
model128.add(Dense(128, activation='relu',  input_shape=(1000,)))
model128.add(Dropout(.1))
model128.add(Dense(32, activation='relu'))
model128.add(Dropout(.1))
model128.add(Dense(8, activation='relu'))
model128.add(Dropout(.1))
model128.add(Dense(2, activation='softmax'))

# TODO: Compile the model using a loss function and an optimizer.
model128.compile(loss = 'categorical_crossentropy', optimizer='nadam', metrics=['accuracy'])
model128.summary()
tests.update({'model128': model128})

In [None]:
# TODO: 2 layers, 1 hidden, baselayer @128
model128_b = Sequential()
model128_b.add(Dense(128, activation='relu',  input_shape=(1000,)))
model128_b.add(Dropout(.1))
model128_b.add(Dense(16, activation='relu'))
model128_b.add(Dropout(.1))
model128_b.add(Dense(2, activation='softmax'))

# TODO: Compile the model using a loss function and an optimizer.
model128_b.compile(loss = 'categorical_crossentropy', optimizer='nadam', metrics=['accuracy'])
model128_b.summary()
tests.update({'model128_b': model128_b})

In [None]:
# TODO: 2 layers, 1 hidden, baselayer @16
model16_b = Sequential()
model16_b.add(Dense(16, activation='relu',  input_shape=(1000,)))
model16_b.add(Dropout(.1))
model16_b.add(Dense(4, activation='relu'))
model16_b.add(Dropout(.1))
model16_b.add(Dense(2, activation='softmax'))

# TODO: Compile the model using a loss function and an optimizer.
model16_b.compile(loss = 'categorical_crossentropy', optimizer='nadam', metrics=['accuracy'])
model16_b.summary()
tests.update({'model16_b': model16_b})

In [None]:
epochs = (1,16,32,64,128)
batchsizes = (16,32,64,128)
runstr = [(z[0], x,y,z[1])for x in epochs for y in batchsizes for z in tests.items()]
print(runstr)

## 5. Training the model
Run the model here. Experiment with different batch_size, and number of epochs!

In [None]:
# TODO: Run the model. Feel free to experiment with different batch sizes and number of epochs.
results = []
import tensorflow as tf
with tf.device('/device:GPU:0'):
    for trials in runstr:
            trials[3].fit(x_train, y_train, epochs=trials[1], batch_size=trials[2], verbose=1);
            testscore = trials[3].evaluate(x_test, y_test, verbose=0)
            trainscore = trials[3].evaluate(x_train, y_train, verbose=0)
            runtitle = str(trials[0])+'_'+str(trials[1])+'_'+str(trials[2])
            results.append((runtitle,testscore,trainscore))
            #saving for use.
            trials[3].save(runtitle)

## 6. Evaluating the model
This will give you the accuracy of the model, as evaluated on the testing set. Can you get something over 85%?

In [8]:
from keras.models import load_model
import os

location = os.getcwd() + '\\savedmodels'
models = []
results = []

for files in os.listdir(location):
    try:
        models.append(str(files))
    except Exception as e:
        raise e
        print("No files found here!")
        
for model_ in models:
    try:
        model_loc = location + '\\' + model_
        evalmodel = load_model(model_loc)
        testscore = evalmodel.evaluate(x_test, y_test, verbose=1)[1]
        trainscore = evalmodel.evaluate(x_train, y_train, verbose=1)[1]
        runtitle = str(model_)
        results.append((runtitle,testscore,trainscore))
    except:
        results.append('N/A')
        
results






[('model128_128_128', 0.77568, 0.99976),
 ('model128_128_16', 0.79284, 0.99948),
 ('model128_128_32', 0.78716, 0.99972),
 ('model128_128_64', 0.77952, 0.99976),
 ('model128_16_128', 0.83964, 0.99996),
 ('model128_16_16', 0.8366, 0.99872),
 ('model128_16_32', 0.83788, 0.9998),
 ('model128_16_64', 0.83736, 0.99992),
 ('model128_1_128', 0.85048, 0.95844),
 ('model128_1_16', 0.8564, 0.87244),
 ('model128_1_32', 0.85588, 0.89756),
 ('model128_1_64', 0.85508, 0.92948),
 ('model128_20_128', 0.84324, 1.0),
 ('model128_20_16', 0.8388, 0.998),
 ('model128_20_32', 0.84012, 0.99976),
 ('model128_20_64', 0.84124, 0.99996),
 ('model128_32_128', 0.83632, 1.0),
 ('model128_32_16', 0.83144, 0.99796),
 ('model128_32_32', 0.83484, 0.99996),
 ('model128_32_64', 0.83504, 1.0),
 ('model128_64_128', 0.82076, 1.0),
 ('model128_64_16', 0.836, 0.99992),
 ('model128_64_32', 0.83252, 1.0),
 ('model128_64_64', 0.82692, 0.99996),
 ('model128_b_128_128', 0.82504, 1.0),
 ('model128_b_128_16', 0.83168, 1.0),
 ('model1

In [17]:
# Evaluating the model on the training and testing set
import csv
for scores in results:
    print(scores)

with open(location + '\\' +'IMDB_In_Keras-testing_epochs_layers_bSize.csv', 'r+t') as myfile:
    writer = csv.writer(myfile, lineterminator='\n')
    for rows in results:
        writer.writerow(x for x in rows)

('model128_128_128', 0.77568, 0.99976)
('model128_128_16', 0.79284, 0.99948)
('model128_128_32', 0.78716, 0.99972)
('model128_128_64', 0.77952, 0.99976)
('model128_16_128', 0.83964, 0.99996)
('model128_16_16', 0.8366, 0.99872)
('model128_16_32', 0.83788, 0.9998)
('model128_16_64', 0.83736, 0.99992)
('model128_1_128', 0.85048, 0.95844)
('model128_1_16', 0.8564, 0.87244)
('model128_1_32', 0.85588, 0.89756)
('model128_1_64', 0.85508, 0.92948)
('model128_20_128', 0.84324, 1.0)
('model128_20_16', 0.8388, 0.998)
('model128_20_32', 0.84012, 0.99976)
('model128_20_64', 0.84124, 0.99996)
('model128_32_128', 0.83632, 1.0)
('model128_32_16', 0.83144, 0.99796)
('model128_32_32', 0.83484, 0.99996)
('model128_32_64', 0.83504, 1.0)
('model128_64_128', 0.82076, 1.0)
('model128_64_16', 0.836, 0.99992)
('model128_64_32', 0.83252, 1.0)
('model128_64_64', 0.82692, 0.99996)
('model128_b_128_128', 0.82504, 1.0)
('model128_b_128_16', 0.83168, 1.0)
('model128_b_128_32', 0.82756, 1.0)
('model128_b_128_64', 0.8

#best results are for smaller epochs, as all our models like to overfit.

#>85%


* model16_b_1_32
* model128_b_1_32
* model128_b_1_64
* model16_b_1_128
* model128_1_16
* model128_b_1_16
* model128_1_32
* model128_1_64
* model16_b_1_64
* model16_1_128
* model16_1_64
* model16_b_1_16
* model16_1_32
* model128_b_1_128
* model16_1_16
* model128_1_128

In [18]:
# Next we try adjusting our dropout rates and activation functions (relu, tanh, selu, elu)