# Analyzing IMDB Data in Keras

###### importing IMDB dataset

In [31]:
import keras
from keras.datasets import imdb
from keras.preprocessing.text import Tokenizer
import numpy as np
from keras.models import Sequential
from keras.layers import Dense,Activation,Dropout
from keras import regularizers

##### setting the num of words to consider top 5000 frequent words in the reviews

In [3]:
(x_train,y_train),(x_test,y_test) = imdb.load_data(path='imdb.npz',
                                                   num_words=5000,
                                                   seed=1)

In [4]:
x_train.shape

(25000,)

In [5]:
np.max(np.max(x_train))

4987

In [6]:
y_train.shape

(25000,)

#### finding the avg length of each review

In [7]:
print ('Average num of words in each review: {}'.format(np.mean([len(review) for review in x_train])))

Average num of words in each review: 238.71364


Based on this, I think creating an input layer of 5000 should be ok. Maybe can be more efficiently done with about 500 input nodes for the input layer according to the average number of words in each review?

## Transforming the Dataset

one hot encode both the input and the output

### one hot encode input

In [8]:
tokenizer=Tokenizer(num_words=5000)
x_train = tokenizer.sequences_to_matrix(x_train, mode='binary')
x_test = tokenizer.sequences_to_matrix(x_test, mode='binary')
print (x_train)

[[ 0.  1.  1. ...,  0.  0.  0.]
 [ 0.  1.  1. ...,  0.  0.  0.]
 [ 0.  1.  1. ...,  0.  0.  0.]
 ..., 
 [ 0.  1.  1. ...,  0.  0.  0.]
 [ 0.  1.  1. ...,  0.  0.  0.]
 [ 0.  1.  1. ...,  0.  0.  0.]]


In [9]:
print (x_train.shape)

(25000, 5000)


### one hot encode output
output label = positive/negative

In [10]:
print(y_train)

[0 1 0 ..., 1 1 1]


In [11]:
print(y_train.shape)

(25000,)


In [12]:
y_train = keras.utils.to_categorical(y_train,num_classes=2)
y_test = keras.utils.to_categorical(y_test,num_classes=2)

In [13]:
print(y_train)

[[ 1.  0.]
 [ 0.  1.]
 [ 1.  0.]
 ..., 
 [ 0.  1.]
 [ 0.  1.]
 [ 0.  1.]]


In [14]:
print(y_train.shape)

(25000, 2)


## 4. Building the  model architecture
Build a model here using sequential. Feel free to experiment with different layers and sizes! Also, experiment adding dropout to reduce overfitting.

In [39]:
# TODO: Build the model architecture

model = Sequential()

#add hidden layer
model.add(Dense(64,input_shape=(5000,)))
model.add(Activation('relu'))
model.add(Dropout(0.4))

model.add(Dense(32))
model.add(Activation('relu'))
model.add(Dropout(0.4))

model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dropout(0.3))

#add output layer
model.add(Dense(2))
model.add(Activation('softmax'))

# TODO: Compile the model using a loss function and an optimizer.
model.compile(optimizer='SGD',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

## 5. Training the model
Run the model here. Experiment with different batch_size, and number of epochs!

In [40]:
# TODO: Run the model. Feel free to experiment with different batch sizes and number of epochs.
model.fit(x_train,y_train,epochs=10,batch_size=100)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1171adb70>

## 6. Evaluating the model
This will give you the accuracy of the model, as evaluated on the testing set. Can you get something over 85%?

In [36]:
score = model.evaluate(x_test, y_test, verbose=0)
print("Accuracy: ", score[1])

Accuracy:  0.5
