<a href="https://colab.research.google.com/github/philmorrison/resources/blob/master/IMDB_Reviews_Sentiment_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import urllib

import tensorflow as tf
import tensorflow_datasets as tfds

from tensorflow.keras import preprocessing
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential

from tensorflow.keras.layers import Dense, Flatten, Dropout
from tensorflow.keras.layers import Conv2D, MaxPooling2D
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, RNN, GRU, LSTM, Bidirectional 

In [None]:
(train_ds, test_ds), info = tfds.load(name="imdb_reviews",     # imdb_reviews
                 with_info=True,
                 split=['train', 'test'],  # which sets to get
                 as_supervised=True)  # ask for supervised tuples

In [None]:
info

tfds.core.DatasetInfo(
    name='imdb_reviews',
    version=1.0.0,
    description='Large Movie Review Dataset.
This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well.',
    homepage='http://ai.stanford.edu/~amaas/data/sentiment/',
    features=FeaturesDict({
        'label': ClassLabel(shape=(), dtype=tf.int64, num_classes=2),
        'text': Text(shape=(), dtype=tf.string),
    }),
    total_num_examples=100000,
    splits={
        'test': 25000,
        'train': 25000,
        'unsupervised': 50000,
    },
    supervised_keys=('text', 'label'),
    citation="""@InProceedings{maas-EtAl:2011:ACL-HLT2011,
      author    = {Maas, Andrew L.  and  Daly, Raymond E.  and  Pham, Peter T.  and  Huang, Dan  and  Ng, Andrew Y.  and  Potts, Christopher},
      title     = {Learning Word

In [None]:
print(type(train_ds))
print(type(test_ds))

<class 'tensorflow.python.data.ops.dataset_ops.DatasetV1Adapter'>
<class 'tensorflow.python.data.ops.dataset_ops.DatasetV1Adapter'>


In [None]:
texts = []
labels = []

test_texts = []
test_labels = []

for text, label in train_ds.take(25000):
  # print(str(text.numpy())[1:])
  # print(label.numpy())
  review = str(text.numpy())[1:]
  y = label.numpy()
  texts.append(review)
  labels.append(y)

for text, label in test_ds.take(10000):
  # print(str(text.numpy())[1:])
  # print(label.numpy())
  review = str(text.numpy())[1:]
  y = label.numpy()
  test_texts.append(review)
  test_labels.append(y)


In [None]:
print(len(texts))
print(len(labels))

25000
25000


In [None]:
texts[0]

'"This was an absolutely terrible movie. Don\'t be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie\'s ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor\'s like Christopher Walken\'s good name. I could barely sit through it."'

In [None]:
## Create Word Index and Reverse Index
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words = 10000, oov_token = '<UNK>')  # takes into account top 10000 words only
tokenizer.fit_on_texts(texts)          # build the word index
word2index = tokenizer.word_index        # recover the word index
index2word = {value: key for key, value in word2index.items()  } # create reverse index

print(word2index)
print(index2word)
print(len(word2index))

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:
## Fit tokenizer to documents
train_sequences = tokenizer.texts_to_sequences(texts) # tokenize sentences into their index numbers
train_sequences[0]

test_sequences = tokenizer.texts_to_sequences(test_texts) 

In [None]:
## Use padding to ensure all sequences are the same length
from tensorflow.keras.preprocessing import sequence

maxlen = 300           # cut off for num tokens in each example
x_train = sequence.pad_sequences(train_sequences,
                                 padding='post', # add padding to the end, 'pre' for before
                                 truncating='post', # remove values from sequences larger than maxlen, either at the beginning 'pre' or at the end 'post' of the sequences
                                 maxlen=maxlen) # turn lists of integers into tensor of shape(samples, maxlen)
x_test = sequence.pad_sequences(test_sequences, maxlen=maxlen) # turn lists of integers into tensor of shape(samples, maxlen)
print(x_train.shape)
print(x_test.shape)
print(x_train[0])

(25000, 300)
(10000, 300)
[  12   14   35  438  399   18  173   29    1    9   33 1376 3399   42
  496    1  196   25   87  155   19   12  210  339   29   69  247  212
    9  486   61   69   87  115   98   24 5741   12 3315  658  776   12
   18    7   35  405 8229  177 2476  425    2   91 1251  139   71  148
   55    2    1 7526   71  228   69 2960   16    1 2880    1    1 1505
 4997    3   40 3949  118 1606   17 3399   14  162   19    4 1251  926
 7988    9    4   18   13   14 4199    5  101  147 1235   11  239  693
   13   45   25  100   39   12 7235    1   39 1376    1   52  408   11
   98 1212  873  144   10    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0 

In [None]:
labels = np.array(labels)
test_labels = np.array(test_labels)

In [None]:
labels

array([0, 0, 0, ..., 0, 0, 1])

In [None]:
model = Sequential()
model.add(Embedding(10000, 64, input_length = maxlen))
model.add(GRU(64, recurrent_dropout=0.2))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.summary()


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 300, 64)           640000    
_________________________________________________________________
gru (GRU)                    (None, 64)                24960     
_________________________________________________________________
dense (Dense)                (None, 32)                2080      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 667,073
Trainable params: 667,073
Non-trainable params: 0
_________________________________________________________________


In [None]:
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['acc'])

model.fit(x_train, labels,
          epochs=30,
          batch_size=128,
          verbose=1,
          validation_split=0.2)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7fca7d940e10>

In [None]:
model.evaluate(x_test, test_labels)



[0.9705485105514526, 0.7814000248908997]