In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv("../../movie_data.csv", encoding = 'utf-8')

In [3]:
df.head()

Unnamed: 0,review,sentiment
0,"Election is a Chinese mob movie, or triads in ...",1
1,I was just watching a Forensic Files marathon ...,0
2,Police Story is a stunning series of set piece...,1
3,"Dear Readers,<br /><br />The final battle betw...",1
4,I have seen The Perfect Son about three times....,1


In [5]:
# Create a TensorFlow dataset of (review, sentiment) pairs.
# The columns are selected by name rather than popped from `df`:
#   * `df.pop` removes the column in place, so running this cell a second
#     time would fail with a KeyError;
#   * `df.values` would also pick up any other column present in the CSV
#     (e.g. a saved index column), mixing integers into the text feature.
# `df[['review']]` keeps the feature two-dimensional, so every dataset element
# still has shape (1,) and downstream code can keep indexing it with `[0]`.
target = df['sentiment']
ds_raw = tf.data.Dataset.from_tensor_slices((df[['review']].values, target.values))
ds_raw

<TensorSliceDataset shapes: ((1,), ()), types: (tf.string, tf.int64)>

In [26]:
# sanity check: for the first three examples, show the leading 50 bytes of
# the review text next to its sentiment label
for review, sentiment in ds_raw.take(3):
    tf.print(review.numpy()[0][:50], sentiment)

b'Election is a Chinese mob movie, or triads in this' 1
b'I was just watching a Forensic Files marathon on C' 0
b'Police Story is a stunning series of set pieces fo' 1


In [18]:
# Shuffle once with a fixed seed (no reshuffling between iterations), then
# carve the result into three parts: the first 25000 examples form the test
# set, the next 20000 the training set, and the remainder the validation set.
tf.random.set_seed(1)
n_test, n_train = 25000, 20000
ds_raw = ds_raw.shuffle(50000, reshuffle_each_iteration = False)
ds_raw_test = ds_raw.take(n_test)
ds_raw_train_valid = ds_raw.skip(n_test)
ds_raw_train, ds_raw_valid = (ds_raw_train_valid.take(n_train),
                              ds_raw_train_valid.skip(n_train))

In [19]:
# Tools for collecting the unique tokens: a tfds tokenizer to split the text
# and a Counter (from the collections package) to tally the tokens.
from collections import Counter

token_counts = Counter()
tokenizer = tfds.features.text.Tokenizer()

In [24]:
# tokenize every training review and add its tokens to the running tally
for review, _label in ds_raw_train:
    token_counts.update(tokenizer.tokenize(review.numpy()[0]))
# show vocab size
print(len(token_counts))

87397


In [26]:
# use the TokenTextEncoder class to create the token -> integer mapping
# from the collected tokens
encoder = tfds.features.text.TokenTextEncoder(token_counts)
# try it out on a short example text
example_str = 'This is example, YOU!'
example_ids = encoder.encode(example_str)
print(example_ids)

[104, 105, 24, 10193]


In [28]:
# define transformation function
def encode(text_tensor, label):
    """Turn one raw example into its integer-encoded form.

    The text is taken from element 0 of `text_tensor.numpy()` and encoded
    with the notebook-level `encoder`; `label` is passed through unchanged.
    Returns the pair (encoded text, label).
    """
    return encoder.encode(text_tensor.numpy()[0]), label

# Wrapper around `encode`: `tf.py_function` turns the Python function into
# a TensorFlow operator so that it can be passed to `Dataset.map`.
def encode_map_fn(text, label):
    """Call `encode` via `tf.py_function`, declaring two int64 outputs."""
    encoded = tf.py_function(encode,
                             inp = [text, label],
                             Tout = (tf.int64, tf.int64))
    return encoded

In [29]:
# integer-encode the text of all three splits with the same mapping function
ds_train, ds_valid, ds_test = (
    split.map(encode_map_fn)
    for split in (ds_raw_train, ds_raw_valid, ds_raw_test)
)

In [30]:
# sanity check: print the shape of five encoded training sequences
tf.random.set_seed(1)
for encoded_review, _label in ds_train.shuffle(1000).take(5):
    print('Sequence length: ', encoded_review.shape)

Sequence length:  (248,)
Sequence length:  (181,)
Sequence length:  (243,)
Sequence length:  (801,)
Sequence length:  (662,)


In [31]:
# Split each dataset into mini-batches of 32 examples. `padded_batch` pads
# the sequences within a batch to a common length ([-1]); the scalar labels
# ([]) are left as they are.
train_data, valid_data, test_data = (
    split.padded_batch(32, padded_shapes = ([-1], []))
    for split in (ds_train, ds_valid, ds_test)
)

### Feature Embedding

In [32]:
# create an embedding layer and wrap it in a model to inspect its shape
from tensorflow.keras.layers import Embedding

embedding_layer = Embedding(input_dim = 100,
                            output_dim = 6,
                            input_length = 20,
                            name = 'embed-layer')
model = tf.keras.Sequential([embedding_layer])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embed-layer (Embedding)      (None, 20, 6)             600       
Total params: 600
Trainable params: 600
Non-trainable params: 0
_________________________________________________________________


In [34]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense

# embedding -> two stacked SimpleRNN layers -> single-unit dense output;
# the first recurrent layer returns the full sequence so the second one
# receives a sequence as input
model = Sequential([
    Embedding(input_dim = 1000, output_dim = 32),
    SimpleRNN(32, return_sequences = True),
    SimpleRNN(32),
    Dense(1),
])
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 32)          32000     
_________________________________________________________________
simple_rnn (SimpleRNN)       (None, None, 32)          2080      
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (None, 32)                2080      
_________________________________________________________________
dense (Dense)                (None, 1)                 33        
Total params: 36,193
Trainable params: 36,193
Non-trainable params: 0
_________________________________________________________________


In [36]:
embedding_dim = 20
vocab_size = len(token_counts) + 2
tf.random.set_seed(1)
# Stack, in order: an embedding layer, a bidirectional LSTM, a dense layer
# with ReLU activation and a single-unit output layer with sigmoid activation.
bi_lstm_model = tf.keras.Sequential()
bi_lstm_model.add(
    tf.keras.layers.Embedding(input_dim = vocab_size,
                              output_dim = embedding_dim,
                              name = 'embed-layer'))
bi_lstm_model.add(
    tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(64, name = 'lstm-layer'),
        name = 'bidir-lstm'))
bi_lstm_model.add(tf.keras.layers.Dense(64, activation = 'relu'))
bi_lstm_model.add(tf.keras.layers.Dense(1, activation = 'sigmoid'))
bi_lstm_model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embed-layer (Embedding)      (None, None, 20)          1747980   
_________________________________________________________________
bidir-lstm (Bidirectional)   (None, 128)               43520     
_________________________________________________________________
dense_3 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 65        
Total params: 1,799,821
Trainable params: 1,799,821
Non-trainable params: 0
_________________________________________________________________


In [None]:
# compile and train
# will take a while!
optimizer = tf.keras.optimizers.Adam(1e-3)
# from_logits = False: the model's output layer already applies a sigmoid
loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits = False)
bi_lstm_model.compile(optimizer = optimizer,
                      loss = loss_fn,
                      metrics = ['accuracy'])
history = bi_lstm_model.fit(train_data,
                            validation_data = valid_data,
                            epochs = 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10

In [None]:
# evaluate on test data; index 1 of the result is the 'accuracy' metric
# passed to compile()
test_results = bi_lstm_model.evaluate(test_data)
test_accuracy_pct = test_results[1] * 100
print('Test accuracy: {:.2f}%'.format(test_accuracy_pct))

In [None]:
# how does it do on training data itself?
train_results = bi_lstm_model.evaluate(train_data)
# This number is computed on the training set, so label it accordingly
# (it was previously printed as "Test accuracy", which is misleading when
# compared against the real test score).
print('Train accuracy: {:.2f}%'.format(train_results[1]*100))

### Using SimpleRNN, focusing only on the last tokens

In [None]:
# define a helper function to simplify preprocessing
def preproces_datasets(ds_raw_train,
                      ds_raw_valid,
                      ds_raw_test,
                      max_seq_length = None,
                      batch_size = 32):
    """Tokenize, integer-encode and batch the three raw dataset splits.

    The vocabulary is built from `ds_raw_train` only and then used to encode
    all three splits. Each element of the raw datasets is indexed as
    `example[0].numpy()[0]` for the text, i.e. the text is expected in the
    first position of a length-1 feature tensor.

    Parameters
    ----------
    ds_raw_train, ds_raw_valid, ds_raw_test
        Raw (text, label) datasets for the three splits.
    max_seq_length : int or None
        If given, only the last `max_seq_length` tokens of every text are
        kept, both when collecting the vocabulary and when encoding.
        Must be >= 1. `None` keeps the full sequences.
    batch_size : int
        Number of examples per padded batch.

    Returns
    -------
    tuple
        `(train_data, valid_data, test_data, n_tokens)` where the first three
        are the padded-batch datasets and `n_tokens` is the number of distinct
        tokens collected from the training split.

    Raises
    ------
    ValueError
        If `max_seq_length` is given but smaller than 1.
    """
    # `seq[-0:]` keeps the whole sequence and a negative value would drop
    # leading tokens instead of keeping trailing ones, so reject both rather
    # than silently producing sequences of the wrong length.
    if max_seq_length is not None and max_seq_length < 1:
        raise ValueError(
            'max_seq_length must be None or a positive integer, got {!r}'
            .format(max_seq_length))

    # find unique tokens
    tokenizer = tfds.features.text.Tokenizer()
    token_counts = Counter()

    for example in ds_raw_train:
        # the method is `tokenize` (the previous `tokenizer.tokenizer(...)`
        # call raised AttributeError)
        tokens = tokenizer.tokenize(example[0].numpy()[0])
        if max_seq_length is not None:
            tokens = tokens[-max_seq_length:]
        token_counts.update(tokens)

    print('Vocab-size:', len(token_counts))

    # encode the text
    encoder = tfds.features.text.TokenTextEncoder(token_counts)
    def encode(text_tensor, label):
        text = text_tensor.numpy()[0]
        encoded_text = encoder.encode(text)
        if max_seq_length is not None:
            # keep only the trailing tokens, matching the vocabulary pass
            encoded_text = encoded_text[-max_seq_length:]
        return encoded_text, label

    def encode_map_fn(text, label):
        # wrap the Python function so it can be used inside Dataset.map
        return tf.py_function(encode, inp = [text, label],
                             Tout = (tf.int64, tf.int64))

    ds_train = ds_raw_train.map(encode_map_fn)
    ds_valid = ds_raw_valid.map(encode_map_fn)
    ds_test = ds_raw_test.map(encode_map_fn)

    # batch datasets; sequences are padded to a common length per batch
    train_data = ds_train.padded_batch(batch_size,
                                      padded_shapes = ([-1],[]))
    valid_data = ds_valid.padded_batch(batch_size,
                                      padded_shapes = ([-1],[]))
    test_data = ds_test.padded_batch(batch_size,
                                      padded_shapes = ([-1],[]))

    return (train_data, valid_data, test_data, len(token_counts))

In [None]:
# define helper function for building models with different architectures
from tensorflow.keras.layers import (
    Embedding, Bidirectional, SimpleRNN, LSTM, GRU)

def build_rnn_model(embedding_dim, vocab_size,
                   recurrent_type = 'SimpleRNN',
                   n_recurrent_units = 64,
                   n_recurrent_layers = 1,
                   bidirectional = True):
    """Build a Sequential model: embedding, recurrent layer(s), dense head.

    Parameters
    ----------
    embedding_dim : int
        Output dimension of the embedding layer.
    vocab_size : int
        Input dimension of the embedding layer.
    recurrent_type : str
        One of 'SimpleRNN', 'LSTM' or 'GRU'.
    n_recurrent_units : int
        Number of units in every recurrent layer.
    n_recurrent_layers : int
        Number of stacked recurrent layers. All but the last one return the
        full sequence so that the next recurrent layer can consume it.
    bidirectional : bool
        If True, every recurrent layer is wrapped in `Bidirectional`.

    Returns
    -------
    The (uncompiled) `tf.keras.Sequential` model, ending in a 64-unit ReLU
    dense layer and a single-unit sigmoid output layer.

    Raises
    ------
    ValueError
        If `recurrent_type` is not one of the supported names.
    """
    # Validate up front: with an unsupported name no branch would create a
    # layer, and the function would fail later with an unrelated
    # "referenced before assignment" error.
    name_prefixes = {'SimpleRNN': 'simprnn', 'LSTM': 'lstm', 'GRU': 'gru'}
    if recurrent_type not in name_prefixes:
        raise ValueError(
            "Unsupported recurrent_type {!r}; expected one of {}".format(
                recurrent_type, sorted(name_prefixes)))

    tf.random.set_seed(1)
    model = tf.keras.Sequential()
    model.add(Embedding(input_dim = vocab_size,
                       output_dim = embedding_dim,
                       name = 'embed-layer'))

    layer_classes = {'SimpleRNN': SimpleRNN, 'LSTM': LSTM, 'GRU': GRU}
    layer_cls = layer_classes[recurrent_type]
    name_template = name_prefixes[recurrent_type] + '-layer-{}'

    for i in range(n_recurrent_layers):
        # only the last recurrent layer collapses the sequence
        return_sequences = (i < n_recurrent_layers-1)
        recurrent_layer = layer_cls(units = n_recurrent_units,
                                    return_sequences = return_sequences,
                                    name = name_template.format(i))
        if bidirectional:
            recurrent_layer = Bidirectional(recurrent_layer,
                                       name = 'bidir-' + recurrent_layer.name)

        model.add(recurrent_layer)

    model.add(tf.keras.layers.Dense(64, activation = 'relu'))
    model.add(tf.keras.layers.Dense(1, activation = 'sigmoid'))

    return model
