In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
import pandas as pd

In [2]:
# load the movie review data from the CSV file (read as UTF-8)
df = pd.read_csv(
    "../../movie_data.csv",
    encoding='utf-8',
)

In [3]:
# preview the first rows of the loaded DataFrame
df.head()

Unnamed: 0,review,sentiment
0,"Election is a Chinese mob movie, or triads in ...",1
1,I was just watching a Forensic Files marathon ...,0
2,Police Story is a stunning series of set piece...,1
3,"Dear Readers,<br /><br />The final battle betw...",1
4,I have seen The Perfect Son about three times....,1


In [5]:
# build a TensorFlow dataset of (features, label) pairs from the DataFrame
# NOTE: pop() removes the 'sentiment' column from df in place, so this cell
# cannot be re-run without reloading df first
target = df.pop('sentiment')
review_array = df.values      # the remaining column(s) of df
label_array = target.values   # the sentiment labels
ds_raw = tf.data.Dataset.from_tensor_slices((review_array, label_array))
ds_raw

<TensorSliceDataset shapes: ((1,), ()), types: (tf.string, tf.int64)>

In [26]:
# sanity check: for the first three examples, print the first
# 50 characters of the review together with its sentiment label
for review, sentiment in ds_raw.take(3):
    review_text = review.numpy()[0]
    tf.print(review_text[:50], sentiment)

b'Election is a Chinese mob movie, or triads in this' 1
b'I was just watching a Forensic Files marathon on C' 0
b'Police Story is a stunning series of set pieces fo' 1


In [18]:
# split into test, training and validation datasets
tf.random.set_seed(1)
# shuffle once; reshuffle_each_iteration=False keeps the same order on every
# pass, so the take/skip subsets below stay disjoint
ds_raw = ds_raw.shuffle(50000, reshuffle_each_iteration=False)
# first 25,000 examples -> test set, everything after -> train + validation
ds_raw_test, ds_raw_train_valid = ds_raw.take(25000), ds_raw.skip(25000)
# of the remainder: first 20,000 -> training, the rest -> validation
ds_raw_train, ds_raw_valid = (
    ds_raw_train_valid.take(20000),
    ds_raw_train_valid.skip(20000),
)

In [19]:
# collect unique tokens
# use Counter class from the collections package
from collections import Counter

# The text utilities (Tokenizer, TokenTextEncoder) live under
# tfds.features.text in older tensorflow_datasets releases and under
# tfds.deprecated.text in newer ones; use whichever namespace is available
# so the notebook runs on both.
tfds_text = tfds.deprecated.text if hasattr(tfds, 'deprecated') else tfds.features.text

tokenizer = tfds_text.Tokenizer()
# running tally of how often each token occurs (filled in the next cell)
token_counts = Counter()

In [24]:
# tokenize every training review and add its tokens to the running tally
for review, _label in ds_raw_train:
    token_counts.update(tokenizer.tokenize(review.numpy()[0]))
# show vocab size (number of distinct tokens seen)
vocab_size = len(token_counts)
print(vocab_size)

87397


In [26]:
# use TokenTextEncoder class to create mappings
# The text utilities live under tfds.features.text in older
# tensorflow_datasets releases and under tfds.deprecated.text in newer ones;
# use whichever namespace is available.
tfds_text = tfds.deprecated.text if hasattr(tfds, 'deprecated') else tfds.features.text
# create an encoder object from the collected tokens:
encoder = tfds_text.TokenTextEncoder(token_counts)
# see for an example text:
example_str = 'This is example, YOU!'
print(encoder.encode(example_str))

[104, 105, 24, 10193]


In [28]:
# define transformation function
def encode(text_tensor, label):
    """Encode one review with the notebook-level ``encoder``.

    Args:
        text_tensor: eager tensor whose first element is the review text
            (only element 0 of its NumPy value is used).
        label: the label that belongs to the review; passed through unchanged.

    Returns:
        A tuple ``(encoded_text, label)`` where ``encoded_text`` is the
        result of ``encoder.encode`` on the review text.
    """
    return encoder.encode(text_tensor.numpy()[0]), label

# wrap the transformation function so it can be used as a TensorFlow operator
def encode_map_fn(text, label):
    """Run ``encode`` through ``tf.py_function``.

    ``encode`` calls ``.numpy()`` on its input, so it is wrapped in
    ``tf.py_function`` here; both outputs are declared as ``tf.int64``.

    Args:
        text: tensor holding the review text.
        label: tensor holding the label.

    Returns:
        The ``(encoded_text, label)`` tensors produced by ``tf.py_function``.
    """
    output_types = (tf.int64, tf.int64)
    return tf.py_function(func=encode, inp=[text, label], Tout=output_types)

In [29]:
# encode the text of every split into integers
ds_train, ds_valid, ds_test = (
    raw_split.map(encode_map_fn)
    for raw_split in (ds_raw_train, ds_raw_valid, ds_raw_test)
)

In [30]:
# sanity check: show the shape of five encoded training reviews
# (drawn through a 1000-element shuffle buffer)
tf.random.set_seed(1)
for encoded_review, _label in ds_train.shuffle(1000).take(5):
    print('Sequence length: ', encoded_review.shape)

Sequence length:  (248,)
Sequence length:  (181,)
Sequence length:  (243,)
Sequence length:  (801,)
Sequence length:  (662,)


In [31]:
# divide datasets into mini-batches with a batch size of 32
# padded_shapes=([-1], []) pads the encoded reviews within a batch to a
# common length; the labels are scalars and need no padding
train_data, valid_data, test_data = (
    encoded_split.padded_batch(32, padded_shapes=([-1], []))
    for encoded_split in (ds_train, ds_valid, ds_test)
)

### Feature Embedding