In [1]:
import os
import shutil

In [2]:
import tensorflow as tf

In [33]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dense, Embedding, GlobalAveragePooling1D

In [4]:
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
dataset = tf.keras.utils.get_file('aclImdb_v1.tar.gz', url, untar = True, cache_dir = '.', cache_subdir = '')
dataset

'.\\aclImdb_v1.tar.gz'

In [5]:
dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')
os.listdir(dataset_dir)

['imdb.vocab', 'imdbEr.txt', 'README', 'test', 'train']

In [6]:
train_dir = os.path.join(dataset_dir, 'train')
train_dir

'.\\aclImdb\\train'

In [7]:
os.listdir(train_dir)

['labeledBow.feat',
 'neg',
 'pos',
 'unsup',
 'unsupBow.feat',
 'urls_neg.txt',
 'urls_pos.txt',
 'urls_unsup.txt']

In [8]:
remove_dir = os.path.join(train_dir, 'unsup')
remove_dir

'.\\aclImdb\\train\\unsup'

In [9]:
if False:
    shutil.rmtree(remove_dir)

In [10]:
os.listdir(train_dir)

['labeledBow.feat',
 'neg',
 'pos',
 'unsup',
 'unsupBow.feat',
 'urls_neg.txt',
 'urls_pos.txt',
 'urls_unsup.txt']

In [11]:
BATCH_SIZE = 1024
seed = 6

In [12]:
train_ds = tf.keras.preprocessing.text_dataset_from_directory('aclImdb/train', validation_split = 0.2, subset = 'training', batch_size = BATCH_SIZE, seed = seed)
train_ds

Found 75000 files belonging to 3 classes.
Using 60000 files for training.


<BatchDataset shapes: ((None,), (None,)), types: (tf.string, tf.int32)>

In [13]:
validation_ds = tf.keras.preprocessing.text_dataset_from_directory('aclImdb/test', validation_split = 0.2, subset = 'validation', batch_size = BATCH_SIZE, seed = seed)
validation_ds

Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


<BatchDataset shapes: ((None,), (None,)), types: (tf.string, tf.int32)>

In [14]:
for text_batch, label_batch in train_ds.take(1):
    for i in range(5):
        print(label_batch[i].numpy(), text_batch[i].numpy())
        

2 b"I saw the other user comment on this movie and just had to add my own. This movie was terrible. Seeing that Juliette Lewis and Brad Pitt were both in it I took a chance and watched the film that my sister purchased in a one dollar movie bin, and let me tell you, not worth one dollar. Maybe it's because of the fact it is a made-for-TV movie, although I have found many other made-for-TV movies much more enjoyable. The story and dialog are poorly written, and the plot is unbelievable and mediocre at best. The performances were not so bad, but were not enough to carry this film. Lewis plays a girl who gets caught up in these crazy circumstances and never seems to get things right, while Pitt plays this mean nasty character who takes advantage of this poor girl. The only good thing about this film is Lewis' t-shirts which are pretty cool. Do your self a favor and skip it."
2 b"1st watched 3/28/2004 - 4 out of 10(Dir-Keith Snyder): Unique but slow moving drama about a detective who is on

In [15]:
AUTO_TUNE = tf.data.experimental.AUTOTUNE

In [16]:
train_ds = train_ds.cache().prefetch(buffer_size = AUTO_TUNE)
validation_ds = validation_ds.cache().prefetch(buffer_size = AUTO_TUNE)

In [17]:
embedding_layer = Embedding(1000, 5)

In [18]:
result = embedding_layer(tf.constant([1, 2, 3]))
result

<tf.Tensor: shape=(3, 5), dtype=float32, numpy=
array([[-0.00968832,  0.01585987,  0.03853962, -0.01252273,  0.02247994],
       [ 0.01424104,  0.03008163, -0.02072786, -0.0324785 , -0.01261304],
       [-0.02801698,  0.01311456, -0.04975003,  0.01070889, -0.04798668]],
      dtype=float32)>

In [19]:
test_string = '!@rad\
gs'

In [20]:
lower = tf.strings.lower(test_string)
lower

<tf.Tensor: shape=(), dtype=string, numpy=b'!@radgs'>

In [21]:
html_stripped = tf.strings.regex_replace(lower, '<br />', ' ')
html_stripped.numpy()

b'!@radgs'

In [22]:
import string
import re

In [23]:
final_str = tf.strings.regex_replace(html_stripped, '[%s]' % re.escape(string.punctuation), '')
final_str.numpy()

b'radgs'

In [24]:
def custom_standarization(test_string):
    
    lower = tf.strings.lower(test_string)
    html_stripped = tf.strings.regex_replace(lower, '<br />', ' ')
    final_str = tf.strings.regex_replace(html_stripped, '[%s]' % re.escape(string.punctuation), '')
    
    return final_str   

In [25]:
vocab_size = 10000
sequence_length = 100

In [26]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [27]:
vectorize_layer = TextVectorization(standardize = custom_standarization,
                                      max_tokens = vocab_size,
                                      output_mode = 'int',
                                      output_sequence_length = sequence_length
                                     )

In [28]:
vectorize_layer

<tensorflow.python.keras.layers.preprocessing.text_vectorization.TextVectorization at 0x1e244c23c70>

In [29]:
text_ds = train_ds.map(lambda x, y:x)
text_ds

<MapDataset shapes: (None,), types: tf.string>

In [30]:
vectorize_layer.adapt(text_ds)

In [31]:
embedding_dim = 16

In [38]:
model = Sequential(
[
    vectorize_layer,
    Embedding(input_dim = vocab_size, output_dim = embedding_dim),
    GlobalAveragePooling1D(),
    Dense(16, activation = 'relu'),
    Dense(1)
])

In [39]:
tensor_callback = tf.keras.callbacks.TensorBoard(log_dir = './logs')

In [40]:
model.compile(optimizer = 'adam', loss = tf.keras.losses.BinaryCrossentropy(from_logits = True), metrics = ['accuracy'])

In [48]:
model.fit(train_ds, validation_data = validation_ds, epochs = 30, callbacks = [tensor_callback])

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x1e247c74f10>

In [51]:
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [52]:
%tensorboard --logdir logs

Reusing TensorBoard on port 6006 (pid 10364), started 0:11:57 ago. (Use '!kill 10364' to kill it.)

In [50]:
!kill 10364

kill: 10364: No such process


In [57]:
vocab = vectorize_layer.get_vocabulary()
print(type(vocab))
print(len(vocab))
print(vocab[:10])

<class 'list'>
10000
['', '[UNK]', 'the', 'and', 'a', 'of', 'to', 'is', 'in', 'it']
