In [3]:
import matplotlib.pyplot as plt
import shutil
import re
import os
import string
import tensorflow as tf

from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import preprocessing

In [4]:
# Show which TensorFlow version this notebook is running against.
print(tf.__version__)

2.6.0


In [5]:
# Location of the aclImdb archive to download.
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

# Fetch and untar the archive into the current directory; the path returned
# by get_file is kept in `dataset`.
dataset = tf.keras.utils.get_file(
    fname='aclImdb_v1',
    origin=url,
    untar=True,
    cache_dir='.',
    cache_subdir='',
)

# The data is addressed through the 'aclImdb' folder that sits next to the
# path returned above.
download_parent = os.path.dirname(dataset)
dataset_dir = os.path.join(download_parent, 'aclImdb')

In [6]:
# Show the download location and the derived dataset directory, one per line.
print(os.path.dirname(dataset), dataset_dir, sep='\n')

.
./aclImdb


In [7]:
# Display the entries found directly inside dataset_dir.
os.listdir(dataset_dir)

['.DS_Store', 'imdbEr.txt', 'test', 'imdb.vocab', 'README', 'train']

In [8]:
# Path of the 'train' folder inside the dataset directory; later cells use it
# to read a sample review and to remove the 'unsup' folder.
train_dir = os.path.join(dataset_dir, 'train')
# Display the entries of the training folder.
os.listdir(train_dir)

['urls_unsup.txt',
 '.DS_Store',
 'neg',
 'urls_pos.txt',
 'unsup',
 'urls_neg.txt',
 'pos',
 'unsupBow.feat',
 'labeledBow.feat']

In [9]:
# Print one negative training review to see what the raw text looks like.
sample_file = os.path.join(train_dir, 'neg', '321_1.txt')
# A context manager guarantees the file handle is closed, and the explicit
# encoding avoids depending on the platform's default text encoding.
with open(sample_file, encoding='utf-8') as review_file:
    print(review_file.read())

Shecky, is a god damned legend, make no mistake. Until recently I worked for a UK HiFi & Video retail chain, running their testing department. We would go through many new starters, they would be expected to to learn how to fault find the various detritus that returns as non functional in one way or another from the stores. Now to tortu^^^^^ test the resolve of these new staff members, we would issue them with a copy of Going Overboard. We had hundreds of copies of this film because whenever someone who had bought a particular model of Goodmans DVD player that had this film as a free gift, got round to sending their DVD player back, they never failed to send Shecky back also. Our new staff would be forced to use only Going Overboard to test these machines for faults until they had found a disc or two of their own to test with.<br /><br />Now, as to why this film is so bad, where do I begin?<br /><br />Adam Sandler, who can be so, so very funny, as in Happy Gilmore, or the Wedding Singe

In [10]:
# Remove folders that aren't used for training the model.
delete_dir = os.path.join(train_dir, 'unsup')
# Only delete when the folder is still there, so re-running this cell (or the
# whole notebook against an already-cleaned download) does not raise
# FileNotFoundError.
if os.path.isdir(delete_dir):
    shutil.rmtree(delete_dir)

In [11]:
batch_size = 32
seed = 42

# Create the training dataset, holding back 20 percent of the files for the
# validation set. The directory comes from `train_dir` (the same folder the
# 'unsup' cleanup was applied to) instead of a second hard-coded path, so the
# loader keeps working if the download location changes.
train_ds_raw = preprocessing.text_dataset_from_directory(
    train_dir,
    batch_size=batch_size,
    validation_split=0.2,
    subset='training',
    seed=seed)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.


In [12]:
# Validation subset of the training folder. It must use the same directory,
# validation_split and seed as the training subset so the two do not overlap;
# the directory is taken from `train_dir` rather than a hard-coded path.
val_ds_raw = preprocessing.text_dataset_from_directory(
    train_dir,
    batch_size=batch_size,
    validation_split=0.2,
    subset='validation',
    seed=seed)

Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


In [13]:
# Test dataset, read from the 'test' folder of the dataset directory. The path
# is derived from `dataset_dir` so it follows the download location instead of
# relying on a hard-coded relative path.
test_ds_raw = preprocessing.text_dataset_from_directory(
    os.path.join(dataset_dir, 'test'),
    batch_size=batch_size)

Found 25000 files belonging to 2 classes.


In [14]:
def data_standardization(input):
    """Normalize raw review text before it is tokenized.

    Steps, in order: lowercase the text, replace each '<br />' tag with a
    space, then delete every character in ``string.punctuation``.

    Args:
        input: String tensor of raw review text.

    Returns:
        String tensor with the standardized text.
    """
    lowercase = tf.strings.lower(input)
    # Substitute a space (not the empty string) for the tag: reviews contain
    # text such as "with.<br /><br />Now", and removing the tag outright would
    # glue the neighbouring words into one bogus token ("withnow") once the
    # punctuation is stripped.
    without_html = tf.strings.regex_replace(lowercase, '<br />', ' ')
    return tf.strings.regex_replace(without_html, '[%s]' % re.escape(string.punctuation), '')

In [15]:
# Cap on the vocabulary size and the fixed length of every output sequence.
max_features = 10000
sequence_length = 250

# Text-to-integer layer using the custom standardization function.
vectorize_layer = layers.TextVectorization(
    standardize=data_standardization,
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length,
)

In [16]:
# Text-only view of the training data: keep the text, drop the labels.
train_ds_text = train_ds_raw.map(lambda text, _label: text)
# Let the vectorization layer learn its vocabulary from the training text.
vectorize_layer.adapt(train_ds_text)

In [17]:
def vectorize_text(text, label):
    """Run `vectorize_layer` on the text and pass the label through unchanged.

    Args:
        text: Text tensor taken from a dataset element.
        label: Label belonging to that text.

    Returns:
        Tuple of (vectorized text, label).
    """
    # Add a trailing axis before handing the text to the layer.
    text_column = tf.expand_dims(text, axis=-1)
    token_ids = vectorize_layer(text_column)
    return token_ids, label

In [18]:
# Apply the text vectorization to each of the three raw splits.
train_ds, val_ds, test_ds = (
    raw_split.map(vectorize_text)
    for raw_split in (train_ds_raw, val_ds_raw, test_ds_raw)
)

In [19]:
AUTOTUNE = tf.data.AUTOTUNE

# Cache every split and prefetch with an automatically tuned buffer size.
train_ds, val_ds, test_ds = [
    split.cache().prefetch(buffer_size=AUTOTUNE)
    for split in (train_ds, val_ds, test_ds)
]

In [20]:
# Assemble the classifier layer by layer: embedding -> dropout ->
# average over the sequence axis -> dropout -> single-unit dense output.
model = tf.keras.Sequential()
model.add(layers.Embedding(input_dim=max_features + 1, output_dim=16))
model.add(layers.Dropout(rate=0.2))
model.add(layers.GlobalAveragePooling1D())
model.add(layers.Dropout(rate=0.2))
model.add(layers.Dense(units=1))

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 16)          160016    
_________________________________________________________________
dropout (Dropout)            (None, None, 16)          0         
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 1)                 17        
Total params: 160,033
Trainable params: 160,033
Non-trainable params: 0
_________________________________________________________________


In [21]:
# The loss is computed from logits, so the accuracy metric thresholds the raw
# model output at 0.0.
model.compile(
    optimizer='adam',
    loss=losses.BinaryCrossentropy(from_logits=True),
    metrics=[tf.metrics.BinaryAccuracy(threshold=0.0)],
)

In [None]:
# Number of passes over the training data.
epochs = 10

# Train on the training split and evaluate on the validation split each
# epoch; the returned object is kept in `history`.
history = model.fit(
    x=train_ds,
    epochs=epochs,
    validation_data=val_ds,
)

Epoch 1/10


In [None]:
# Score the trained model on the test dataset. The Keras method is named
# `evaluate`; with the compiled loss and single metric it returns the loss
# followed by the accuracy.
loss, accuracy = model.evaluate(test_ds)

print("Loss:", loss)
print("Accuracy:", accuracy)