## Download and pre-process data

In [None]:
import io
import os
import re
import shutil
import string
import tensorflow as tf
import numpy as np

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.layers import TextVectorization

In [None]:
# Source archive of the IMDB review dataset.
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

# Fetch the archive into the current directory and unpack it there.
dataset = tf.keras.utils.get_file(
    fname="aclImdb_v1.tar.gz",
    origin=url,
    untar=True,
    cache_dir='.',
    cache_subdir='',
)

# The 'aclImdb' folder is looked up next to the path returned by get_file.
download_root = os.path.dirname(dataset)
dataset_dir = os.path.join(download_root, 'aclImdb')
os.listdir(dataset_dir)

In [None]:
# Training split of the dataset; list its entries to inspect the layout.
train_dir = os.path.join(dataset_dir, "train")
os.listdir(path=train_dir)

In [None]:
# Drop the 'unsup' folder from the training directory so that only the
# remaining sub-folders are picked up as classes when the data is loaded.
remove_dir = os.path.join(train_dir, 'unsup')

# Guard the removal: once the folder is gone, re-running this cell would
# otherwise fail with FileNotFoundError and break "Restart & Run All".
if os.path.isdir(remove_dir):
    shutil.rmtree(remove_dir)

## Data Loading

In [2]:
import io
import os
import re
import shutil
import string
import tensorflow as tf
import numpy as np

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.layers import TextVectorization

In [3]:
batch_size = 1024
seed = 12345

# Both subsets are read from the same directory with the same seed and
# validation_split; only the `subset` argument differs between them.
split_kwargs = dict(batch_size=batch_size, validation_split=0.2, seed=seed)

train_ds = tf.keras.utils.text_dataset_from_directory(
    'aclImdb/train', subset='training', **split_kwargs
)
val_ds = tf.keras.utils.text_dataset_from_directory(
    'aclImdb/train', subset='validation', **split_kwargs
)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


In [4]:
# Look at the first example (label, then raw bytes) of a single batch.
for text_batch, label_batch in train_ds.take(1):
    first_label = label_batch[0].numpy()
    first_text = text_batch.numpy()[0]
    print(first_label)
    print(first_text)

1
b"The first von Trier movie i've ever seen was breaking the waves. Sure a nice movie but it definitely stands in the shadow of europa. Europa tells a story of a young German-American who wants to experience Germany just after the second world war. He takes a job that his uncle has arranged for him as a purser on a luxues train. Because of his job, he travels all through an almost totally destroyed germany, meeting with the killing of traitors, and hunt for former nazi party members. The society is suffering from corruption. His uncle has narrowed his conciousness by focussing on the job he has also as a purser on the train. By coincidence the main character get involved in bombing and terrorism by a group called 'werewolves' they put pressure on him to help them placing bombs on trains. The atmosphere is astounding. The viewer is taken from scene to scene by a man attempting to put the viewer under hypnosis and then counting to wake you up in a new scene. Just when you think you've s

In [5]:
# Show the first example of one batch with the review decoded to text.
for text_batch, label_batch in train_ds:
    print(label_batch[0].numpy())
    # Decode as UTF-8 rather than ASCII: a review containing any non-ASCII
    # byte makes .decode('ascii') raise UnicodeDecodeError, and which review
    # comes first depends on how the dataset was shuffled.
    print(text_batch.numpy()[0].decode('utf-8'))
    break

0
I read the comment of Chris_m_grant from United States.<br /><br />He wrote : " A Fantastic documentary of 1924. This early 20th century geography of today's Iraq was powerful."<br /><br />I would like to thank Chris and people who are interested in Bakhtiari Nomads of Iran, the Zagros mountains and landscapes and have watched the movie Grass, A Nation's battle for life. These traditions you saw in the movie have endured for centuries and will go on as long as life endures. I am from this region of Iran myself. I am a Bakhtiari. <br /><br />Chris, I am sorry to bother you but Bakhtiari region of Zardkuh is in Iran not in Irak as you mentioned in your comment. Iran and Irak are two different and distinct countries. Taking an Iranian for an Irankian is almost like taking an American for an Mexican. Thanks,<br /><br />Ziba


## Vectorization

In [6]:
# Passed as `max_tokens` to the TextVectorization layer below.
vocal_size = 20_000
# Passed as `output_sequence_length` to the TextVectorization layer below.
sequence_len = 200

def custom_standardization(input_data):
    """Normalize raw review text before tokenization.

    Applies three steps in order: lowercase the input, replace every
    ``<br />`` tag with a single space, and delete all characters listed in
    ``string.punctuation``.

    Args:
        input_data: String tensor holding the raw text.

    Returns:
        String tensor with the cleaned text.
    """
    punctuation_pattern = f"[{re.escape(string.punctuation)}]"
    text = tf.strings.lower(input_data)
    text = tf.strings.regex_replace(text, "<br />", " ")
    return tf.strings.regex_replace(text, punctuation_pattern, "")

# Text -> integer-id layer using the custom standardization, the configured
# token limit and a fixed output sequence length.
vectorization = TextVectorization(
    standardize=custom_standardization,
    max_tokens=vocal_size,
    output_mode="int",
    output_sequence_length=sequence_len,
)

# Adapt the layer on the review texts only; labels are dropped first.
text_only_ds = train_ds.map(lambda text, _label: text)
vectorization.adapt(text_only_ds)

In [13]:
def vectorize_text(text, label):
    """Run text through the vectorization layer, passing the label through.

    Args:
        text: Tensor of raw review strings.
        label: Label tensor that belongs to ``text``; returned unchanged.

    Returns:
        Tuple ``(vectorized_text, label)``.
    """
    # Append a trailing axis before handing the strings to the layer.
    column = tf.expand_dims(text, axis=-1)
    token_ids = vectorization(column)
    return token_ids, label

# Replace the raw-text datasets with their vectorized versions.
# The dtype guard keeps this cell idempotent: the names are reassigned in
# place, so an unguarded re-run would feed the already-vectorized integer
# ids back into the text layer and fail.
if train_ds.element_spec[0].dtype == tf.string:
    train_ds = train_ds.map(vectorize_text)
if val_ds.element_spec[0].dtype == tf.string:
    val_ds = val_ds.map(vectorize_text)

In [14]:
# Look at the first example of one training batch after vectorization.
for text_batch, label_batch in train_ds.take(1):
    first_label = label_batch[0].numpy()
    first_ids = text_batch.numpy()[0]
    print(first_label)
    print(first_ids)

1
[  280    43   457  1037   107    31     2   201    58    31     2   201
   268     9    13    37   270    15    11   647    12    10    67  4890
    42     2     1     5     2   134     3   199     9  1823   368    46
   113    18     9     7    21     9     7    30   146     2    62     7
   160   156    15     2    80     5     2  4434  2751   326  3058     9
     1   708    20     2  8875     5     2    80    32  4899  1714     2
 18710     2  4535     1     5     2    80    32  4149  1829  1072    20
     2  1837   505     6     2   212    12     2  1242  1588  6270    41
    54     2     1     1    32     2   166  1059  4751    36  1207    99
   519     6 11394     2  4822    82   574     3  1597 16682    65 18015
     4  7910  1902     8     2    19  4679    11    77    12    34   456
     6    66     2  1873  6245    20     2  1837    30     5   128  6705
    25    75   249     8     1  2837   862     8     2   751  3175   450
    29   484    36    43  2377   447 18015     3 

In [15]:
# Look at the first example of one validation batch after vectorization.
for text_batch, label_batch in val_ds.take(1):
    first_label = label_batch[0].numpy()
    first_ids = text_batch.numpy()[0]
    print(first_label)
    print(first_ids)

0
[    2  1087     5  1028     5    11    17    43  4825     1   393    20
     9    18    53    43     2   164  4194    14  2174  5284   116     8
  1976   175     2    84   149    12   159    11    17   811    34    24
   256     6  2487     4    17    16   280   175     8     9    15   673
   225     5   261    71    25     6    25    11    17   786   180  3472
  1188   329    12    24  7747    12   221     4   459   165   183     6
   413     9    30   126  4678 12586     5   842  1836    17  2226     7
     2   328     5    11  1227   417    19    29     4    49   598   827
    15  4678     6    76    46   279   398     4  2229    17     8  3953
    37    53   478    76 14866     8     2  1503   337    94    11   987
    17     7    42    46   987   197    17    12    13   109  1689    81
    11   987  1731  1110   304   175     8     9   450     2    17    64
   406     6   472   843     2   459   205    62   352   362   254   110
   461  4678    53  3045     2     1   120    53 

In [None]:
# Cache each dataset and prefetch with a buffer of 10 elements.
train_ds, val_ds = (
    split.cache().prefetch(buffer_size=10) for split in (train_ds, val_ds)
)