In [2]:
import os
import shutil

In [3]:
import tensorflow as tf

In [4]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dense, Embedding, GlobalAveragePooling1D

In [5]:
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
dataset = tf.keras.utils.get_file('aclImdb_v1.tar.gz', url, untar = True, cache_dir = '.', cache_subdir = '')
dataset

'.\\aclImdb_v1.tar.gz'

In [7]:
dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')
os.listdir(dataset_dir)

['imdb.vocab', 'imdbEr.txt', 'README', 'test', 'train']

In [8]:
train_dir = os.path.join(dataset_dir, 'train')
train_dir

'.\\aclImdb\\train'

In [9]:
os.listdir(train_dir)

['labeledBow.feat',
 'neg',
 'pos',
 'unsup',
 'unsupBow.feat',
 'urls_neg.txt',
 'urls_pos.txt',
 'urls_unsup.txt']

In [10]:
remove_dir = os.path.join(train_dir, 'unsup')
remove_dir

'.\\aclImdb\\train\\unsup'

In [11]:
if False:
    shutil.rmtree(remove_dir)

In [12]:
os.listdir(train_dir)

['labeledBow.feat',
 'neg',
 'pos',
 'unsup',
 'unsupBow.feat',
 'urls_neg.txt',
 'urls_pos.txt',
 'urls_unsup.txt']

In [13]:
BATCH_SIZE = 64
seed = 6

In [16]:
train_ds = tf.keras.preprocessing.text_dataset_from_directory('aclImdb/train', validation_split = 0.2, subset = 'training', batch_size = BATCH_SIZE, seed = seed)
train_ds

Found 75000 files belonging to 3 classes.
Using 60000 files for training.


<BatchDataset shapes: ((None,), (None,)), types: (tf.string, tf.int32)>

In [17]:
validation_ds = tf.keras.preprocessing.text_dataset_from_directory('aclImdb/test', validation_split = 0.2, subset = 'validation', batch_size = BATCH_SIZE, seed = seed)
validation_ds

Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


<BatchDataset shapes: ((None,), (None,)), types: (tf.string, tf.int32)>

In [20]:
for text_batch, label_batch in train_ds.take(1):
    for i in range(5):
        print(label_batch[i].numpy(), text_batch[i].numpy())
        

2 b'I watched this after seeing Lindsey Lohan in the better than average "Freaky Friday" and the brilliant "Mean Girls". my expectations were fairly high as i see Lindsey Lohan as the next big thing. But i was disappointed at this film OK, so it\'s a Disney film, so i couldn\'t expect a Mean Girls style of humour, or a Solid easy storyline, like that of Freaky Friday. but i at least expected a little more passion in the movie. The storyline was drab and very slim. there wasn\'t much to go apart from the Lola (Lindsey Lohan) characters dream of being a starlet. The plot was so weak, that absolutely anything could of happen, which isn\'t really a good thing. The acting, especially from Lindsey, was very amateurish. Linsey\'s character was so OTT that it looked as though she had ignored the director and decided to improvise her role. Alison Pill\'s role (Ella) was boring and lifeless, and i could see that she was upstaged by an over excited Lindsey. the sub characters were poorly written,

In [22]:
AUTO_TUNE = tf.data.experimental.AUTOTUNE

In [23]:
train_ds = train_ds.cache().prefetch(buffer_size = AUTO_TUNE)
validation_ds = validation_ds.cache().prefetch(buffer_size = AUTO_TUNE)

In [25]:
embedding_layer = Embedding(1000, 5)

In [26]:
result = embedding_layer(tf.constant([1, 2, 3]))
result

<tf.Tensor: shape=(3, 5), dtype=float32, numpy=
array([[ 0.02641055, -0.02836857,  0.00481981,  0.02186115, -0.03276283],
       [ 0.02527031,  0.01831198, -0.02533183,  0.00019568, -0.02313043],
       [ 0.03835136,  0.02007956,  0.00871701, -0.02855083,  0.01030844]],
      dtype=float32)>

In [29]:
test_string = '!@rad\
gs'

In [30]:
lower = tf.strings.lower(test_string)
lower

<tf.Tensor: shape=(), dtype=string, numpy=b'!@radgs'>

In [53]:
html_stripped = tf.strings.regex_replace(lower, '<br />', ' ')
html_stripped.numpy()

b'!@radgs'

In [56]:
import string
import re

In [59]:
final_str = tf.strings.regex_replace(html_stripped, '[%s]' % re.escape(string.punctuation), '')
final_str.numpy()

b'radgs'