In [1]:
import matplotlib.pyplot as plt
import numpy as np
import os
import shutil
import re
import string

import tensorflow as tf

In [2]:
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

dataset = tf.keras.utils.get_file("aclImdb_v1", url, untar=True, cache_dir=".", cache_subdir="")


Downloading data from https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


In [3]:
dataset_dir = os.path.join(os.path.dirname(dataset),"aclImdb")
dataset_dir

'./aclImdb'

In [4]:
os.listdir(dataset_dir)

['imdb.vocab', 'train', 'README', 'test', 'imdbEr.txt']

In [5]:
train_dir = os.path.join(dataset_dir,"train")
train_dir

'./aclImdb/train'

In [6]:
os.listdir(train_dir)

['labeledBow.feat',
 'urls_unsup.txt',
 'pos',
 'urls_pos.txt',
 'unsupBow.feat',
 'unsup',
 'urls_neg.txt',
 'neg']

In [7]:
remove_dir = os.path.join(train_dir, "unsup")
shutil.rmtree(remove_dir)

In [8]:
sample_file = os.path.join(train_dir, "pos/10002_7.txt")
sample_file

'./aclImdb/train/pos/10002_7.txt'

In [9]:
with open(sample_file) as f:
    print(f.read())

This is easily the most underrated film inn the Brooks cannon. Sure, its flawed. It does not give a realistic view of homelessness (unlike, say, how Citizen Kane gave a realistic view of lounge singers, or Titanic gave a realistic view of Italians YOU IDIOTS). Many of the jokes fall flat. But still, this film is very lovable in a way many comedies are not, and to pull that off in a story about some of the most traditionally reviled members of society is truly impressive. Its not The Fisher King, but its not crap, either. My only complaint is that Brooks should have cast someone else in the lead (I love Mel as a Director and Writer, not so much as a lead).


In [10]:
batch_size = 32
seed = 42

raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    "aclImdb/train",
    batch_size = batch_size,
    validation_split = 0.2,
    subset = "training",
    seed = seed
)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.


In [11]:
for text_batch, label_batch in raw_train_ds.take(1):
    for i in range(3):
        print("Review :", text_batch.numpy()[i])
        print("Label :", label_batch.numpy()[i])

Review : b'"Pandemonium" is a horror movie spoof that comes off more stupid than funny. Believe me when I tell you, I love comedies. Especially comedy spoofs. "Airplane", "The Naked Gun" trilogy, "Blazing Saddles", "High Anxiety", and "Spaceballs" are some of my favorite comedies that spoof a particular genre. "Pandemonium" is not up there with those films. Most of the scenes in this movie had me sitting there in stunned silence because the movie wasn\'t all that funny. There are a few laughs in the film, but when you watch a comedy, you expect to laugh a lot more than a few times and that\'s all this film has going for it. Geez, "Scream" had more laughs than this film and that was more of a horror film. How bizarre is that?<br /><br />*1/2 (out of four)'
Label : 0
Review : b"David Mamet is a very interesting and a very un-equal director. His first movie 'House of Games' was the one I liked best, and it set a series of films with characters whose perspective of life changes as they get

In [12]:
raw_val_ds = tf.keras.preprocessing.text_dataset_from_directory(
    "aclImdb/train",
    batch_size = batch_size,
    validation_split = 0.2,
    subset = "validation",
    seed = seed
)

Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


In [13]:
raw_test_ds = tf.keras.preprocessing.text_dataset_from_directory(
    "aclImdb/test",
    batch_size = batch_size
)

Found 25000 files belonging to 2 classes.


In [14]:
def custom_standardization(input_data):
    lowercase = tf.strings.lower(input_data)
    stripped_html = tf.strings.regex_replace(lowercase, "<br />", " ")
    stripped_punc = tf.strings.regex_replace(stripped_html, "[%s]" % re.escape(string.punctuation), "")
    return stripped_punc

In [15]:
for text_batch, label_batch in raw_train_ds.take(1):
    print("Review :", custom_standardization(text_batch[0]).numpy())
    break

Review : b'having seen most of ringo lams films i can say that this is his best film to date and the most unusual its a ancient china period piece cranked full of kickass martial arts where the location of an underground lair full of traps and dungeons plays as big a part as any of the characters the action is fantastic the story is tense and entertaining and the set design is truely memorable sadly burning paradise has not been made available on dvd and vhs is nexttoimpossible to get your mitts on even if you near the second biggest chinatown in north america like i do if you can find it dont pass it up'


In [16]:
max_features = 10000
sequence_length = 250

vectorize_layer = tf.keras.layers.experimental.preprocessing.TextVectorization(
    standardize = custom_standardization,
    max_tokens = max_features,
    output_mode = "int",
    output_sequence_length = sequence_length
)

In [17]:
train_text = raw_train_ds.map(lambda x,y: x)
vectorize_layer.adapt(train_text)

In [18]:
def vectorize_text(text, label):
    text = tf.expand_dims(text, -1)
    return vectorize_layer(text), label

In [19]:
text_batch, label_batch = next(iter(raw_train_ds))
print("Review:", text_batch[0])
print("Label:", label_batch[0])
print("Vectorize", vectorize_text(text_batch[0], label_batch[0]))

Review: tf.Tensor(b'Belmondo is a tough cop. He goes after a big-time drug dealer (played by Henry Silva, normally a great villain - see "Sharky\'s Machine"; but here he is clearly dubbed, and because of that he lacks his usual charisma). He goes to the scuzziest places of Paris and Marseilles, asks for some names, beats up some people, gets the names, goes to more scuzzy places, asks for more names, beats up more people, etc. The whole movie is punch after punch after punch. It seems that the people who made it had no other ambition than to create the French equivalent of "Dirty Harry". Belmondo, who was 50 here, does perform some good stunts at the beginning; apart from those, "Le Marginal" is a violent, episodic, trite, shallow and forgettable cop movie. (*1/2)', shape=(), dtype=string)
Label: tf.Tensor(0, shape=(), dtype=int32)
Vectorize (<tf.Tensor: shape=(1, 250), dtype=int64, numpy=
array([[   1,    7,    4, 1233, 1021,   27,  261,  101,    4,    1, 1525,
        6992,  248,   3

In [20]:
print(vectorize_layer.get_vocabulary()[2])

the


In [21]:
train_ds = raw_train_ds.map(vectorize_text)
val_ds = raw_val_ds.map(vectorize_text)
test_ds = raw_test_ds.map(vectorize_text)

In [22]:
train_ds = train_ds.cache().prefetch(buffer_size = tf.data.AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size = tf.data.AUTOTUNE)
test_ds = test_ds.cache().prefetch(buffer_size = tf.data.AUTOTUNE)

In [23]:
embedding_dim = 16

In [24]:
model = tf.keras.Sequential([
                             tf.keras.layers.Embedding(max_features+1, embedding_dim),
                             tf.keras.layers.Dropout(0.2),
                             tf.keras.layers.GlobalAveragePooling1D(),
                             tf.keras.layers.Dropout(0.2),
                             tf.keras.layers.Dense(1)
])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 16)          160016    
_________________________________________________________________
dropout (Dropout)            (None, None, 16)          0         
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 1)                 17        
Total params: 160,033
Trainable params: 160,033
Non-trainable params: 0
_________________________________________________________________


In [25]:
model.compile(optimizer="adam",
              loss = tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics = ["accuracy"])

In [26]:
history = model.fit(train_ds,validation_data=val_ds, epochs = 15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [27]:
model.evaluate(test_ds)



[0.3115674555301666, 0.8701199889183044]

In [None]:
model = tf.keras.Sequential([
                             tf.keras.layers.Embedding(max_features+1, embedding_dim),
                             tf.keras.layers.LSTM(64),
                             tf.keras.layers.Dense(64, activation = "relu"),
                             tf.keras.layers.Dense(1)
])

model.summary()

model.compile(optimizer="adam",
              loss = tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics = ["accuracy"])

history = model.fit(train_ds,validation_data=val_ds, epochs = 15)

In [29]:
model = tf.keras.Sequential([
                             tf.keras.layers.Embedding(max_features+1, embedding_dim),
                             tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
                             tf.keras.layers.Dense(64, activation = "relu"),
                             tf.keras.layers.Dense(1)
])

model.summary()

model.compile(optimizer="adam",
              loss = tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics = ["accuracy"])

history = model.fit(train_ds,validation_data=val_ds, epochs = 15)

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 16)          160016    
_________________________________________________________________
bidirectional (Bidirectional (None, 128)               41472     
_________________________________________________________________
dense_3 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 65        
Total params: 209,809
Trainable params: 209,809
Non-trainable params: 0
_________________________________________________________________
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
