####0. Setup

In [3]:
import nltk
import pickle
import numpy as np
import tensorflow as tf

from tensorflow import keras
from collections import Counter
from nltk.corpus import stopwords
from collections import OrderedDict

#####1. Load Data

In [5]:
import os
train_data_path = os.path.join("./", "amazon_review_polarity_csv", "train.csv")
test_data_path  = os.path.join("./", "amazon_review_polarity_csv", "test.csv")

In [20]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [14]:
## we can use pandas library to load train and test datasets, but since the datasets are quite big, a more scalable approach
## is required to load from disk as necessary. The tf.data module provides methods to extract records from one or more CSV files.

amazoon_batches = tf.data.experimental.make_csv_dataset(
    train_data_path, batch_size=32, label_name='2', num_epochs=1, prefetch_buffer_size=1000,
)

print(type(amazoon_batches))

<class 'tensorflow.python.data.ops.dataset_ops.PrefetchDataset'>


In [15]:
for feature_batch, label_batch in amazoon_batches.take(1):
    print("'sentiment': {}".format(label_batch))
    print("features:")
    for key, value in feature_batch.items():
        print("  {!r:20s}: {}".format(key, value))

'sentiment': [2 1 2 2 1 1 1 1 2 1 1 2 1 1 2 1 2 1 2 1 2 1 2 2 2 1 1 2 2 2 2 2]
features:
  'Stuning even for the non-gamer': [b'One of the best!' b'Not a Halti'
 b'Thought-provoking and at times chilling' b'Fantastic' b'Has NO value!'
 b'i have get my order' b'Nice camera, when it works'
 b"2 Unlimited is back, but not as you'd expect them to." b'Great album'
 b'Flimsy materials, potentially dangerous item'
 b"Parents - Don't Let Your Kids Watch This Movie" b"Gon' hed Musiq"
 b'Unidentified Reading Object' b'Not 3D' b'Fantastic'
 b"The Book is WAY Better..and That's all You Really Need to Know!"
 b"Good Enough I'm Looking for the Author" b'Kiddie Fodder...'
 b'She is a good singer.' b'Gardening lawn book'
 b'QUIRKY, ENERGETIC, BLAZING PULSE-POUNDER' b'My Review'
 b'scarlet letter' b'The most entertaining film of the year'
 b'Where Is The DVD?'
 b'Motorola Razr V3, V3i, V3c, Slvr L7, L6, L2 Pebl U6, V190, Mpx200, V360, V323, V325 - Ch710 OEM Original Auto Car Charger Syn084'
 b'on websi

#####2. Process Dataset

In [16]:
@tf.function
def preprocess(features_batch, labels_batch):
    features_name = list(features_batch.keys())    
    concat_texts = tf.strings.join(
        [features_batch[name] for name in features_name], separator='<sep>')
    
    concat_texts = tf.strings.substr(concat_texts, 0, 1000)
    concat_texts = tf.strings.lower(concat_texts)
    concat_texts = tf.strings.regex_replace(concat_texts, '[^a-zA-Z0-9]', ' ')
    concat_texts = tf.strings.regex_replace(concat_texts, r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*', "")
    concat_texts = tf.strings.strip(concat_texts)
    concat_texts = tf.strings.split(concat_texts)
    concat_texts = concat_texts.to_tensor(default_value="<pad>")
    
    concat_feature = OrderedDict([('concat_feature', concat_texts)])
    
    return concat_feature, labels_batch

In [21]:
vocabs = Counter()

for feature_batch, label_batch in amazoon_batches.map(preprocess).take(100000):
    for key, value in feature_batch.items():
        for review in value:
            vocabs.update(list(review.numpy()))
            
print("\nVocabs were extracted successfuly.")

Cause: for/else statement not yet supported


Cause: for/else statement not yet supported

Vocabs were extracted successfuly.


In [24]:
#with open('vocabularies_data.pkl', 'wb') as vocabs_file:
    ## we save the extracted vocabularies to disk to avoid this again later (since this costs some times)
#    pickle.dump(vocabs, vocabs_file, pickle.HIGHEST_PROTOCOL)
######################################################

with open('vocabularies_data.pkl', 'rb') as vocabs_file:
    ## load saved vocabs
    vocabs = pickle.load(vocabs_file)
    
print(vocabs.most_common()[:5])

[(b'<pad>', 159021656), (b'sep', 3200463), (b'book', 1757578), (b'one', 1230122), (b'great', 955496)]


In [25]:
## vocabs look-up table (convert each of words to an integer)

vocab_size = 20000
vocabs_short = vocabs.most_common()[:vocab_size]
vocabs_short = tf.constant([word for word, count in vocabs_short])

table_init = tf.lookup.KeyValueTensorInitializer(vocabs_short, tf.range(len(vocabs_short), dtype=tf.int64))

num_oov_buckets = 1000
body_table = tf.lookup.StaticVocabularyTable(table_init, num_oov_buckets=num_oov_buckets)

In [26]:
body_table.lookup(tf.constant(['<pad>', 'good', 'terrible', 'best']))

<tf.Tensor: shape=(4,), dtype=int64, numpy=array([  0,   5, 277,  20])>

In [27]:
# labels look-up table (using 0,1 instead of 1,2)

old_lbls = tf.constant([1, 2], dtype=tf.int64)
new_lbls = tf.constant([0, 1], dtype=tf.int64)

table_init = tf.lookup.KeyValueTensorInitializer(old_lbls, new_lbls)
num_oov_bucket = 1

lbls_table = tf.lookup.StaticVocabularyTable(table_init, num_oov_bucket)

In [28]:
@tf.function
def convert_text(features_batch, labels_batch):
    
    data_tensor = features_batch['concat_feature']   
    converted_text = OrderedDict([('concat_feature', body_table.lookup(data_tensor))])
    return converted_text, labels_batch

@tf.function
def convert_lbls(features_batch, labels_batch):
    labels_batch = tf.cast(labels_batch, tf.int64)
    return features_batch, lbls_table.lookup(labels_batch)


@tf.function
def full_preprocess(ds):
    ds_clean = ds.map(preprocess)
    ds_converted = ds_clean.map(convert_text)
    ds_preprocessed = ds_converted.map(convert_lbls)
    
    return ds_preprocessed

In [29]:
train_ds = full_preprocess(amazoon_batches)
train_ds = train_ds.shuffle(1024).prefetch(1)

In [30]:
@tf.function
def dict2tens(feature_batch, label_batch):
    return feature_batch['concat_feature'], label_batch

#####3. Process Data

In [31]:
embedding_dim = 65

model = keras.models.Sequential([
    keras.layers.Embedding(vocab_size + num_oov_buckets, embedding_dim),
    keras.layers.GRU(64, return_sequences=True),
    keras.layers.GRU(32),
    keras.layers.Dense(1, activation='sigmoid')
])

In [32]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 65)          1365000   
                                                                 
 gru (GRU)                   (None, None, 64)          25152     
                                                                 
 gru_1 (GRU)                 (None, 32)                9408      
                                                                 
 dense (Dense)               (None, 1)                 33        
                                                                 
Total params: 1,399,593
Trainable params: 1,399,593
Non-trainable params: 0
_________________________________________________________________


In [33]:
earlly_stopping_cb = keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True)
checkpoint_cb = keras.callbacks.ModelCheckpoint("amazoon_review_model_chkpt")

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [34]:
history = model.fit(train_ds.map(dict2tens), epochs=5, callbacks=[earlly_stopping_cb, checkpoint_cb])

Epoch 1/5
 112495/Unknown - 1351s 12ms/step - loss: 0.2062 - accuracy: 0.9139



Epoch 2/5



Epoch 3/5



Epoch 4/5



Epoch 5/5





#####4. Evaluate the model

In [35]:
testing_batches = tf.data.experimental.make_csv_dataset(
    train_data_path, batch_size=32, label_name='2', num_epochs=1, prefetch_buffer_size=1000,
)

test_ds = full_preprocess(testing_batches)
test_ds = test_ds.shuffle(1024).prefetch(1)

model.evaluate(test_ds.map(dict2tens))



[0.13925893604755402, 0.947950541973114]

#####5. Save the trained model.

In [36]:
model.save("amazon-reviews-sentiment-analysis-classifier.h5")