In [43]:
import re
import os
from collections import Counter

import pandas as pd
import numpy as np

import nltk
nltk.download('punkt')

import tensorflow as tf
import tensorflow.keras as keras
import tensorflow_datasets as tfds


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## In this notebook I train simple bidirectional LSTM network to perform sentiment analysis on Amazon reviews dataset using TensorFlow. 

### Load positive and negative reviews from files and concatenate them into a single string.

In [44]:
pfad = '/content/drive/My Drive/colab/datasets/amazon2014'
forbid = 'unlabeled'

In [45]:
text = ''
for i in os.listdir(pfad):
  tmpdir = os.path.join(pfad,i)
  print(i)
  #if i == 'books':
  for j in os.listdir(tmpdir):
    print(j)
    if forbid not in j:
      tmpfile = os.path.join(tmpdir,j)
      print(tmpfile)
      with open(tmpfile) as file:
        body = file.read()
      print(len(body))
      text = text + body

dvd
negative.review
/content/drive/My Drive/colab/datasets/amazon2014/dvd/negative.review
1588827
positive.review
/content/drive/My Drive/colab/datasets/amazon2014/dvd/positive.review
1695767
unlabeled.review
books
negative.review
/content/drive/My Drive/colab/datasets/amazon2014/books/negative.review
1538070
positive.review
/content/drive/My Drive/colab/datasets/amazon2014/books/positive.review
1510847
kitchen_&_housewares
negative.review
/content/drive/My Drive/colab/datasets/amazon2014/kitchen_&_housewares/negative.review
1019913
unlabeled.review
positive.review
/content/drive/My Drive/colab/datasets/amazon2014/kitchen_&_housewares/positive.review
1018947
electronics
positive.review
/content/drive/My Drive/colab/datasets/amazon2014/electronics/positive.review
1104976
negative.review
/content/drive/My Drive/colab/datasets/amazon2014/electronics/negative.review
1113470
unlabeled.review


In [46]:
len(text)

10590817

Sample unprocessed data piece

In [54]:
text[:1600]

'<review>\n<unique_id>\nB00064LJVE:one_of_the_worst_movies_i_have_ever_seen.:fer360\n</unique_id>\n<asin>\nB00064LJVE\n</asin>\n<product_name>\nThe Village (Widescreen Edition) (Vista Series): DVD: Jayne Atkinson,Adrien Brody,Frank Collison,Jesse Eisenberg,Brendan Gleeson,Judy Greer,Charlie Hofheimer,Bryce Dallas Howard,William Hurt,Cherry Jones,John Christopher Jones,Fran Kranz,Joaquin Phoenix,Michael Pitt (II),Pascale Renate Smith,Scott Sowers,Zack Wall,Sigourney Weaver,Celia Weston\n</product_name>\n<product_type>\ndvd\n</product_type>\n<helpful>\n0 of 4\n</helpful>\n<rating>\n1.0\n</rating>\n<title>\nOne of the worst movies I have ever seen.\n</title>\n<date>\nOctober 29, 2006\n</date>\n<reviewer>\nFer360\n</reviewer>\n<reviewer_location>\nNew England, USA\n</reviewer_location>\n<review_text>\nThis entire movie could have run in only 20 minutes and you wouldn\'t miss anything and might even enjoy it. Unfortunately it ran 88 minutes too long and I couldn\'t wait for it to end.  I sa

# Get all tag names

Reviews are separated by `<review>` tags. Other tags demarcate different parameters of the review entry. We want data in DataFrame format for convenient analysis. For that let's first split the unprocessed data into a list of unprocessed reviews and then extract values of all parameters which we can further use to create a DataFrame.

In [55]:
b = re.findall('<([\w]*)>',text, flags=re.DOTALL)
tag_names_small = list(set(b) - {'review'})
tag_names = list(set(b))
tag_names

['helpful',
 'reviewer_location',
 'review_text',
 'product_name',
 'review',
 'product_type',
 'unique_id',
 'title',
 'date',
 'rating',
 'reviewer',
 'asin']

In [56]:
tags_start = list(map(lambda x : '<'+x+'>', tag_names_small))
tags_end = list(map(lambda x : '</'+x+'>', tag_names_small))
merged = list(zip(tags_start, tags_end))
merged

[('<helpful>', '</helpful>'),
 ('<reviewer_location>', '</reviewer_location>'),
 ('<review_text>', '</review_text>'),
 ('<product_name>', '</product_name>'),
 ('<product_type>', '</product_type>'),
 ('<unique_id>', '</unique_id>'),
 ('<title>', '</title>'),
 ('<date>', '</date>'),
 ('<rating>', '</rating>'),
 ('<reviewer>', '</reviewer>'),
 ('<asin>', '</asin>')]

In [57]:
unproccessed_entries = re.findall('<review>\\n(.*?)\\n</review>\\n', text, flags=re.DOTALL)

Sample unprocessed review.

In [58]:
unproccessed_entries[-1000]

'<unique_id>\nB00005UKBG:bad:j._brodeur_"disgusted_consumer"\n</unique_id>\n<asin>\nB00005UKBG\n</asin>\n<product_name>\nAtlantic 1316 CD Storage Case (110-Capacity, Wave): Electronics\n</product_name>\n<product_type>\nelectronics\n</product_type>\n<helpful>\n15 of 16\n</helpful>\n<rating>\n2.0\n</rating>\n<title>\nbad\n</title>\n<date>\nMay 4, 2005\n</date>\n<reviewer>\nJ. Brodeur "disgusted consumer"\n</reviewer>\n<reviewer_location>\n\n</reviewer_location>\n<review_text>\ncons\ntips extremely easy on carpet and if you have a lot of cds stacked at the top\n\npoorly designed, it is a vertical cd rack that doesnt have individual slots for cds, so if you want a cd from the bottom of a stack you have basically pull the whole stack to get to it\n\nputting it together was a pain, the one i bought i had to break a piece of metal just to fit it in its guide holes.\n\nagain..poorly designed... doesnt even fit cds that well, there are gaps, and the cd casses are loose fitting\n\npros\n........

# Create DataFrame from dict

In [59]:
# given the tags extract corresponding values from each entry
def col_entries(tag1, tag2, t):
    return re.findall(tag1+'\\n(.*?)\\n'+tag2, text, flags=re.DOTALL)

In [61]:
# create dictionary
d = {k:col_entries(tag_start, tag_end, text) for k, (tag_start, tag_end)  in zip(tag_names_small,merged)}
for i in tag_names_small:
  print(i, ': ',d[i][1])

helpful :  2 of 25
reviewer_location :  Acidville, CA
review_text :  If you are looking for a good movie to buy for your child, pass on this one. This movie has so many drug references, i can't even begin to explain.(trust me, I just so happen to have taken acid before) This is a movie that NEVER should have been directed toward children. 
  
   If you want your child to be drug free when he/she grows up, do not buy this
product_name :  Alice in Wonderland (Masterpiece Edition): DVD: Kathryn Beaumont,Ed Wynn,Richard Haydn,Sterling Holloway,Jerry Colonna,Verna Felton,J. Pat O'Malley,Bill Thompson,Heather Angel,Joseph Kearns,Larry Grey,Queenie Leonard,Dink Trout,Doris Lloyd,James MacDonald (II),Bill Lee (IV),Thurl Ravenscroft,Max Smith,Bob Hamlin,Don Barclay,Wilfred Jackson,Clyde Geronimi,Hamilton Luske
product_type :  dvd
unique_id :  B0000TG9E2:another_classic,_ruined_by_disney:poopear_"i_eat_$h!7"
title :  Another classic, ruined by Disney
date :  July 1, 2006
rating :  1.0
reviewer :

In [None]:
df = pd.DataFrame(d)

Now let's process our data further by removing exceedingly long reviews (longer than 4000 characters) and dropping irrelevant columns. Let's also cast all words in all reviews to lowercase.

In [None]:
df = df.drop(index=df[df.review_text.str.len()>4000].index,
             columns=['unique_id', 'date', 'reviewer', 'product_name', 'reviewer_location', 'helpful', 'asin'])
df.rename(columns={'review_text':'text'}, inplace=True)
df.rating = df.rating.astype(float)
df.text = df.text.str.lower()
df = df.sample(df.shape[0])

In [None]:
df.head()

Unnamed: 0,text,product_type,title,rating
878,on the surface this film is a pretty good cour...,dvd,Subtexts,2.0
4805,cup leaks out of the top closure part. i need ...,kitchen & housewares,Cup leaks,1.0
6931,this printer was purchased for my wife's birth...,electronics,HP Photosmart 335 Printer,5.0
2846,if you look at the march 2004 issue of the int...,books,Don't buy this book. Check with the experts in...,1.0
5470,"this peeler is amazing, and really inexpensive...",kitchen & housewares,Best Peeler Ever!,5.0


In [None]:
df.rating.value_counts()

5.0    2839
1.0    2405
2.0    1552
4.0    1107
Name: rating, dtype: int64

Here a small simplification is made. The model will be trained to classify reviews as `positive` or `negative` since predicting exact number of stars is a sort of ill-posed problem: number of stars is very subjective and one often cannot exactly say whether a negative review is two or one star, same with positive. For this reason and due to the absence of 3-star reviews we'll assign all entries with 1 and 2 stars a `negative` label and all with 4 or 5 a `positive` label.

# Prepare dataset

In [63]:
df_working = df.copy()

In [64]:
df_working['mood'] = (df_working.rating > 2).astype(int)


In [65]:
df_working.sample(1)

Unnamed: 0,text,product_type,title,rating,mood
111,"saw this as an inflight movie. boy, did i pray...",dvd,Praying for engine trouble!,2.0,0


# Create vocabulary

Here I want to use `word_tokenize()` from NLTK library as tokenizer in `TokenTextEncoder` since it's a better tokenizer than the standard one in TensorFlow. However `TokenTextEncoder` only accepts objects with `tokenize()` function, meaning I need to wrap `nltk.word_tokenize()` with a class.


In [66]:
# class-wrapper for tokenize() function.
class CustomTokenizer():
  
  def tokenize(self, text):
    return nltk.word_tokenize(text)

ct = CustomTokenizer()

Create vocabulary from all texts and use it to train `TokenTextEncoder`.


In [67]:
counter = Counter()

df_working.text.apply(lambda x: counter.update(nltk.word_tokenize(x)))

encoder = tfds.features.text.TokenTextEncoder(list(counter.keys()), tokenizer=ct)


In [69]:
words = pd.DataFrame({'word':list(counter.keys()),'n':list(counter.values())})

Create a vocabulary of words with less than five occurences.

In [70]:
rare = set(words[words.n < 5].word.to_numpy())
rare_enc = set([encoder.encode(rare_word)[0] for rare_word in rare])


Set to zeros encoding values of least frequent words in the internal dictionary of `TokenTextEncoder`.


In [71]:
for least_frequent in rare:
  encoder._token_to_id[least_frequent] = 0 

Let's create a dataset from strings of text and then transform them into arrays of separately encoded words with `Dataset.map()` function. Since this is graph execution one might want to use `tf.py_function()` to wrap the function that does the actual transformation.

In [74]:
def g(target,label):
  return encoder.encode(target.numpy()),tf.cast(label,tf.int64)

In [75]:
def f(target,label):
  res = tf.py_function(g, inp = [target,label], Tout=(tf.int64,tf.int64))
  return res

In [76]:
dataset_orig = tf.data.Dataset.from_tensor_slices((df_working.text, df_working.mood))

Apply the transformation, shuffle and pad the samples.

In [77]:
dataset = dataset_orig.map(f).shuffle(df.shape[0]).padded_batch(batch_size=8, padded_shapes=([None], []))

In [78]:
for i in dataset.take(2):
  print(i)

(<tf.Tensor: shape=(8, 243), dtype=int64, numpy=
array([[   2,  446,   77, ...,    0,    0,    0],
       [ 135,    6, 1223, ...,  107,  289,   12],
       [  71,   19,  114, ...,    0,    0,    0],
       ...,
       [ 616,  135,    6, ...,    0,    0,    0],
       [  71,  486, 1974, ...,    0,    0,    0],
       [ 202,   92,  794, ...,    0,    0,    0]])>, <tf.Tensor: shape=(8,), dtype=int64, numpy=array([1, 0, 1, 0, 1, 0, 1, 1])>)
(<tf.Tensor: shape=(8, 288), dtype=int64, numpy=
array([[1704,   21,    9, ...,    0,    0,    0],
       [ 616,    1,    1, ...,   12, 9469,    1],
       [ 369,    7, 2380, ...,    0,    0,    0],
       ...,
       [  71,  114,  445, ...,    0,    0,    0],
       [8980, 1487,   14, ...,    0,    0,    0],
       [ 428,  422,   34, ...,    0,    0,    0]])>, <tf.Tensor: shape=(8,), dtype=int64, numpy=array([1, 1, 1, 0, 0, 0, 0, 0])>)


In [79]:
N = dataset.cardinality().numpy()
N

988

Split dataset into training, test and validation sets as 80 / 10 / 10.

In [84]:
train,test = dataset.skip(N//5),dataset.take(N//5)

In [85]:
valid, test = test.take(N//10), test.skip(N//10)

In [87]:
train.cardinality().numpy() + test.cardinality().numpy() + valid.cardinality().numpy()

988

# Model

In [80]:
chkppts_path = '/content/drive/My Drive/colab/chkpts'
mc_callback = tf.keras.callbacks.ModelCheckpoint(chkppts_path, save_best_only=True)

In [81]:
dim = 64

In [82]:
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(len(counter.items())+1, dim))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(dim)))
model.add(tf.keras.layers.Dense(dim))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

In [83]:
model.compile(optimizer='Adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [88]:
model.fit(train, epochs=5, validation_data=valid, callbacks=mc_callback)

Epoch 1/5
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: /content/drive/My Drive/colab/chkpts/assets
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f76706732b0>

In [89]:
model.evaluate(test)



[0.0908358246088028, 0.9734848737716675]

The validation accuracy is already high. Let's try to train model for few more epochs. 

In [91]:
model.fit(train, epochs=3, validation_data=valid, callbacks=mc_callback)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f765b5cd668>

In [92]:
model.evaluate(test)



[0.055553827434778214, 0.9810606241226196]

An increase of roughly 0.01 after 3 epochs is a fairly small one. Apparently the model starts to overfit so I'd better stop here. 

In [None]:
model.save(chkppts_path)