# Environment Set Up

In [1]:
# Install TensorFlow pinned to version 1.14.
# NOTE(review): `!pip` targets whichever pip is first on the shell PATH, which may not be the kernel's environment - confirm.
!pip install 'tensorflow==1.14'

In [3]:
from datetime import date
import datetime
import time
import collections 
import itertools
import re
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline

from keras import layers
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import LSTM, Embedding, Dense
from keras.models import Sequential

# Global Parameters

In [3]:
# Number of columns of the embedding matrix (the GloVe file loaded later is the 100d one)
embedding_dim = 100
# Length every tokenised review is padded to
maxlen = 200

# Read and Preprocess Data

After reading the data we will perform a few pre-processing steps:

1. Remove the products with 100 or fewer reviews. This is a simple heuristic to both reduce the training set size and make sure that we have enough data to "synthesize" a super-review.
2. Make sure that not too many words are missing from the pre-trained embedding.

In [6]:
# Location of the tab-separated Amazon reviews file
path = 'Data'
fname = 'amazon_reviews_us_Musical_Instruments_v1_00.tsv'

# Malformed rows are skipped (error_bad_lines = False) but still reported
# (warn_bad_lines = True) instead of aborting the load.
df = pd.read_csv(os.path.join(path, fname), sep = '\t', error_bad_lines = False, warn_bad_lines = True)

In [5]:
# Keep only the products that have more than `reviews_threshold` reviews
reviews_threshold = 100
print("We started with {} unique ids".format(len(df['product_id'].unique())))

# One row per product with its number of (non-null) review ids
pdist = (
    df[['product_id', 'review_id']]
    .groupby('product_id', as_index = False)
    .count()
)
enough_reviews = pdist['review_id'] > reviews_threshold
prods_to_keep = pdist.loc[enough_reviews, 'product_id'].unique()

# Restrict the reviews to the retained products
df = df[df['product_id'].isin(prods_to_keep)]
print("After cleansing we are left with {} products".format(len(df['product_id'].unique())))
print("Total dataset size is now {} rows".format(df.shape[0]))

We started with 123284 unique ids
After cleansing we are left with 1105 products
Total dataset size is now 286055 rows


In [6]:
def create_text_and_labels(df):
  """Build parallel lists of review text, sentiment label and identifiers.

  Arguments:
    df: pandas dataframe with the columns `review_body`, `review_id`,
      `product_id` and `star_rating`

  Return:
    texts: list with every review body converted to `str`
    labels: list of ints, 0 when `star_rating` is 1, 2 or 3 and 1 otherwise
    ids: list of review ids
    prod_ids: list of product ids

  All four lists hold one entry per row of `df`, in row order.
  """
  texts = []
  labels = []
  ids = []
  prod_ids = []

  for row in df.itertuples():
    texts.append(str(row.review_body))
    ids.append(row.review_id)
    prod_ids.append(row.product_id)
    # NOTE(review): anything that is not 1, 2 or 3 (including a missing
    # rating) is labelled 1.
    labels.append(0 if row.star_rating in (1, 2, 3) else 1)

  # Every output must stay aligned row by row, product ids included
  assert len(texts) == len(labels) == len(ids) == len(prod_ids)

  return texts, labels, ids, prod_ids

# Create the text, label, review-id and product-id lists from the filtered reviews
texts, labels, ids, prod_ids = create_text_and_labels(df)

In [7]:
# Read in contractions, needed to clean up text
# NOTE(review): pickle.load can execute arbitrary code - only load this file from a trusted source.
outpath = 'Data'
with open(os.path.join(outpath, 'contractions.pickle'), "rb") as f:
  contractions = pickle.load(f)

def clean_text(text, contractions, remove_stopwords = True):
    '''Normalise a review before tokenisation.

    The text is lower-cased, contractions are expanded, URLs, HTML remnants
    and punctuation are stripped, and stop words are optionally dropped.

    Arguments:
      text: raw review string
      contractions: mapping from a contraction to its expanded form
      remove_stopwords: when True, English stop words are filtered out

    Return:
      the cleaned string with trailing whitespace removed
    '''
    # Lower-case and expand whitespace-delimited contractions; splitting and
    # re-joining also turns every whitespace run into a single space.
    tokens = text.lower().split()
    text = " ".join(contractions[tok] if tok in contractions else tok for tok in tokens)

    # (pattern, replacement, flags), applied in this exact order. The
    # punctuation class replaces '/' before the '<br />' rule runs, which is
    # why the '<br  >' variant (two spaces) is also listed.
    substitutions = (
        (r'https?:\/\/.*[\r\n]*', '', re.MULTILINE),
        (r'\<a href', ' ', 0),
        (r'&amp;', '', 0),
        (r'[_"\-;%()|+&=*%,!?:#$@\[\]/]', ' ', 0),
        (r'<br />', ' ', 0),
        (r'<br  >', ' ', 0),
        (r'\'', ' ', 0),
    )
    for pattern, replacement, flags in substitutions:
        text = re.sub(pattern, replacement, text, flags=flags)

    if remove_stopwords:
        # NOTE(review): `stopwords` is not among the imports visible in this
        # notebook; unless it is defined elsewhere this branch raises
        # NameError. Presumably nltk.corpus.stopwords - confirm.
        stops = set(stopwords.words("english"))
        text = " ".join(w for w in text.split() if w not in stops)

    return text.rstrip()

# Clean every review with clean_text; stop words are kept (remove_stopwords = False)
cleaned_text = [clean_text(t, contractions, remove_stopwords = False) for t in texts]

In [8]:
%%time
# Tokenize reviews, using only a limited amount of words.
# NOTE: the tokenizer ignores words that are not in the vocabulary, without putting a placeholder
max_words = 100000

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(cleaned_text)
sequences = tokenizer.texts_to_sequences(cleaned_text)
# Preview the first tokens of a few reviews instead of echoing every sequence
[seq[:10] for seq in sequences[:3]]

CPU times: user 22.8 s, sys: 26.9 ms, total: 22.9 s
Wall time: 22.9 s


In [10]:
%%time

def get_glove_embedding(path):
  """Load word vectors from a whitespace-separated embedding text file.

  Each non-empty line holds a word followed by its coefficients.

  Arguments:
    path: path of the embedding text file

  Return:
    embedding_index: dict mapping each word to a float32 numpy vector
  """
  embedding_index = {}

  # Read as UTF-8 explicitly rather than relying on the platform default encoding
  with open(path, encoding='utf-8') as f:
    for line in f:
      values = line.split()
      if not values:
        # Blank line (e.g. at the end of the file): nothing to parse
        continue
      # Parse the coefficients as numbers instead of keeping them as strings
      embedding_index[values[0]] = np.array(values[1:], dtype='float32')

  return embedding_index

# Import Glove embedding
# The "100d" file is used, in line with embedding_dim = 100
path = 'Data'
glove_path = 'glove.6B.100d.txt'

# Dict: word -> coefficient vector
embedding_index = get_glove_embedding(os.path.join(path, glove_path))

CPU times: user 7.58 s, sys: 845 ms, total: 8.43 s
Wall time: 8.42 s


In [11]:
word_index = tokenizer.word_index

embedded_words = set(embedding_index.keys())
target_words = set(word_index.keys())

# Words seen by the tokenizer that have no pre-trained vector
words_diff = list(target_words.difference(embedded_words))
print("There ere {} unique words that cannot be found in the embedding".format(len(words_diff)))

# (tokenizer index, corpus frequency) pairs, sorted by tokenizer index
# (presumably most frequent first - Keras Tokenizer convention, confirm)
no_embedding = sorted([(word_index[x], tokenizer.word_counts[x]) for x in words_diff])

# A few words won't have an embedding, let's check the first 20
for ne in no_embedding[:20]:
  print("Word :", tokenizer.index_word[ne[0]], "||  Frequency: ", ne[1])

# TODO - we can think at manually cleaning misspelled words like 'recomend', 'excelent', 'awsome'
missing_freqs = pd.Series([x[1] for x in no_embedding])
# Only the last expression of a cell is echoed, so print the summary explicitly
print(missing_freqs.describe())
np.percentile(missing_freqs.values, 99)

There ere 37084 unique words that cannot be found in the embedding
Word : m50s ||  Frequency:  890
Word : m50x ||  Frequency:  871
Word : h4n ||  Frequency:  773
Word : fiio ||  Frequency:  663
Word : sennheisers ||  Frequency:  635
Word : 7506 ||  Frequency:  563
Word : recomend ||  Frequency:  536
Word : sm58 ||  Frequency:  511
Word : at2020 ||  Frequency:  490
Word : earpads ||  Frequency:  444
Word : beyerdynamic ||  Frequency:  432
Word : lavalier ||  Frequency:  427
Word : mxl ||  Frequency:  406
Word : videoid ||  Frequency:  375
Word : shockmount ||  Frequency:  359
Word : hitlights ||  Frequency:  344
Word : hd650 ||  Frequency:  337
Word : ad700 ||  Frequency:  335
Word : excelent ||  Frequency:  334
Word : ukes ||  Frequency:  334


38.0

In [12]:
# Create Embedding Matrix: row i holds the pre-trained vector of the word
# whose tokenizer index is i; words without a vector keep an all-zero row.

embedding_dim = 100
# One row more than the vocabulary size
# (presumably because tokenizer indices start at 1 - confirm)
max_words = len(word_index.keys())+1

embedding_matrix = np.zeros((max_words, embedding_dim))

for word, i in word_index.items():
  if i >= max_words:
    continue

  embedding_vector = embedding_index.get(word)
  if embedding_vector is None:
    continue

  embedding_matrix[i] = embedding_vector

In [13]:
# Look at the distribution of review lengths (in tokens) to choose the padding length

length_dist = list(map(len, sequences))
print(max(length_dist))
print("95% of the data have at most {} words".format(np.percentile(length_dist, 95)))

4491
95% of the data have at most 195.0 words


In [14]:
%%time
# Target length, chosen from the length check above (95th percentile of 195 tokens)
maxlen = 200
# Pad data to normalize sequences to be 200 words (captures 95% of reviews)
# NOTE(review): longer reviews are presumably truncated by pad_sequences - confirm which end is dropped.
data = pad_sequences(sequences, maxlen = maxlen)

CPU times: user 2.06 s, sys: 11.8 ms, total: 2.07 s
Wall time: 2.07 s


# Model Training

In [15]:
# Model hyper-parameters; same values as assigned earlier in the notebook
embedding_dim = 100
maxlen = 200

In [16]:
# Randomly split data based on product ids, so that all the reviews of a
# product end up on the same side of the split
# We will use reviews of 70% of the eligible products to build the model
# The rest to evaluate and test shap explainer
np.random.seed(55)
train_prods = np.random.choice(prods_to_keep, size = int(len(prods_to_keep)*0.7), replace = False)
test_prods = list(set(prods_to_keep).difference(set(train_prods)))
assert len(train_prods) + len(test_prods) == len(prods_to_keep)

# Membership is tested against a set: `in` on the numpy array would scan the
# whole array for every review. dtype=int keeps the index arrays usable for
# indexing even if one side turns out empty.
train_prod_set = set(train_prods)
train_indices = np.array([idx for idx, p in enumerate(prod_ids) if p in train_prod_set], dtype=int)
test_indices = np.array([idx for idx, p in enumerate(prod_ids) if p not in train_prod_set], dtype=int)
assert len(train_indices) + len(test_indices) == len(prod_ids)

x_train = data[train_indices]
y_train = np.array(labels)[train_indices]
ids_train = np.array(ids)[train_indices]
prod_ids_train = np.array(prod_ids)[train_indices]

x_test =  data[test_indices]
y_test =  np.array(labels)[test_indices]
ids_test = np.array(ids)[test_indices]
prod_ids_test = np.array(prod_ids)[test_indices]

In [16]:
# Save the train / test numpy arrays and the fitted tokenizer under `outpath`
outpath = 'Data'

arrays_to_save = [
    ('x_train', x_train),
    ('y_train', y_train),
    ('ids_train', ids_train),
    ('prod_ids_train', prod_ids_train),
    ('x_test', x_test),
    ('y_test', y_test),
    ('ids_test', ids_test),
    ('prod_ids_test', prod_ids_test),
]
for array_name, array in arrays_to_save:
  np.save(os.path.join(outpath, array_name), array)

# The tokenizer is pickled under the name 'dictionary.pickle'
with open(os.path.join(outpath, 'dictionary.pickle'), "wb") as output_file:
  pickle.dump(tokenizer, output_file)

In [17]:
# Define Model

def lstm_model(max_words, embedding_dim, input_length):
  """Build (without compiling) the LSTM classifier and print its summary.

  Arguments:
    max_words: rows of the embedding matrix
    embedding_dim: size of the embedding space
    input_length: length of the input sequences

  Return:
    model: Sequential model made of Embedding -> LSTM(100) -> Dense(32, relu)
      -> Dense(1, sigmoid)
  """
  stack = [
      Embedding(max_words, embedding_dim, input_length=input_length),
      LSTM(100, return_sequences = False),
      Dense(32, activation='relu'),
      Dense(1, activation='sigmoid'),
  ]

  model = Sequential()
  for layer in stack:
    model.add(layer)

  model.summary()

  return model

In [4]:
# By default we set the embedding matrix not to be trainable
model = lstm_model(max_words, embedding_dim, maxlen)

# Load the pre-trained vectors into the embedding layer and freeze it before compiling
model.layers[0].set_weights([embedding_matrix])
model.layers[0].trainable = False
model.summary()

model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['acc'])

# checkpoint: a file is written after every epoch (save_best_only=False)
filepath="Models/weights-lstm-{epoch:02d}.hdf5"
# Make sure the target directory exists; ModelCheckpoint may not create it,
# in which case training would fail when the first checkpoint is written
os.makedirs(os.path.dirname(filepath), exist_ok=True)
checkpoint = ModelCheckpoint(filepath,  verbose=1, save_best_only=False, mode='max')

# Tensorboard logs, one timestamped directory per run
log_dir="Logs/lstm" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)

callbacks_list = [checkpoint, tensorboard_callback]

# NOTE(review): the held-out products (x_test, y_test) serve as validation
# data here, so they are not an untouched test set.
history = model.fit(x_train, y_train,
                    epochs=10,
                    batch_size=128,
                    callbacks=callbacks_list,
                    validation_data=(x_test, y_test))

# Train a simpler fully connected model to use as a benchmark

In [17]:
def fully_connected_model(max_words, embedding_dim, input_length):
    """First model to benchmark is a fully connected model

      Arguments:
        max_words: rows of the embedding matrix
        embedding_dim: size of the embedding space
        input_length: length of the input sequences

      Return:
        model: uncompiled Sequential model made of Embedding -> Flatten ->
          Dense(100, relu) -> Dense(32, tanh) -> Dense(1, sigmoid)
    """
    model = Sequential()
    # Use the `input_length` argument rather than the notebook-level `maxlen`,
    # so the function honours what the caller passes in
    model.add(layers.Embedding(max_words, embedding_dim, input_length=input_length))
    model.add(layers.Flatten())
    model.add(layers.Dense(100, activation='relu'))
    model.add(layers.Dense(32, activation='tanh'))
    model.add(layers.Dense(1, activation='sigmoid'))

    return model

In [5]:
# By default we set the embedding matrix not to be trainable
# NOTE(review): `model` and `history` are reassigned here, replacing the LSTM ones from the previous training cell.
model = fully_connected_model(max_words, embedding_dim, maxlen)

# Load the pre-trained vectors into the embedding layer and freeze it before compiling
model.layers[0].set_weights([embedding_matrix])
model.layers[0].trainable = False
model.summary()

model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['acc'])

# checkpoint
# NOTE(review): `checkpoint` is created but is not part of callbacks_list below,
# so no weights are saved for this model - confirm this is intended.
filepath = "Models/weights-fullyconnected-{epoch:02d}.hdf5"
checkpoint = ModelCheckpoint(filepath,  verbose=1, save_best_only=False, mode='max')

# Tensorboard logs, one timestamped directory per run
log_dir = "Logs/fullyconnected" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)

callbacks_list = [tensorboard_callback]

# The held-out products (x_test, y_test) are used as validation data
history = model.fit(x_train, 
                    y_train,
                    epochs=10,
                    batch_size=128,
                    callbacks=callbacks_list,
                    validation_data=(x_test, y_test))