# Predicting Star Ratings Based on Amazon Review Comments of Digital Music Albums

For this project, I will be analyzing Amazon user reviews of digital music. My goal is to predict the star-rating a user will give a product based on the review comment they give. Since the star rating prediction is only based on a text review, it serves as a proxy for building a sentiment classifier – assuming that star rating is truly representative of a review’s sentiment.

## Data Loading and Preliminary Exploration

Import libraries for data processing

In [47]:
import os #used to manage directories
import json #for reading JSON file
import pandas as pd # for data cleaning
import nltk #used for text preprocessing
import numpy as np #for data scrubbing


In [48]:
import joblib

# Work from the folder that holds the saved training artifacts.
# The path is a raw string so the Windows backslashes stay literal: "\M", "\A"
# and "\p" are not valid escape sequences and newer Python versions warn on them.
os.chdir(r'C:\My_Files\Analytics\Amazon_reviews\pickles')
# NOTE(review): joblib.load unpickles arbitrary objects -- only load files you produced yourself.
nn_bow = joblib.load("nn_bow.pkl")

In [49]:
# Hand-written negative review that the following cells tokenize and score.
demo_data = 'This is the worst album I have ever bought. It is terrible'

In [50]:
from nltk import word_tokenize
# Split the demo review into a flat list of word and punctuation tokens.
token = word_tokenize(demo_data)

In [51]:
from nltk.corpus import stopwords

stoplist = stopwords.words('english')

def removeStopWords(tokens):
    """Return the tokens that do not appear in the English stop-word list.

    Args:
        tokens: iterable of word tokens to filter.

    Returns:
        A new list with, in their original order, the tokens not found in
        ``stoplist``.

    Note:
        The membership test is case-sensitive, so capitalised tokens such as
        'This' are kept unless the caller lowercases the tokens first.
    """
    # Filter the argument itself: the previous body iterated the notebook-level
    # ``token`` variable and ignored whatever the caller passed in.
    return [word for word in tokens if word not in stoplist]

# NOTE(review): ``token`` is a flat list of words, so each ``sen`` below is a
# single word rather than a sentence. This intermediate result is replaced by
# the direct call further down and is not used by any later cell shown here.
filtered_words = [removeStopWords(sen) for sen in token]

clean_token = filtered_words #join to dataframe

# sample['Text_Final'] = [' '.join(sen) for sen in filtered_words]
# sample['tokens'] = filtered_words


# Filter the demo tokens in a single call; this is the value displayed below.
clean_token = removeStopWords(token)
clean_token

['This', 'worst', 'album', 'I', 'ever', 'bought', '.', 'It', 'terrible']

In [52]:
import re, string, unicodedata
# import contractions
import inflect
from nltk.corpus import stopwords

def remove_non_ascii(words):
    """Strip every non-ASCII character from each token.

    Each token is NFKD-normalised first, so an accented letter decomposes
    and its ASCII base letter survives. Characters with no ASCII form are
    dropped. Returns a new list of the same length and order.
    """
    def _ascii_only(text):
        decomposed = unicodedata.normalize('NFKD', text)
        return decomposed.encode('ascii', 'ignore').decode('utf-8', 'ignore')

    return [_ascii_only(item) for item in words]
    

def to_lowercase(words):
    """Return a new list holding the lowercase form of every token."""
    return [item.lower() for item in words]

#remove all punctuation
def remove_punctuation(words):
    """Delete punctuation from each token and drop tokens left empty.

    Every character that is neither a word character nor whitespace is
    removed; a token that becomes the empty string is omitted from the result.
    """
    stripped = (re.sub(r'[^\w\s]', '', item) for item in words)
    return [item for item in stripped if item != '']

def replace_numbers(words):
    """Replace every all-digit token with its textual representation.

    Tokens for which ``str.isdigit()`` is false are passed through unchanged.
    Returns a new list of the same length and order.
    """
    engine = inflect.engine()
    return [
        engine.number_to_words(item) if item.isdigit() else item
        for item in words
    ]


def normalize(words):
    """Run the token clean-up steps in a fixed order and return the result.

    Order: strip non-ASCII characters, lowercase, remove punctuation,
    spell out numbers.
    """
    pipeline = (remove_non_ascii, to_lowercase, remove_punctuation, replace_numbers)
    for step in pipeline:
        words = step(words)
    return words


In [53]:
# Apply the clean-up pipeline to the stop-word-filtered demo tokens.
normalized = normalize(clean_token)
normalized

['this', 'worst', 'album', 'i', 'ever', 'bought', 'it', 'terrible']

In [54]:
from sklearn.feature_extraction.text import CountVectorizer

# NOTE(review): the vectorizer is fitted on ``clean_token`` (the un-normalised
# token list), so each token is handled as its own document and the vocabulary
# comes from this single review. Confirm whether ``normalized`` and the
# vectorizer fitted on the training sentences were intended instead.
vectorizer = CountVectorizer()
vectorizer.fit(clean_token)

# X_train = vectorizer.transform(sentences_train)
# X_test  = vectorizer.transform(sentences_test)
# Sparse count matrix with one row per element of clean_token.
demo = vectorizer.transform(clean_token)
print(demo)

  (0, 5)	1
  (1, 6)	1
  (2, 0)	1
  (4, 2)	1
  (5, 1)	1
  (7, 3)	1
  (8, 4)	1


In [55]:
demo.shape[1]


7

In [44]:
from keras.models import Sequential
# from keras.layers import Dense
# from keras.layers import Dropout
from keras import layers


input_dim = demo.shape[1]  # Number of features


# Minimal network: one hidden unit followed by a 5-way softmax (one output per star rating).
# NOTE(review): no fit() call appears between this cell and the predict() call
# below, and the loaded ``nn_bow`` object is not used -- confirm that scoring
# with an untrained model is intended for this demo.
model = Sequential()
model.add(layers.Dense(1, input_dim=input_dim, activation='relu'))
# model.add(layers.Dropout(0.4))
model.add(layers.Dense(5, activation='softmax'))

model.compile(loss='categorical_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy'])

model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_7 (Dense)              (None, 1)                 8         
_________________________________________________________________
dense_8 (Dense)              (None, 5)                 10        
Total params: 18
Trainable params: 18
Non-trainable params: 0
_________________________________________________________________


In [57]:
# One row of 5 softmax scores per row of ``demo``.
prediction_weights = model.predict(demo, batch_size = 64, verbose = 1)
prediction_weights



array([[0.18077333, 0.25869703, 0.17318615, 0.16586444, 0.22147907],
       [0.2       , 0.2       , 0.2       , 0.2       , 0.2       ],
       [0.18873821, 0.23471676, 0.18387929, 0.17911066, 0.21355502],
       [0.2       , 0.2       , 0.2       , 0.2       , 0.2       ],
       [0.2       , 0.2       , 0.2       , 0.2       , 0.2       ],
       [0.19146639, 0.22641207, 0.18766475, 0.18391109, 0.21054572],
       [0.2       , 0.2       , 0.2       , 0.2       , 0.2       ],
       [0.2       , 0.2       , 0.2       , 0.2       , 0.2       ],
       [0.2       , 0.2       , 0.2       , 0.2       , 0.2       ]],
      dtype=float32)

In [58]:
# Index of the highest-scoring class in each row (ties resolve to the first index).
y_pred = np.argmax(prediction_weights, axis = 1)
y_pred


array([1, 0, 1, 0, 0, 1, 0, 0, 0], dtype=int64)

## Predictive Models for Star Rating

### Bag of Words Method

With the BOW method, a matrix of vectors is created that represents the frequency of each word. Each vector corresponds to a review, and the width of each vector is the number of all distinct words in the entire corpus across all records. 

#### Testing on SVM Model

Split training and test data

In [None]:
print(sample['tokens'][1506350])

In [None]:
from sklearn.model_selection import train_test_split

# sentences = sample['tokens'].values
# Each review's token list is joined back into one space-separated string,
# the input format CountVectorizer is given further down.
sentences = sample['tokens'].str.join(' ') ## join values into string
y = sample['overall'].values
# y = dummy_y

# Hold out 25% of the reviews for testing; the fixed seed makes the split repeatable.
sentences_train, sentences_test, y_train, y_test = train_test_split(
   sentences, y, test_size=0.25, random_state=1000)

In [None]:
sentences_train[1506350]

Vectorize the training and test data

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the training sentences only, then reuse it for both
# splits so the test matrix has the same columns as the training matrix.
vectorizer = CountVectorizer()
vectorizer.fit(sentences_train)

X_train = vectorizer.transform(sentences_train)
X_test  = vectorizer.transform(sentences_test)



### Testing on Neural Net

For labeled data, encode the possible star values 1-5 into columns

In [None]:
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils

# Map each star rating to an integer class id, then expand the ids into
# one-hot columns so every rating gets its own target column.
Y = sample['overall'].values
encoder = LabelEncoder()
encoded_Y = encoder.fit_transform(Y)
dummy_y = np_utils.to_categorical(encoded_Y)
dummy_y

In [None]:
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils

# Same label encoding as the previous cell, repeated so this cell can run on its own.
Y = sample['overall'].values
encoder = LabelEncoder()
encoder.fit(Y)
encoded_Y = encoder.transform(Y)
dummy_y = np_utils.to_categorical(encoded_Y)
dummy_y

from sklearn.model_selection import train_test_split

# sentences = sample['Text_Final'].values
sentences = sample['tokens'].str.join(' ') ## join values into string
#y = sample['overall'].values
y = dummy_y # use encoded labels for y

# 75/25 split with a fixed seed; y_train / y_test are now one-hot matrices.
sentences_train, sentences_test, y_train, y_test = train_test_split(
   sentences, y, test_size=0.25, random_state=1000)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Fit the bag-of-words vocabulary on the training sentences only and apply it to both splits.
vectorizer = CountVectorizer()
vectorizer.fit(sentences_train)

X_train = vectorizer.transform(sentences_train)
X_test  = vectorizer.transform(sentences_test)
X_train

First, I built a sequential model

In [None]:
from keras.models import Sequential
# from keras.layers import Dense
# from keras.layers import Dropout
from keras import layers


input_dim = X_train.shape[1]  # Number of features


# Bag-of-words classifier: 10 hidden ReLU units, 40% dropout, 5-way softmax output.
model = Sequential()
model.add(layers.Dense(10, input_dim=input_dim, activation='relu'))
model.add(layers.Dropout(0.4))
model.add(layers.Dense(5, activation='softmax'))

# categorical_crossentropy matches the one-hot encoded targets built earlier.
model.compile(loss='categorical_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy'])

model.summary()

Train Model

In [None]:
from datetime import datetime  # needed for the timing below; no earlier cell shown imports it

start_time = datetime.now()

# Train the bag-of-words network. The value returned by fit() is kept because
# plot_history later reads its per-epoch ``history`` mapping.
# NOTE(review): the test split doubles as validation data here.
nn_bow = model.fit(X_train, y_train,
                   epochs=20,
                   verbose=False,
                   validation_data=(X_test, y_test),
                   batch_size=10)

end_time = datetime.now()
print('Duration: {}'.format(end_time - start_time)) #print out how long it took for model to train

Save as pickle

In [None]:
# Raw string: keeps the Windows backslashes literal ("\M", "\A", "\p" are not
# valid escape sequences and newer Python versions warn on them).
os.chdir(r'C:\My_Files\Analytics\Amazon_reviews\pickles')


# Persist the object returned by model.fit for the bag-of-words network.
joblib.dump(nn_bow, "nn_bow.pkl")

Load pickle

In [None]:
# os.chdir('C:\My_Files\Analytics\Amazon_reviews\pickles')
# nn_bow = joblib.load("nn_bow.pkl")

In [None]:
# Accuracy on the data the model was trained on ...
loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
# ... versus the held-out split; ``loss`` and ``accuracy`` are overwritten by this second call.
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

In [None]:
import matplotlib.pyplot as plt
plt.style.use('ggplot')

def plot_history(nn_bow):
    """Draw training vs. validation accuracy and loss in two side-by-side panels.

    Args:
        nn_bow: object whose ``history`` mapping provides the 'accuracy',
            'val_accuracy', 'loss' and 'val_loss' series. The x axis runs
            from 1 to the length of the 'accuracy' series.
    """
    record = nn_bow.history
    train_acc, valid_acc = record['accuracy'], record['val_accuracy']
    train_loss, valid_loss = record['loss'], record['val_loss']
    epochs = range(1, len(train_acc) + 1)

    # (subplot position, training series, its label, validation series, its label, title)
    panels = (
        (1, train_acc, 'Training acc', valid_acc, 'Validation acc',
         'Training and validation accuracy'),
        (2, train_loss, 'Training loss', valid_loss, 'Validation loss',
         'Training and validation loss'),
    )

    plt.figure(figsize=(12, 5))
    for position, train_series, train_label, valid_series, valid_label, title in panels:
        plt.subplot(1, 2, position)
        plt.plot(epochs, train_series, 'b', label=train_label)
        plt.plot(epochs, valid_series, 'r', label=valid_label)
        plt.title(title)
        plt.legend()


In [None]:
plot_history(nn_bow)

Alternative baseline model: https://machinelearningmastery.com/multi-class-classification-tutorial-keras-deep-learning-library/

In [None]:
# Same evaluation as above, kept in separately named variables so both the
# training and the testing figures stay available afterwards.
loss_trn, accuracy_trn = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy_trn))
loss_test, accuracy_test = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy_test))

In [None]:
import seaborn as sns
from sklearn.metrics import confusion_matrix  # used below; no earlier cell shown imports it
# clf = SVC(kernel = 'linear').fit(x_train,y_train)
# clf.predict(x_train)
# y_pred = clf.predict(x_test)

# Score the test split with the current model here, so this cell does not rely
# on a ``predictions`` variable that is only assigned in a later cell.
predictions = model.predict(X_test)

# Creates a confusion matrix
# (argmax turns the one-hot labels and the softmax scores back into class ids)
cm = confusion_matrix(y_test.argmax(axis = 1), predictions.argmax(axis = 1))

# Transform to df for easier plotting
cm_df = pd.DataFrame(cm,
                     index = ['1', '2', '3', '4', '5'],
                     columns = ['1', '2', '3', '4', '5'])

plt.figure(figsize=(5.5,4))
sns.heatmap(cm_df, annot=True, fmt = 'g', cmap = 'Blues')
# plt.title('SVM Linear Kernel \nAccuracy:{0:.3f}'.format(model.evaluate(X_test, y_test, verbose=False)))
# plt.title("Training Accuracy: {:.4f}".format(accuracy_trn) + "\n Testing Accuracy:  {:.4f}".format(accuracy_test))
plt.title("Neural Network")
plt.ylabel('True label')
plt.xlabel('Predicted label')
# plt.ticklabel_format(style = 'plain', axis = 'y', useOffset = False)
plt.show()

loss_trn, accuracy_trn = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy_trn))
loss_test, accuracy_test = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy_test))

https://datascience.stackexchange.com/questions/45165/how-to-get-accuracy-f1-precision-and-recall-for-a-keras-model

In [None]:
from keras import backend as K

def recall_m(y_test, predictions):
    """Return recall (true positives / actual positives) using backend ops.

    Args:
        y_test: ground-truth labels (assumed to be 0/1 encoded -- TODO confirm).
        predictions: predicted scores; products and labels are clipped to
            [0, 1] and rounded before being summed.

    Returns:
        The recall as a backend scalar.
    """
    true_positives = K.sum(K.round(K.clip(y_test * predictions, 0, 1)))
    # Count positives in the labels that were passed in; the previous body
    # read ``y_true``, which is not a parameter of this function.
    possible_positives = K.sum(K.round(K.clip(y_test, 0, 1)))
    # epsilon keeps the division defined when there are no positives at all
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    """Return precision (true positives / predicted positives) using backend ops.

    Args:
        y_true: ground-truth labels (assumed to be 0/1 encoded -- TODO confirm).
        y_pred: predicted scores; values are clipped to [0, 1] and rounded
            before being summed.

    Returns:
        The precision as a backend scalar.
    """
    # Work on the arguments; the previous body ignored them and read the
    # notebook-level ``y_test`` / ``predictions`` variables instead.
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    # epsilon keeps the division defined when nothing is predicted positive
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_test, predictions):
    """Return the F1 score: the harmonic mean of precision_m and recall_m.

    A backend epsilon is added to the denominator so the result stays
    defined when precision and recall are both zero.
    """
    p = precision_m(y_test, predictions)
    r = recall_m(y_test, predictions)
    numerator = p * r
    denominator = p + r + K.epsilon()
    return 2 * (numerator / denominator)

In [None]:
from sklearn.metrics import classification_report

# Collapse the one-hot test labels back to class ids.
y_test_norm = y_test.argmax(axis = 1)
y_test_norm

# Score the test split and keep the highest-scoring class per review.
predictions = model.predict(X_test, batch_size = 64, verbose = 1)
y_pred = np.argmax(predictions, axis = 1)
y_pred

# Per-class precision / recall / F1 from scikit-learn.
print(classification_report(y_test_norm, y_pred))

In [None]:
y_test.argmax(axis = 1)

In [None]:
# Turn each predicted class index into a star rating by shifting it up by one.
pred_stars = [class_id + 1 for class_id in predictions.argmax(axis = 1).tolist()]
pred_stars[0:4]

In [None]:
test_indexes = sentences_test.index
# sample.iloc[test_indexes]
# sample.iloc[[5441],:]

In [None]:
pd.options.display.max_colwidth = 100

# Take an explicit copy of the test rows so the new columns are added to an
# independent frame (avoids pandas' SettingWithCopyWarning and leaves
# ``sample`` untouched).
sample_df_test = sample.loc[test_indexes].copy()
sample_df_test['pred_stars'] = pred_stars

# Absolute distance, in stars, between the predicted and the actual rating.
sample_df_test['diff_pred'] = abs(sample_df_test['pred_stars'] - pd.to_numeric(sample_df_test.overall))

# Largest misses first.
sample_df_test = sample_df_test.sort_values(by = ['diff_pred'], ascending = False)

sample_df_test = sample_df_test.filter(['overall', 'reviewText', 'tokens', 'pred_stars', 'diff_pred'])
sample_df_test.head(10)

In [None]:
sample_df_test.groupby(['diff_pred']).size()

## Word Embedding

There are 3 main ways to embed: 
1. Words represented by each word as a vector
2. Characters represented by each character as a vector
3. N-grams of words/characters represented as a vector (N-grams are overlapping groups of multiple succeeding words/characters in the text)

I will use method #1. Two ways to do this are one-hot encoding and word embeddings.

### One-hot encoding

In [None]:
# Encode the star ratings as integer class ids (fit and transform in one call).
Y = sample['overall'].values
encoder = LabelEncoder()
star_labels = encoder.fit_transform(Y)
star_labels

In [None]:
from sklearn.preprocessing import OneHotEncoder

len_star = len(star_labels)

# NOTE(review): recent scikit-learn releases renamed ``sparse`` to
# ``sparse_output`` -- confirm against the installed version.
encoder = OneHotEncoder(sparse=False)
# Reshape the labels into a single column before encoding.
star_labels = star_labels.reshape((len_star, 1))
encoder.fit_transform(star_labels)

### Using Word Embeddings

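Use Keras' tokenizer to tokenize all words as numeric values. Using the num_words parameter does not have an effect on `tokenizer.word_index`, which still lists every word seen during fitting; it only limits which words `texts_to_sequences` keeps.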

In [None]:
sentences_train.iloc[2]

In [None]:
from keras.preprocessing.text import Tokenizer

# tokenizer = Tokenizer(num_words = None) #don't filter on top N most common words
tokenizer = Tokenizer(num_words = 5000) # filter on top 5000 most common words
tokenizer.fit_on_texts(sentences_train)

# Replace every review by the list of integer ids of its words.
X_train = tokenizer.texts_to_sequences(sentences_train)
X_test = tokenizer.texts_to_sequences(sentences_test)

vocab_size = len(tokenizer.word_index)+1  #adding 1 because of reserved 0 index


maxlen = max(len(listElem.split()) for listElem in sentences) ##find length of review with max number of words

# Positional lookup keeps the printed text aligned with X_train[2]; the
# previous hard-coded index label only exists in one particular random sample.
print(sentences_train.iloc[2]) #print sample review
print(X_train[2]) #print word indexing of sample review
len(X_train[2]) #count number of words in sample review

Find index of sample words

In [None]:
# Show the integer id the tokenizer assigned to a few sentiment words.
# NOTE(review): the lookup fails if one of these words never occurred in the training text.
for word in ['good', 'bad', 'terrible']:
    print('{}: {}'.format(word, tokenizer.word_index[word]))

Find top 10 most frequent words

In [None]:
# Words of word_index in their stored order; presumably most frequent first --
# confirm against the Keras Tokenizer documentation.
tokens_list = list(tokenizer.word_index)
tokens_list[0:10]

### Padding

In [None]:
maxlen = max([len(listElem.split()) for listElem in sentences]) ##find length of review with max number of words

In [None]:
from keras.preprocessing.sequence import pad_sequences

# Bring every sequence to length ``maxlen``; padding is added at the end ('post').
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

print(X_train[0:5, :])

In [None]:
from keras.models import Sequential
from keras import layers
from keras import regularizers

embedding_dim = 50

# Embedding network: learned 50-dimensional word vectors, flattened and fed
# to a 10-unit ReLU layer and a 5-way softmax output.
model = Sequential()
model.add(layers.Embedding(input_dim=vocab_size, 
                           output_dim=embedding_dim, 
                           input_length=maxlen))
model.add(layers.Flatten())
# model.add(layers.Dense(10, activation='relu', kernel_regularizer=regularizers.l2(l=0.1)))
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(5, activation='softmax'))
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

Check: https://github.com/tflearn/tflearn/issues/260

In [None]:
from datetime import datetime  # needed for the timing below; no earlier cell shown imports it

start_time = datetime.now()

# Train the embedding network. The value returned by fit() is kept because
# plot_history later reads its per-epoch ``history`` mapping.
nn_embed = model.fit(X_train, y_train,
                    epochs=20,
                    verbose=False,
                    validation_data=(X_test, y_test),
                    batch_size=10)

end_time = datetime.now()
print('Duration: {}'.format(end_time - start_time)) #print out how long it took for model to train



In [None]:
# Raw string: keeps the Windows backslashes literal ("\M", "\A", "\p" are not
# valid escape sequences and newer Python versions warn on them).
os.chdir(r'C:\My_Files\Analytics\Amazon_reviews\pickles')

# Persist the object returned by model.fit for the embedding network.
joblib.dump(nn_embed, "nn_embed.pkl")

In [None]:
# os.chdir('C:\My_Files\Analytics\Amazon_reviews\pickles')
# nn_embed = joblib.load("nn_embed.pkl")

In [None]:
loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))


In [None]:
plot_history(nn_embed)

In [None]:
import seaborn as sns
from sklearn.metrics import confusion_matrix  # used below; no earlier cell shown imports it
# clf = SVC(kernel = 'linear').fit(x_train,y_train)
# clf.predict(x_train)
# y_pred = clf.predict(x_test)

# Re-score the test split with the current (embedding) model. ``predictions``
# is only refreshed in a later cell, so without this line the matrix would be
# built from the scores of whichever model was evaluated last.
predictions = model.predict(X_test)

# Creates a confusion matrix
cm = confusion_matrix(y_test.argmax(axis = 1), predictions.argmax(axis = 1))

# Transform to df for easier plotting
cm_df = pd.DataFrame(cm,
                     index = ['1', '2', '3', '4', '5'],
                     columns = ['1', '2', '3', '4', '5'])

plt.figure(figsize=(5.5,4))
sns.heatmap(cm_df, annot=True, fmt = 'g', cmap = 'Blues')
# plt.title('SVM Linear Kernel \nAccuracy:{0:.3f}'.format(model.evaluate(X_test, y_test, verbose=False)))
# plt.title("Training Accuracy: {:.4f}".format(accuracy_trn) + "\n Testing Accuracy:  {:.4f}".format(accuracy_test))
plt.title("Neural Network with Word Embedding")
plt.ylabel('True label')
plt.xlabel('Predicted label')
# plt.ticklabel_format(style = 'plain', axis = 'y', useOffset = False)
plt.show()

loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))


In [None]:
from sklearn.metrics import classification_report

y_test_norm = y_test.argmax(axis = 1)
y_test_norm

predictions = model.predict(X_test, batch_size = 64, verbose = 1)
y_pred = np.argmax(predictions, axis = 1)
y_pred

print(classification_report(y_test_norm, y_pred))

In [None]:
# Turn each predicted class index into a star rating by shifting it up by one.
pred_stars = [class_id + 1 for class_id in predictions.argmax(axis = 1).tolist()]
pred_stars[0:4]

In [None]:
test_indexes = sentences_test.index
# sample.iloc[test_indexes]
# sample.iloc[[5441],:]

In [None]:
pd.options.display.max_colwidth = 100

# Take an explicit copy of the test rows so the new columns are added to an
# independent frame (avoids pandas' SettingWithCopyWarning and leaves
# ``sample`` untouched).
sample_df_test = sample.loc[test_indexes].copy()
sample_df_test['pred_stars'] = pred_stars

# Absolute distance, in stars, between the predicted and the actual rating.
sample_df_test['diff_pred'] = abs(sample_df_test['pred_stars'] - pd.to_numeric(sample_df_test.overall))

# Largest misses first.
sample_df_test = sample_df_test.sort_values(by = ['diff_pred'], ascending = False)

sample_df_test = sample_df_test.filter(['overall', 'reviewText', 'tokens', 'pred_stars', 'diff_pred'])
sample_df_test.head(10)

In [None]:
sample_df_test.groupby(['diff_pred']).size()

## CNN

In [None]:
embedding_dim = 100

# CNN: learned 100-dimensional embeddings, 128 convolution filters of width 2,
# global max pooling, then a 10-unit ReLU layer and a 5-way softmax output.
model = Sequential()
model.add(layers.Embedding(vocab_size, embedding_dim, input_length=maxlen))
model.add(layers.Conv1D(128, 2, activation='relu'))
model.add(layers.GlobalMaxPooling1D())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(5, activation='softmax'))
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

In [None]:
from datetime import datetime  # needed for the timing below; no earlier cell shown imports it

start_time = datetime.now()

# Train the CNN. Despite its name, ``cnn_model`` holds the value returned by
# fit() (read later by plot_history); the network itself stays in ``model``.
cnn_model = model.fit(X_train, y_train,
                    epochs=20,
                    verbose=False,
                    validation_data=(X_test, y_test),
                    batch_size=10)

end_time = datetime.now()
print('Duration: {}'.format(end_time - start_time)) #print out how long it took for model to train


In [None]:
# Raw string: keeps the Windows backslashes literal ("\M", "\A", "\p" are not
# valid escape sequences and newer Python versions warn on them).
os.chdir(r'C:\My_Files\Analytics\Amazon_reviews\pickles')

# Persist the object returned by model.fit for the CNN.
joblib.dump(cnn_model, "cnn_model.pkl")

In [None]:
# os.chdir('C:\My_Files\Analytics\Amazon_reviews\pickles')

# cnn_model = joblib.load("cnn_model.pkl")

In [None]:
loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))


In [None]:
plot_history(cnn_model)

In [None]:
y_test[1]

https://stackoverflow.com/questions/50920908/get-confusion-matrix-from-a-keras-multiclass-model

In [None]:
import seaborn as sns
from sklearn.metrics import confusion_matrix  # used below; no earlier cell shown imports it
# clf = SVC(kernel = 'linear').fit(x_train,y_train)
# clf.predict(x_train)
# y_pred = clf.predict(x_test)

# Re-score the test split with the current (CNN) model. ``predictions`` is
# only refreshed in a later cell, so without this line the matrix would be
# built from the scores of whichever model was evaluated last.
predictions = model.predict(X_test)

# Creates a confusion matrix
cm = confusion_matrix(y_test.argmax(axis = 1), predictions.argmax(axis = 1))

# Transform to df for easier plotting
cm_df = pd.DataFrame(cm,
                     index = ['1', '2', '3', '4', '5'],
                     columns = ['1', '2', '3', '4', '5'])

plt.figure(figsize=(5.5,4))
sns.heatmap(cm_df, annot=True, fmt = 'g', cmap = 'Blues')
# plt.title('SVM Linear Kernel \nAccuracy:{0:.3f}'.format(model.evaluate(X_test, y_test, verbose=False)))
# plt.title("Training Accuracy: {:.4f}".format(accuracy_trn) + "\n Testing Accuracy:  {:.4f}".format(accuracy_test))
plt.title("CNN with Word Embedding")
plt.ylabel('True label')
plt.xlabel('Predicted label')
# plt.ticklabel_format(style = 'plain', axis = 'y', useOffset = False)
plt.show()

loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

In [None]:
from keras import backend as K

def recall_m(y_test, predictions):
    """Return recall (true positives / actual positives) using backend ops.

    Args:
        y_test: ground-truth labels (assumed to be 0/1 encoded -- TODO confirm).
        predictions: predicted scores; products and labels are clipped to
            [0, 1] and rounded before being summed.

    Returns:
        The recall as a backend scalar.
    """
    true_positives = K.sum(K.round(K.clip(y_test * predictions, 0, 1)))
    # Count positives in the labels that were passed in; the previous body
    # read ``y_true``, which is not a parameter of this function.
    possible_positives = K.sum(K.round(K.clip(y_test, 0, 1)))
    # epsilon keeps the division defined when there are no positives at all
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    """Return precision (true positives / predicted positives) using backend ops.

    Args:
        y_true: ground-truth labels (assumed to be 0/1 encoded -- TODO confirm).
        y_pred: predicted scores; values are clipped to [0, 1] and rounded
            before being summed.

    Returns:
        The precision as a backend scalar.
    """
    # Work on the arguments; the previous body ignored them and read the
    # notebook-level ``y_test`` / ``predictions`` variables instead.
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    # epsilon keeps the division defined when nothing is predicted positive
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_test, predictions):
    """Return the F1 score: the harmonic mean of precision_m and recall_m.

    A backend epsilon is added to the denominator so the result stays
    defined when precision and recall are both zero.
    """
    p = precision_m(y_test, predictions)
    r = recall_m(y_test, predictions)
    numerator = p * r
    denominator = p + r + K.epsilon()
    return 2 * (numerator / denominator)

In [None]:
from sklearn.metrics import classification_report

y_test_norm = y_test.argmax(axis = 1)
y_test_norm

predictions = model.predict(X_test, batch_size = 64, verbose = 1)
y_pred = np.argmax(predictions, axis = 1)
y_pred

print(classification_report(y_test_norm, y_pred))

In [None]:
# Turn each predicted class index into a star rating by shifting it up by one.
pred_stars = [class_id + 1 for class_id in predictions.argmax(axis = 1).tolist()]
pred_stars[0:4]

In [None]:
test_indexes = sentences_test.index
# sample.iloc[test_indexes]
# sample.iloc[[5441],:]

In [None]:
pd.options.display.max_colwidth = 100

# Take an explicit copy of the test rows so the new columns are added to an
# independent frame (avoids pandas' SettingWithCopyWarning and leaves
# ``sample`` untouched).
sample_df_test = sample.loc[test_indexes].copy()
sample_df_test['pred_stars'] = pred_stars

# Absolute distance, in stars, between the predicted and the actual rating.
sample_df_test['diff_pred'] = abs(sample_df_test['pred_stars'] - pd.to_numeric(sample_df_test.overall))

# Largest misses first.
sample_df_test = sample_df_test.sort_values(by = ['diff_pred'], ascending = False)

sample_df_test = sample_df_test.filter(['overall', 'reviewText', 'tokens', 'pred_stars', 'diff_pred'])
sample_df_test.head(10)

In [None]:
sample_df_test.groupby(['diff_pred']).size()

## Pretrained GloVe Model: Method 1

In [None]:
import numpy as np

def create_embedding_matrix(filepath, word_index, embedding_dim):
    """Build an embedding matrix for ``word_index`` from a text file of word vectors.

    Every line of the file is read as a word followed by its
    whitespace-separated vector components. The row at ``word_index[word]``
    receives the first ``embedding_dim`` components of that word's vector;
    rows whose word is not in the file stay all-zero.

    Args:
        filepath: path of the UTF-8 encoded vector file.
        word_index: mapping from word to its row index.
        embedding_dim: number of components kept per word.

    Returns:
        Array of shape ``(len(word_index) + 1, embedding_dim)``.
    """
    n_rows = len(word_index) + 1  # one extra row because index 0 is reserved
    matrix = np.zeros((n_rows, embedding_dim))

    with open(filepath, encoding="utf8") as handle:
        for raw_line in handle:
            token, *components = raw_line.split()
            if token not in word_index:
                continue
            values = np.array(components, dtype=np.float32)
            matrix[word_index[token]] = values[:embedding_dim]

    return matrix

In [None]:
# Raw string: keeps the Windows backslashes literal ("\M", "\A", "\g" are not
# valid escape sequences and newer Python versions warn on them).
os.chdir(r'C:\My_Files\Analytics\Amazon_reviews\glove_6B')


# Fill the rows of the tokenizer's vocabulary with the 50-dimensional
# pretrained vectors read from the file.
embedding_dim = 50
embedding_matrix = create_embedding_matrix('glove.6B.50d.txt',
    tokenizer.word_index, embedding_dim)


In [None]:
# Count the rows that have at least one non-zero component, then express them
# as a share of the vocabulary size, i.e. how much of the vocabulary received
# a pretrained vector.
nonzero_elements = np.count_nonzero(np.count_nonzero(embedding_matrix, axis=1))
nonzero_elements / vocab_size

In [None]:
# Network whose embedding layer starts from the pretrained matrix and is
# still updated during training (trainable=True).
model = Sequential()
model.add(layers.Embedding(vocab_size, embedding_dim, 
                           weights=[embedding_matrix], 
                           input_length=maxlen, 
                           trainable=True))
model.add(layers.GlobalMaxPool1D())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(5, activation='softmax'))
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

In [None]:
from datetime import datetime  # needed for the timing below; no earlier cell shown imports it

start_time = datetime.now()

# Train the network with pretrained embeddings. The value returned by fit()
# is kept because plot_history later reads its per-epoch ``history`` mapping.
nn_glove = model.fit(X_train, y_train,
                    epochs=20,
                    verbose=False,
                    validation_data=(X_test, y_test),
                    batch_size=10)

end_time = datetime.now()
print('Duration: {}'.format(end_time - start_time)) #print out how long it took for model to train


In [None]:
# Raw string: keeps the Windows backslashes literal ("\M", "\A", "\p" are not
# valid escape sequences and newer Python versions warn on them).
os.chdir(r'C:\My_Files\Analytics\Amazon_reviews\pickles')

# Persist the object returned by model.fit for the pretrained-embedding network.
joblib.dump(nn_glove, "nn_glove.pkl")

In [None]:
# os.chdir('C:\My_Files\Analytics\Amazon_reviews\pickles')
# nn_glove = joblib.load("nn_glove.pkl")

In [None]:
loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

In [None]:
plot_history(nn_glove)

In [None]:
from sklearn.metrics import confusion_matrix

# Score the test split with the current model.
predictions = model.predict(X_test)
# y_pred = (predictions > 0.5)

# confusion_matrix(y_test, y_pred)

# argmax turns the one-hot labels and the softmax scores back into class ids.
matrix = confusion_matrix(y_test.argmax(axis = 1), predictions.argmax(axis = 1))
matrix

In [None]:
import seaborn as sns
# clf = SVC(kernel = 'linear').fit(x_train,y_train)
# clf.predict(x_train)
# y_pred = clf.predict(x_test)

# Creates a confusion matrix
# (uses the ``predictions`` computed in the preceding cell)
cm = confusion_matrix(y_test.argmax(axis = 1), predictions.argmax(axis = 1)) 

# Transform to df for easier plotting
cm_df = pd.DataFrame(cm,
                     index = ['1', '2', '3', '4', '5'], 
                     columns = ['1', '2', '3', '4', '5'])

# Heat map of counts: rows are true star ratings, columns are predicted ones.
plt.figure(figsize=(5.5,4))
sns.heatmap(cm_df, annot=True, fmt = 'g', cmap = 'Blues')
# plt.title('SVM Linear Kernel \nAccuracy:{0:.3f}'.format(model.evaluate(X_test, y_test, verbose=False)))
# plt.title("Training Accuracy: {:.4f}".format(accuracy_trn) + "\n Testing Accuracy:  {:.4f}".format(accuracy_test))
plt.title("NN with Pre-Trained Word Embedding")
plt.ylabel('True label')
plt.xlabel('Predicted label')
# plt.ticklabel_format(style = 'plain', axis = 'y', useOffset = False)
plt.show()

# Report accuracy on both splits underneath the plot.
loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

In [None]:
from sklearn.metrics import classification_report

y_test_norm = y_test.argmax(axis = 1)
y_test_norm

predictions = model.predict(X_test, batch_size = 64, verbose = 1)
y_pred = np.argmax(predictions, axis = 1)
y_pred

print(classification_report(y_test_norm, y_pred))

In [None]:
# Turn each predicted class index into a star rating by shifting it up by one.
pred_stars = [class_id + 1 for class_id in predictions.argmax(axis = 1).tolist()]
pred_stars[0:4]

In [None]:
test_indexes = sentences_test.index
# sample.iloc[test_indexes]
# sample.iloc[[5441],:]

In [None]:
pd.options.display.max_colwidth = 100

# Take an explicit copy of the test rows so the new columns are added to an
# independent frame (avoids pandas' SettingWithCopyWarning and leaves
# ``sample`` untouched).
sample_df_test = sample.loc[test_indexes].copy()
sample_df_test['pred_stars'] = pred_stars

# Absolute distance, in stars, between the predicted and the actual rating.
sample_df_test['diff_pred'] = abs(sample_df_test['pred_stars'] - pd.to_numeric(sample_df_test.overall))

# Largest misses first.
sample_df_test = sample_df_test.sort_values(by = ['diff_pred'], ascending = False)

sample_df_test = sample_df_test.filter(['overall', 'reviewText', 'tokens', 'pred_stars', 'diff_pred'])
sample_df_test.head(10)

In [None]:
sample_df_test.groupby(['diff_pred']).size()

### CNN with Pre-trained Embeddings

In [None]:
from keras import optimizers

# embedding_dim = 50

# model = Sequential()
# model.add(layers.Embedding(vocab_size, embedding_dim, input_length=maxlen,
#                           weights = [embedding_matrix],
#                           trainable = True))
# model.add(layers.Conv1D(128, 5, activation='relu'))
# model.add(layers.GlobalMaxPooling1D())
# model.add(layers.Dense(10, activation='relu'))
# model.add(layers.Dense(5, activation='softmax'))
# model.compile(optimizer='adam',
#               loss='categorical_crossentropy',
#               metrics=['accuracy'])
# model.summary()


embedding_dim = 50

# CNN on top of the pretrained embeddings, which stay trainable.
model = Sequential()
model.add(layers.Embedding(vocab_size, embedding_dim, input_length=maxlen,
                          weights = [embedding_matrix],
                          trainable = True))
# NOTE(review): softmax is an unusual activation for a convolution layer --
# confirm it is the intended grid-search choice.
model.add(layers.Conv1D(128, 5, activation= 'softmax'))
model.add(layers.GlobalMaxPooling1D())
model.add(layers.Dense(15, activation='relu'))
model.add(layers.Dropout(0.2))
model.add(layers.Dense(5, activation='softmax'))
optimizer = optimizers.Adam(learning_rate = 0.01)
# Pass the configured optimizer object: the string 'adam' builds a fresh Adam
# with default settings, which silently discarded the learning rate set above.
model.compile(optimizer = optimizer,
            loss='categorical_crossentropy',
            metrics=['accuracy'])
model.summary()


In [None]:
from datetime import datetime  # needed for the timing below; no earlier cell shown imports it

start_time = datetime.now()

# Train the CNN with pretrained embeddings. The value returned by fit() is
# kept because plot_history later reads its per-epoch ``history`` mapping.
cnn_glove = model.fit(X_train, y_train,
                    epochs=20,
                    verbose=False,
                    validation_data=(X_test, y_test),
                    batch_size=10)

end_time = datetime.now()
print('Duration: {}'.format(end_time - start_time)) #print out how long it took for model to train


In [None]:
# Raw string: keeps the Windows backslashes literal ("\M", "\A", "\p" are not
# valid escape sequences and newer Python versions warn on them).
os.chdir(r'C:\My_Files\Analytics\Amazon_reviews\pickles')

# Persist the object returned by model.fit for the CNN with pretrained embeddings.
joblib.dump(cnn_glove, "cnn_glove.pkl")

In [None]:
# os.chdir('C:\My_Files\Analytics\Amazon_reviews\pickles')
# cnn_glove = joblib.load("cnn_glove.pkl")

In [None]:
loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

In [None]:
plot_history(cnn_glove)

In [None]:
from sklearn.metrics import confusion_matrix

predictions = model.predict(X_test)
# y_pred = (predictions > 0.5)

# confusion_matrix(y_test, y_pred)

matrix = confusion_matrix(y_test.argmax(axis = 1), predictions.argmax(axis = 1))
matrix

In [None]:
import seaborn as sns
# clf = SVC(kernel = 'linear').fit(x_train,y_train)
# clf.predict(x_train)
# y_pred = clf.predict(x_test)

# Creates a confusion matrix
# (uses the ``predictions`` computed in the preceding cell)
cm = confusion_matrix(y_test.argmax(axis = 1), predictions.argmax(axis = 1)) 

# Transform to df for easier plotting
cm_df = pd.DataFrame(cm,
                     index = ['1', '2', '3', '4', '5'], 
                     columns = ['1', '2', '3', '4', '5'])

# Heat map of counts: rows are true star ratings, columns are predicted ones.
plt.figure(figsize=(5.5,4))
sns.heatmap(cm_df, annot=True, fmt = 'g', cmap = 'Blues')
# plt.title('SVM Linear Kernel \nAccuracy:{0:.3f}'.format(model.evaluate(X_test, y_test, verbose=False)))
# plt.title("Training Accuracy: {:.4f}".format(accuracy_trn) + "\n Testing Accuracy:  {:.4f}".format(accuracy_test))
plt.title("CNN with Pre-Trained Embedding")
plt.ylabel('True label')
plt.xlabel('Predicted label')
# plt.ticklabel_format(style = 'plain', axis = 'y', useOffset = False)
plt.show()

# Report accuracy on both splits underneath the plot.
loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

In [None]:
from sklearn.metrics import classification_report

# True class index for each test row (argmax across the label columns).
y_test_norm = y_test.argmax(axis=1)

# Predicted class index for each test row: the column with the highest score.
predictions = model.predict(X_test, batch_size=64, verbose=1)
y_pred = predictions.argmax(axis=1)

# Per-class summary of how the predictions compare with the true labels.
print(classification_report(y_test_norm, y_pred))

In [None]:
# Predicted class indices are 0-based; shift by one to express them as star
# ratings, then convert to a plain Python list.
pred_stars = (predictions.argmax(axis=1) + 1).tolist()
pred_stars[:4]

In [None]:
# Index labels of the test-split sentences; used in the next cell to select the
# matching rows from `sample`.
test_indexes = sentences_test.index
# sample.iloc[test_indexes]
# sample.iloc[[5441],:]

In [None]:
# Show up to 100 characters per cell so the review text is readable.
pd.options.display.max_colwidth = 100

# Take an explicit copy of the test rows. Adding columns to a bare .loc
# selection can raise SettingWithCopyWarning and leaves it ambiguous whether
# `sample` itself is being modified.
sample_df_test = sample.loc[test_indexes].copy()
sample_df_test['pred_stars'] = pred_stars

# Absolute gap, in stars, between the predicted and the actual rating.
# `overall` is passed through pd.to_numeric before subtracting.
sample_df_test['diff_pred'] = (
    sample_df_test['pred_stars'] - pd.to_numeric(sample_df_test['overall'])
).abs()

# Worst predictions first.
sample_df_test = sample_df_test.sort_values(by='diff_pred', ascending=False)

# Keep only the columns needed to inspect the misclassified reviews.
sample_df_test = sample_df_test.filter(
    ['overall', 'reviewText', 'tokens', 'pred_stars', 'diff_pred']
)
sample_df_test.head(10)

In [None]:
# Count how many test reviews fall at each prediction-error size (diff_pred).
sample_df_test.groupby(['diff_pred']).size()

https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/

## Hyperparameter Tuning

Below, I used the GridSearch method to test various parameter combinations in order to optimize performance. To limit the time needed for training on all parameter combinations, the grid search was only performed on a sample dataset of 1000 records. The code below was tested in a separate notebook with a smaller dataframe sample size. The parameters I chose in my CNN model above were based on the results from the grid search.

First, I set the parameter values I wanted to test

In [None]:
# Candidate values for each hyperparameter explored by the grid search.
neurons = [2, 5, 10, 15]
activation = ['relu', 'softmax']
dropout_rate = [0.0, 0.2, 0.4, 0.6]
learning_rate = [0.001, 0.01, 0.1]

Then, I created a function that builds the model for a given combination of these parameters, ran the grid search over all combinations, and printed out the optimal combination.

In [None]:
from keras import optimizers
from keras.wrappers.scikit_learn import KerasClassifier


# Timestamp taken before the search; the elapsed time is printed at the end of the cell.
start_time = datetime.now()

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

# Map each create_model keyword argument to its list of candidate values.
param_grid = dict(
    learning_rate=learning_rate,
    dropout_rate=dropout_rate,
    activation=activation,
    neurons=neurons,
)


def create_model(dropout_rate = 0.0, learning_rate = 0.001, neurons = 2, activation = 'relu'):
    """Build and compile the CNN that the grid search trains for one parameter combination.

    The network is: embedding (initialised from ``embedding_matrix`` and left
    trainable) -> Conv1D(128 filters, kernel size 5) -> global max pooling ->
    hidden dense layer -> dropout -> 5-way softmax output.

    Parameters
    ----------
    dropout_rate : float
        Fraction passed to the Dropout layer after the hidden dense layer.
    learning_rate : float
        Learning rate handed to the Adam optimizer. The default is 0.001
        instead of the previous 0: the old value was never applied (the model
        was compiled with the string 'adam'), and a rate of 0 would stop the
        weights from updating now that the optimizer is actually used.
    neurons : int
        Number of units in the hidden dense layer.
    activation : str
        Activation of the hidden dense layer. The convolution keeps 'relu'
        and the output layer keeps 'softmax'.

    Returns
    -------
    A compiled Keras ``Sequential`` model (categorical cross-entropy loss,
    accuracy metric).

    Notes
    -----
    Reads ``vocab_size``, ``maxlen`` and ``embedding_matrix`` from the
    notebook's global scope.
    """
    # NOTE(review): presumably this must equal the vector width of
    # embedding_matrix -- confirm against where the matrix is built.
    embedding_dim = 50

    model = Sequential()
    model.add(layers.Embedding(vocab_size, embedding_dim, input_length=maxlen,
                               weights=[embedding_matrix],
                               trainable=True))
    model.add(layers.Conv1D(128, 5, activation='relu'))
    model.add(layers.GlobalMaxPooling1D())
    # Use the searched activation here; it used to be hard-coded to 'relu', so
    # the `activation` entries of the parameter grid had no effect.
    model.add(layers.Dense(neurons, activation=activation))
    model.add(layers.Dropout(dropout_rate))
    model.add(layers.Dense(5, activation='softmax'))

    # Pass the configured optimizer instance to compile(). Passing the string
    # 'adam' builds a fresh Adam with its default learning rate, which
    # silently discarded `learning_rate`.
    optimizer = optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer,
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model

# Seed NumPy's random generator before the search.
# NOTE(review): this may not be enough to make Keras training reproducible -- confirm.
seed = 7
np.random.seed(seed)

# Wrap the model factory so scikit-learn can treat it as an estimator.
model = KerasClassifier(build_fn=create_model, epochs=5, batch_size=10, verbose=0)

# Exhaustive search over param_grid with 3-fold cross-validation.
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3)

grid_result = grid_search.fit(X_train, y_train)

# Best combination first, then the mean and standard deviation of the
# cross-validated score for every combination tried.
print(f"Best: {grid_result.best_score_:f} using {grid_result.best_params_!s}")
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print(f"{mean:f} ({stdev:f}) with: {param!r}")

# Report how long the whole search took.
end_time = datetime.now()
print(f'Duration: {end_time - start_time}')

I then created a dataframe of all the grid search results. I ended up choosing the parameter combination with one of the highest mean scores from cross-validation, and also the smallest standard deviation of accuracy.

In [None]:
# Pull the per-combination settings and scores out of the fitted search.
result_params = grid_result.cv_results_['params']
result_mean_scores = grid_result.cv_results_['mean_test_score']
result_std_scores = grid_result.cv_results_['std_test_score']

# One row per parameter combination.
grid_results_df = pd.DataFrame({
    'Params': result_params,
    'Mean_Score': result_mean_scores,
    'Std_Score': result_std_scores,
})

# Display the 20 combinations with the highest mean score, renumbered from 0.
(
    grid_results_df
    .sort_values(by='Mean_Score', ascending=False)
    .head(20)
    .reset_index(drop=True)
)

In [None]:
# Fit the grid search on the training data. This runs the full search again
# each time the cell is executed.
grid_result = grid_search.fit(X_train, y_train)

# Best combination first, then the mean and standard deviation of the
# cross-validated score for every combination tried.
print(f"Best: {grid_result.best_score_:f} using {grid_result.best_params_!s}")
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print(f"{mean:f} ({stdev:f}) with: {param!r}")