# Sentiment Analysis of E-Commerce Clothing Reviews

Training an evaluation set from <a href="https://www.kaggle.com/nicapotato/womens-ecommerce-clothing-reviews">kaggle</a>

Approach
* set each review rated higher than or equal to 3 as positive review
* set each review rated below 3 as negative review
* build embedding matrix using Google's pre-trained word2vec model
* train CNN classifier for binary sentiment analysis

In [37]:
# preprocessing
import os
import re
import nltk
import pandas as pd
import numpy as np

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences

# word embeddings
from gensim.models import KeyedVectors
import pickle

# CNN architecture
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers import Dense, Conv1D, GlobalMaxPooling1D, Activation, Dropout, BatchNormalization

# CNN training
from keras import callbacks
from keras.callbacks import EarlyStopping, ModelCheckpoint

# CNN load models
from keras.models import load_model

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\marku\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\marku\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [21]:
DATA_DIR = '../Data/'

## Preprocessing
* load data
* select columns (title, review text and rating)
* drop all rows that which don't have at least a rating and review text
* combine title and review text to one string separated by a punct
* remove non-word/-number/-punctuation characters
* remove english stopwords
* remove the five most common words that appear in both classes
* label each positive rating with 0
* label each negative rating with 1

In [18]:
keep_words_and_punct = r"[^a-zA-Z0-9?!.]|[\.]{2,}"
mult_whitespaces = "\s{2,}"

df = pd.read_csv(DATA_DIR + 'review_data.csv')
reviews = df.loc[:,('Title', 'Review Text', 'Rating')]
reviews.dropna(how="any", inplace=True, subset=['Review Text', 'Rating'])

texts = []
labels = []
stop_words = set(stopwords.words('english'))

duplicate_words = ['dress', 'size', 'top', 'fit', 'like']

for i, row in reviews.iterrows():
    review = str(row['Title']) + '. ' + str(row['Review Text'])
    clean_review = re.sub(mult_whitespaces, ' ', re.sub(keep_words_and_punct, ' ', str(review).lower()))
    tokens = word_tokenize(clean_review)
    filtered_sentence = [word for word in tokens if not word in stop_words and not word in duplicate_words]
    sentences = " ".join(filtered_sentence)

    if row['Rating'] >= 3:
        texts.append(sentences)
        labels.append(0)
    else:
        texts.append(sentences)
        labels.append(1)

print(texts[:10])

['nan . absolutely wonderful silky sexy comfortable', 'nan . love ! sooo pretty . happened find store glad bc never would ordered online bc petite . bought petite 5 8 . love length hits little knee . would definitely true midi someone truly petite .', 'major design flaws . high hopes really wanted work . initially ordered petite small usual found outrageously small . small fact could zip ! reordered petite medium ok. overall half comfortable nicely bottom half tight layer several somewhat cheap net layers . imo major design flaw net layer sewn directly zipper c', 'favorite buy ! . love love love jumpsuit . fun flirty fabulous ! every time wear get nothing great compliments !', 'flattering shirt . shirt flattering due adjustable front tie . perfect length wear leggings sleeveless pairs well cardigan . love shirt ! ! !', 'petite . love tracy reese dresses one petite . 5 feet tall usually wear 0p brand . pretty package lot . skirt long full overwhelmed small frame . stranger alterations s

## Tokenize reviews and find longest word sequence

In [19]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

max_sequence_len = 0
for sequence in sequences:
    if len(sequence) > max_sequence_len:
        max_sequence_len = len(sequence)
print("max sequence len: %i" % max_sequence_len)

Found 14602 unique tokens.
max sequence len: 67


### Pad all sequences to the longest length

In [20]:
data = pad_sequences(sequences, maxlen=max_sequence_len)
labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

Shape of data tensor: (22641, 67)
Shape of label tensor: (22641, 2)


### Split the data into a training set and a validation set

In [14]:
VALIDATION_SPLIT = 0.2

indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]

# Load pre-trained word embeddings

### Word2Vec model from Google - trained on google news article

In [24]:
EMBEDDING_DIM = 300

model = KeyedVectors.load_word2vec_format(DATA_DIR + 'word embeddings/GoogleNews-vectors-negative300.bin', binary=True, limit=25000)

embeddings_index = {}
for word in range(len(model.vocab)):
    embedding_vector = model[model.index2word[word]]
    embeddings_index[model.index2word[word]] = embedding_vector

### GloVe model from Stanford University - trained on 20 newsgroups dataset

In [31]:
# USE PRETRAINED GLOVE WORD EMBEDDINGS (trained on 20 newsgroups)
EMBEDDING_DIM = 100

if os.path.isfile(DATA_DIR + 'word embeddings/glove_embeddings_index.pkl'):
    with open(DATA_DIR + 'word embeddings/glove_embeddings_index.pkl', 'rb') as file:
        embeddings_index = pickle.load(file)
else:
    embeddings_index = {}
    f = open(DATA_DIR + 'word embeddings/glove.6B.100d.txt', encoding='utf-8')
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()

    with open(DATA_DIR + 'word embeddings/glove_embeddings_index.pkl', 'wb') as file:
        pickle.dump(embeddings_index, file)

## Build embedding matrix

In [32]:
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

## Set up the Convolutional Neural Network (CNN)
### set parameters

In [34]:
BATCH_SIZE = 32
FILTERS = 300
KERNEL_SIZE = 3
HIDDEN_DIMS = 250
EPOCHS = 50
P_DROPOUT = 0.2
labels_index = {'pos': 0, 'neg': 1}

### Define the structure/architecture
Embedding layer > convolution layer > sigmoid function for classification

In between dropout and normalization

In [35]:
model = Sequential()
model.add(Embedding(len(word_index) + 1,
                    EMBEDDING_DIM,
                    weights=[embedding_matrix],
                    input_length=max_sequence_len,
                    trainable=False))  # prevent keras from updating the word indices during training process

model.add(BatchNormalization())
model.add(Conv1D(FILTERS,
                 KERNEL_SIZE,
                 padding='same',
                 activation='relu',
                 strides=1))

model.add(GlobalMaxPooling1D())
model.add(Dense(HIDDEN_DIMS))
model.add(Dropout(P_DROPOUT))
model.add(Activation('relu'))
model.add(Dense(len(labels_index)))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

### Start training the CNN

In [38]:
checkpointer = ModelCheckpoint(filepath=DATA_DIR + 'sentiment_sequential.hdf5', verbose=1, save_best_only=True)
earlyStopper = EarlyStopping(monitor='val_loss', min_delta=0, patience=7, verbose=0, mode='auto')
reduce_lr = callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=0.00001)

model.fit(x_train, y_train,
          batch_size=BATCH_SIZE,<
          epochs=EPOCHS,
          validation_data=(x_val, y_val),
          callbacks=[checkpointer, earlyStopper, reduce_lr],
          verbose=2)


Train on 18113 samples, validate on 4528 samples
Epoch 1/50
 - 32s - loss: 0.2605 - acc: 0.9011 - val_loss: 0.1969 - val_acc: 0.9174

Epoch 00001: val_loss improved from inf to 0.19686, saving model to ../Data/sentiment_sequential.hdf5
Epoch 2/50
 - 39s - loss: 0.1843 - acc: 0.9203 - val_loss: 0.1912 - val_acc: 0.9153

Epoch 00002: val_loss improved from 0.19686 to 0.19124, saving model to ../Data/sentiment_sequential.hdf5
Epoch 3/50
 - 35s - loss: 0.1354 - acc: 0.9438 - val_loss: 0.2174 - val_acc: 0.9142

Epoch 00003: val_loss did not improve from 0.19124
Epoch 4/50
 - 35s - loss: 0.0938 - acc: 0.9626 - val_loss: 0.2455 - val_acc: 0.9142

Epoch 00004: val_loss did not improve from 0.19124
Epoch 5/50
 - 33s - loss: 0.0624 - acc: 0.9761 - val_loss: 0.2836 - val_acc: 0.8954

Epoch 00005: val_loss did not improve from 0.19124
Epoch 6/50
 - 37s - loss: 0.0181 - acc: 0.9942 - val_loss: 0.3600 - val_acc: 0.9125

Epoch 00006: val_loss did not improve from 0.19124
Epoch 7/50
 - 36s - loss: 0.0

<keras.callbacks.History at 0x23c60aa5c18>

## Evaluation of the stored model

In [40]:
best_model = load_model(DATA_DIR + 'sentiment_sequential.hdf5')

scores = best_model.evaluate(x_val, y_val, verbose=0, batch_size=BATCH_SIZE)
print("Accuracy: %.2f%%" % (scores[1]*100))
print("Loss: %.2f%%" % (scores[0]*100))

Accuracy: 91.53%
Loss: 19.12%
