In [None]:
#deep learning example
#use LSTM to predict review sentiment class - positive, neutral and negative

In [1]:
import os
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import torchvision.transforms as transforms
from torch.nn.utils.rnn import pad_sequence
import numpy as np
import time
import pandas as pd
import re
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\zhiyiwang\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [3]:
#labels have to start from zero
reviews_df = pd.read_csv('./clothing_reviews.csv')
reviews_df['Review Text'] = reviews_df['Review Text'].astype(str)
zero_number = {1:0, 2:1, 3:2, 4:3, 5:4}
reviews_df['Rating'] = reviews_df['Rating'].apply(lambda x: zero_number[x])
cat_number = {0:0, 1:0, 2:1, 3:2, 4:2}
reviews_df['Sentiment'] = reviews_df['Rating'].apply(lambda x: cat_number[x])
reviews_df.head()

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name,Sentiment
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,3,1,0,Initmates,Intimate,Intimates,2
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,4,1,4,General,Dresses,Dresses,2
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,2,0,0,General,Dresses,Dresses,1
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",4,1,0,General Petite,Bottoms,Pants,2
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,4,1,6,General,Tops,Blouses,2


In [4]:
#remove some irrelevant/messy information
reviews_df = reviews_df.reset_index(drop=True)

REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text. substitute the matched string in REPLACE_BY_SPACE_RE with space.
    text = BAD_SYMBOLS_RE.sub('', text) # remove symbols which are in BAD_SYMBOLS_RE from text. substitute the matched string in BAD_SYMBOLS_RE with nothing. 
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # remove stopwors from text
    return text

reviews_df['review_clean'] = reviews_df['Review Text'].apply(clean_text)
reviews_df['review_clean'] = reviews_df['review_clean'].str.replace('\d+', '')
reviews_df.head()

  reviews_df['review_clean'] = reviews_df['review_clean'].str.replace('\d+', '')


Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name,Sentiment,review_clean
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,3,1,0,Initmates,Intimate,Intimates,2,absolutely wonderful silky sexy comfortable
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,4,1,4,General,Dresses,Dresses,2,love dress sooo pretty happened find store im ...
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,2,0,0,General,Dresses,Dresses,1,high hopes dress really wanted work initially ...
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",4,1,0,General Petite,Bottoms,Pants,2,love love love jumpsuit fun flirty fabulous ev...
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,4,1,6,General,Tops,Blouses,2,shirt flattering due adjustable front tie perf...


In [5]:
#tokenize the words in the text
tokenizer = RegexpTokenizer(r'\w+')

counts = Counter()
for index, row in reviews_df.iterrows():
    counts.update(tokenizer.tokenize(row['review_clean']))

#create vocabulary and word counts
vocab2index = {"":0, "UNK":1}
words = ["", "UNK"]
for word in counts:
    vocab2index[word] = len(words)
    words.append(word)

In [6]:
#check number of words in text to set max length
count = reviews_df['review_clean'].str.split().str.len().value_counts()
count.index = count.index.astype(int)
count.sort_index(inplace=True)
count.tail(50)

11    452
12    446
13    511
14    501
15    553
16    563
17    584
18    516
19    545
20    587
21    555
22    498
23    584
24    513
25    529
26    492
27    455
28    500
29    486
30    480
31    463
32    406
33    451
34    412
35    416
36    377
37    413
38    388
39    371
40    421
41    432
42    552
43    592
44    652
45    663
46    707
47    646
48    546
49    434
50    315
51    237
52    158
53     73
54     51
55     21
56     19
57      9
58      1
59      1
61      1
Name: review_clean, dtype: int64

In [7]:
def pad_sequence(sentence, length):
    if len(sentence) > length:
        sentence = sentence[:length]
    else:
        pad_len = length - len(sentence)
        for _ in range(pad_len):
            sentence.append(vocab2index[""])
    assert len(sentence) == length
    return sentence

def text_sentence(sentences, length):
    #turn each word into index in vocabulary
    sentence_list = []
    for i, sen in enumerate(sentences):
        print('sentence count #{}'.format(i+1), end='\r')
        sentence_idx = []
        for word in sen:
            if (word in vocab2index.keys()):
                sentence_idx.append(vocab2index[word])
            else:
                sentence_idx.append(vocab2index["UNK"])
        #pad sentence to make every sentence have same length
        sentence_idx = pad_sequence(sentence_idx, length)
        sentence_list.append(sentence_idx)
    return torch.LongTensor(sentence_list)

def labels_tensor(y):
    #turn labels into sensor
    y = [int(label) for label in y]
    return torch.LongTensor(y)

In [8]:
#max number of words in each sequence
max_sequence_length = 60

x_data = reviews_df['review_clean'].values
#y_data = reviews_df['Rating'].values
y_data = reviews_df['Sentiment'].values

from sklearn.model_selection import train_test_split
x_train_data, x_test_data, y_train_data, y_test_data = train_test_split(x_data, y_data, test_size=0.2)

x_train = text_sentence(x_train_data, max_sequence_length)
x_test = text_sentence(x_test_data, max_sequence_length)
y_train = labels_tensor(y_train_data)
y_test = labels_tensor(y_test_data)

sentence count #46988

In [9]:
class TextDataset(Dataset):
    def __init__(self, X, y):
        self.data = X
        self.label = y
    def __getitem__(self, idx):
        if self.label is None: return self.data[idx]
        return self.data[idx], self.label[idx]
    def __len__(self):
        return len(self.data)

In [10]:
train_set = TextDataset(x_train, y_train)
test_set = TextDataset(x_test, y_test)

batch_size = 128
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_set, batch_size=batch_size)

In [11]:
class LSTM_Net(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, dropout=0.25):
        super(LSTM_Net, self).__init__()
        self.embedding = torch.nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.dropout = dropout
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, batch_first=True)
        self.classifier = nn.Sequential(nn.Dropout(dropout),
                                        nn.Linear(hidden_dim, 3))
    def forward(self, inputs):
        inputs = self.embedding(inputs)
        x, _ = self.lstm(inputs, None)
        #lstm_out, (ht, ct) = self.lstm(x)
        #x = self.classifier(ht[-1])
        x = x[:, -1, :] 
        x = self.classifier(x)
        return x

In [12]:
#create instance of neural network
vocab_size = len(vocab2index)
model = LSTM_Net(vocab_size=vocab_size, embedding_dim=200, hidden_dim=100, num_layers=1, dropout=0.25)

use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
model = model.to(device)

loss = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [13]:
epochs = 30

for epoch in range(epochs):
    epoch_start_time = time.time()
    train_acc = 0.0
    train_loss = 0.0
    test_acc = 0.0
    test_loss = 0.0

    model.train()
    for i, data in enumerate(train_loader):
        inputs = data[0].to(device)
        labels = data[1].to(device)
        
        optimizer.zero_grad()
        train_predicted = model.forward(inputs)
        batch_loss = loss(train_predicted, labels)
        batch_loss.backward()
        optimizer.step()

        train_acc += np.sum(np.argmax(train_predicted.cpu().data.numpy(), axis=1) == data[1].numpy())
        train_loss += batch_loss.item()
    
    with torch.no_grad():
        model.eval()
        for i, data in enumerate(test_loader):
            inputs = data[0].to(device)
            labels = data[1].to(device)
            
            test_predicted = model.forward(inputs)
            batch_loss = loss(test_predicted, labels)

            test_acc += np.sum(np.argmax(test_predicted.cpu().data.numpy(), axis=1) == data[1].numpy())
            test_loss += batch_loss.item()

        #print result for each epoch
        print('[%03d/%03d] %2.2f sec(s) Train Acc: %3.6f Loss: %3.6f | Val Acc: %3.6f loss: %3.6f' % \
            (epoch + 1, epochs, time.time()-epoch_start_time, \
             train_acc/train_set.__len__(), train_loss/train_set.__len__(), test_acc/test_set.__len__(), test_loss/test_set.__len__()))

[001/030] 3.14 sec(s) Train Acc: 0.771982 Loss: 0.005572 | Val Acc: 0.777139 loss: 0.005389
[002/030] 1.35 sec(s) Train Acc: 0.774803 Loss: 0.005389 | Val Acc: 0.777139 loss: 0.005373
[003/030] 1.35 sec(s) Train Acc: 0.774803 Loss: 0.005320 | Val Acc: 0.777139 loss: 0.005273
[004/030] 1.36 sec(s) Train Acc: 0.774803 Loss: 0.005173 | Val Acc: 0.777139 loss: 0.005168
[005/030] 1.36 sec(s) Train Acc: 0.775069 Loss: 0.005014 | Val Acc: 0.777139 loss: 0.005054
[006/030] 1.35 sec(s) Train Acc: 0.774963 Loss: 0.004874 | Val Acc: 0.777352 loss: 0.005185
[007/030] 1.41 sec(s) Train Acc: 0.774643 Loss: 0.004736 | Val Acc: 0.776713 loss: 0.004968
[008/030] 1.41 sec(s) Train Acc: 0.777837 Loss: 0.004655 | Val Acc: 0.773095 loss: 0.005026
[009/030] 1.36 sec(s) Train Acc: 0.781456 Loss: 0.004565 | Val Acc: 0.778629 loss: 0.004955
[010/030] 1.36 sec(s) Train Acc: 0.783319 Loss: 0.004481 | Val Acc: 0.778629 loss: 0.004903
[011/030] 1.34 sec(s) Train Acc: 0.784011 Loss: 0.004417 | Val Acc: 0.772669 los