<a href="https://colab.research.google.com/github/nommrichard/prod_rating/blob/main/rnn_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Predicting product rating based on review text 

## Project in LTAT.01.001 Natural language processing

#### Team members: Karl Jaagup Kask, Ludvig Leis, Richard Nõmm

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re 
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize


!python3 -m nltk.downloader stopwords
!python3 -m nltk.downloader punkt

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
RANDOM_SEED = 100

The data we are using: https://www.kaggle.com/datafiniti/consumer-reviews-of-amazon-products

## Data preprocessing (data file needs to be imported)

In [3]:
df = pd.read_csv('amazon_review_dataset.csv') #renamed

In [4]:
df = df[df['reviews.rating'].notnull() & df['reviews.text'].notnull()]
df= df[['reviews.rating','reviews.text']]

df.rename(columns = {'reviews.rating':'rating', 'reviews.text':'text'}, inplace = True)
df['text'][0]
cleaned = df['text']
print(cleaned[0])

I order 3 of them and one of the item is bad quality. Is missing backup spring so I have to put a pcs of aluminum to make the battery work.


In [5]:

stops = stopwords.words('english')

remove_punc = re.compile('[^a-z]+')
def clean_text(sent):
    sent = str(sent).lower()
    sent = remove_punc.sub(' ', sent).strip()
    filtered = [word for word in sent.split()]
    sentence = " ".join(filtered) #just joined -> laused
  
    #sentence = nltk.word_tokenize(sentence) #tokenized -> listid
    return sentence


clean_text(df['text'][0])


'i order of them and one of the item is bad quality is missing backup spring so i have to put a pcs of aluminum to make the battery work'

In [6]:
reviews = [clean_text(sent) for sent in df['text']]

In [7]:
df.head()

Unnamed: 0,rating,text
0,3,I order 3 of them and one of the item is bad q...
1,4,Bulk is always the less expensive way to go fo...
2,5,Well they are not Duracell but for the price i...
3,5,Seem to work as well as name brand batteries a...
4,5,These batteries are very long lasting the pric...


## I RNN approach

In [8]:
y = pd.get_dummies(df.rating, prefix='rating_')
y.head()

Unnamed: 0,rating__1,rating__2,rating__3,rating__4,rating__5
0,0,0,1,0,0
1,0,0,0,1,0
2,0,0,0,0,1
3,0,0,0,0,1
4,0,0,0,0,1


In [10]:
import string

def clean_document(doco):
    punctuation = string.punctuation + '\n\n';
    punc_replace = ''.join([' ' for s in punctuation]);
    doco_clean = doco.replace('-', ' ');
    doco_alphas = re.sub(r'\W +', '', doco_clean)
    trans_table = str.maketrans(punctuation, punc_replace);
    doco_clean = ' '.join([word.translate(trans_table) for word in doco_alphas.split(' ')]);
    doco_clean = doco_clean.split(' ');
    doco_clean = [word.lower() for word in doco_clean if len(word) > 0];
    
    return doco_clean;

# Generate a cleaned reviews array from original review texts
review_cleans = [clean_document(doc) for doc in reviews];
sentences = [' '.join(r) for r in review_cleans]

In [11]:
print(sentences[0:2])
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.backend import eval
from keras.optimizers import Adam
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.layers import Dropout
from keras.layers.convolutional import Conv1D,MaxPooling1D
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(sentences, y.values, test_size=0.20, random_state=42)

tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

vocab_size = len(tokenizer.word_index) + 1

maxlen = 200

X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)


['i order of them and one of the item is bad quality is missing backup spring so i have to put a pcs of aluminum to make the battery work', 'bulk is always the less expensive way to go for products like these']


In [12]:
from numpy import array
from numpy import asarray
from numpy import zeros

# using GLOVE word embeddings
embeddings_dictionary = dict()

glove_file = open('glove.6B.100d.txt', encoding="utf8")

for line in glove_file:
    records = line.split()
    word = records[0]
    vector_dimensions = asarray(records[1:], dtype='float32')
    embeddings_dictionary[word] = vector_dimensions
glove_file.close()

embedding_matrix = zeros((vocab_size, 100))
for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

In [13]:
from keras.layers import Input
from keras.models import Model

deep_inputs = Input(shape=(maxlen,))
embedding_layer = Embedding(vocab_size, 100, weights=[embedding_matrix], trainable=False)(deep_inputs)
LSTM_Layer_1 = LSTM(128, return_sequences=True)(embedding_layer)
LSTM_Layer_2 = LSTM(128, return_sequences=False)(LSTM_Layer_1)
#adding a dense layer with activation function of relu
dense_layer_2 = Dense(5, activation='sigmoid')(LSTM_Layer_2)
model = Model(inputs=deep_inputs, outputs=dense_layer_2)

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
model.fit(X_train, y_train, batch_size=64, epochs=5, validation_data=(X_test, y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7ff2232b4890>

In [22]:
score = model.evaluate(X_test, y_test)
print("Test accuracy: %0.4f%%" % (score[1]*100))


Test accuracy: 70.4959%


## II RNN approach (removed stopwords)

In [48]:
# with stopwords
df = pd.read_csv('amazon_review_dataset.csv') #renamed
df = df[df['reviews.rating'].notnull() & df['reviews.text'].notnull()]
df= df[['reviews.rating','reviews.text']]

df.rename(columns = {'reviews.rating':'rating', 'reviews.text':'text'}, inplace = True)
df['text'][0]
cleaned = df['text']
print(cleaned[0])
stops = stopwords.words('english')

remove_punc = re.compile('[^a-z]+')
def clean_text(sent):
    sent = str(sent).lower()
    sent = remove_punc.sub(' ', sent).strip()
    filtered = [word for word in sent.split() if word not in stops]
    sentence = " ".join(filtered) #just joined -> laused
  
    #sentence = nltk.word_tokenize(sentence) #tokenized -> listid
    return sentence


clean_text(df['text'][0])
reviews = [clean_text(sent) for sent in df['text']]

I order 3 of them and one of the item is bad quality. Is missing backup spring so I have to put a pcs of aluminum to make the battery work.


In [49]:
df['text'] = reviews
df.head()


Unnamed: 0,rating,text
0,3,order one item bad quality missing backup spri...
1,4,bulk always less expensive way go products like
2,5,well duracell price happy
3,5,seem work well name brand batteries much bette...
4,5,batteries long lasting price great


In [50]:
y = pd.get_dummies(df.rating, prefix='rating_')
def clean_document(doco):
    punctuation = string.punctuation + '\n\n';
    punc_replace = ''.join([' ' for s in punctuation]);
    doco_clean = doco.replace('-', ' ');
    doco_alphas = re.sub(r'\W +', '', doco_clean)
    trans_table = str.maketrans(punctuation, punc_replace);
    doco_clean = ' '.join([word.translate(trans_table) for word in doco_alphas.split(' ')]);
    doco_clean = doco_clean.split(' ');
    doco_clean = [word.lower() for word in doco_clean if len(word) > 0];
    
    return doco_clean;

# Generate a cleaned reviews array from original review texts
#review_cleans = [clean_document(doc) for doc in reviews];
#sentences = [' '.join(r) for r in review_cleans]
sentences = [' '.join(r) for r in reviews]

In [53]:
X_train, X_test, y_train, y_test = train_test_split(reviews, y.values, test_size=0.20, random_state=42)

tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

vocab_size = len(tokenizer.word_index) + 1

maxlen = 200

X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

In [54]:
# using GLOVE word embeddings
embeddings_dictionary = dict()

glove_file = open('glove.6B.100d.txt', encoding="utf8")

for line in glove_file:
    records = line.split()
    word = records[0]
    vector_dimensions = asarray(records[1:], dtype='float32')
    embeddings_dictionary[word] = vector_dimensions
glove_file.close()

embedding_matrix = zeros((vocab_size, 100))
for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

In [55]:
from keras.layers import Input
from keras.models import Model

deep_inputs = Input(shape=(maxlen,))
embedding_layer = Embedding(vocab_size, 100, weights=[embedding_matrix], trainable=False)(deep_inputs)
LSTM_Layer_1 = LSTM(128, return_sequences=True)(embedding_layer)
LSTM_Layer_2 = LSTM(128, return_sequences=False)(LSTM_Layer_1)
#adding a dense layer with activation function of relu
dense_layer_2 = Dense(5, activation='sigmoid')(LSTM_Layer_2)
model = Model(inputs=deep_inputs, outputs=dense_layer_2)

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
model.fit(X_train, y_train, batch_size=64, epochs=5, validation_data=(X_test, y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7ff25f2e8fd0>

In [56]:
score = model.evaluate(X_test, y_test)
print("Test accuracy: %0.4f%%" % (score[1]*100))

Test accuracy: 70.4959%
