<a href="https://colab.research.google.com/github/nommrichard/prod_rating/blob/main/rnn_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Predicting product rating based on review text 

## Project in LTAT.01.001 Natural language processing

#### Team members: Karl Jaagup Kask, Ludvig Leis, Richard Nõmm

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re 
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize


!python3 -m nltk.downloader stopwords
!python3 -m nltk.downloader punkt

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
RANDOM_SEED = 100

The data we are using: https://www.kaggle.com/datafiniti/consumer-reviews-of-amazon-products

## Data preprocessing (data file needs to be imported)

In [None]:
df = pd.read_csv('amazon_review_dataset.csv') #renamed

In [None]:
df = df[df['reviews.rating'].notnull() & df['reviews.text'].notnull()]
df= df[['reviews.rating','reviews.text']]

df.rename(columns = {'reviews.rating':'rating', 'reviews.text':'text'}, inplace = True)
df['text'][0]
cleaned = df['text']
print(cleaned[0])

I order 3 of them and one of the item is bad quality. Is missing backup spring so I have to put a pcs of aluminum to make the battery work.


In [None]:

stops = stopwords.words('english')

remove_punc = re.compile('[^a-z]+')
def clean_text(sent):
    sent = str(sent).lower()
    sent = remove_punc.sub(' ', sent).strip()
    filtered = [word for word in sent.split()]
    sentence = " ".join(filtered) #just joined -> laused
  
    #sentence = nltk.word_tokenize(sentence) #tokenized -> listid
    return sentence


clean_text(df['text'][0])


'i order of them and one of the item is bad quality is missing backup spring so i have to put a pcs of aluminum to make the battery work'

In [None]:
reviews = [clean_text(sent) for sent in df['text']]

In [None]:
df.head()

Unnamed: 0,rating,text
0,3,I order 3 of them and one of the item is bad q...
1,4,Bulk is always the less expensive way to go fo...
2,5,Well they are not Duracell but for the price i...
3,5,Seem to work as well as name brand batteries a...
4,5,These batteries are very long lasting the pric...


## I CNN approach

In [None]:
y = pd.get_dummies(df.rating, prefix='rating_')
y.head()

Unnamed: 0,rating__1,rating__2,rating__3,rating__4,rating__5
0,0,0,1,0,0
1,0,0,0,1,0
2,0,0,0,0,1
3,0,0,0,0,1
4,0,0,0,0,1


In [None]:
import string

def clean_document(doco):
    punctuation = string.punctuation + '\n\n';
    punc_replace = ''.join([' ' for s in punctuation]);
    doco_clean = doco.replace('-', ' ');
    doco_alphas = re.sub(r'\W +', '', doco_clean)
    trans_table = str.maketrans(punctuation, punc_replace);
    doco_clean = ' '.join([word.translate(trans_table) for word in doco_alphas.split(' ')]);
    doco_clean = doco_clean.split(' ');
    doco_clean = [word.lower() for word in doco_clean if len(word) > 0];
    
    return doco_clean;

# Generate a cleaned reviews array from original review texts
review_cleans = [clean_document(doc) for doc in reviews];
sentences = [' '.join(r) for r in review_cleans]

In [None]:
print(sentences[0:2])
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.backend import eval
from keras.optimizers import Adam
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.layers import Dropout
from keras.layers.convolutional import Conv1D,MaxPooling1D
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(sentences, y.values, test_size=0.20, random_state=42)

tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

vocab_size = len(tokenizer.word_index) + 1

maxlen = 200

X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)


['order one item bad quality missing backup spring put pcs aluminum make battery work', 'bulk always less expensive way go products like']


In [None]:
from keras.models import Sequential
from keras.layers import Dense, Conv2D, Flatten, MaxPooling2D
import tensorflow as tf
from keras.callbacks import EarlyStopping

es = EarlyStopping(monitor='val_loss', mode='min', verbose=1)

model = Sequential()      # initilaizing the Sequential nature for CNN model
model.add(Embedding(5000, 32, input_length=maxlen))
model.add(Conv1D(32, 3, padding='same', activation='relu'))
model.add(MaxPooling1D())
model.add(Flatten())
model.add(Dense(250, activation='relu'))
model.add(Dense(5, activation='sigmoid'))

'''model = Sequential()
model.add(Embedding(5000, 100, input_length=maxlen))
model.add(Conv1D(filters=32, kernel_size=8, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(10, activation='relu'))
model.add(Dense(5, activation='sigmoid'))'''

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=5, batch_size=128, verbose=2, callbacks=[es])

Epoch 1/5
178/178 - 10s - loss: 0.3060 - accuracy: 0.7002 - val_loss: 0.2647 - val_accuracy: 0.7233
Epoch 2/5
178/178 - 9s - loss: 0.2375 - accuracy: 0.7467 - val_loss: 0.2375 - val_accuracy: 0.7521
Epoch 3/5
178/178 - 9s - loss: 0.1961 - accuracy: 0.7970 - val_loss: 0.2255 - val_accuracy: 0.7702
Epoch 4/5
178/178 - 9s - loss: 0.1636 - accuracy: 0.8356 - val_loss: 0.2191 - val_accuracy: 0.7860
Epoch 5/5
178/178 - 9s - loss: 0.1314 - accuracy: 0.8734 - val_loss: 0.2219 - val_accuracy: 0.7992
Epoch 00005: early stopping


<tensorflow.python.keras.callbacks.History at 0x7f3ecd6a0a50>

In [None]:
score = model.evaluate(X_test, y_test)
print("Test accuracy: %0.4f%%" % (score[1]*100))


Test accuracy: 79.9188%


## II CNN approach (removed stopwords)

In [None]:
# with stopwords
df = pd.read_csv('amazon_review_dataset.csv') #renamed
df = df[df['reviews.rating'].notnull() & df['reviews.text'].notnull()]
df= df[['reviews.rating','reviews.text']]

df.rename(columns = {'reviews.rating':'rating', 'reviews.text':'text'}, inplace = True)
df['text'][0]
cleaned = df['text']
print(cleaned[0])
stops = stopwords.words('english')

remove_punc = re.compile('[^a-z]+')
def clean_text(sent):
    sent = str(sent).lower()
    sent = remove_punc.sub(' ', sent).strip()
    filtered = [word for word in sent.split() if word not in stops]
    sentence = " ".join(filtered) #just joined -> laused
  
    #sentence = nltk.word_tokenize(sentence) #tokenized -> listid
    return sentence


clean_text(df['text'][0])
reviews = [clean_text(sent) for sent in df['text']]

I order 3 of them and one of the item is bad quality. Is missing backup spring so I have to put a pcs of aluminum to make the battery work.


In [None]:
df['text'] = reviews
df.head()


Unnamed: 0,rating,text
0,3,order one item bad quality missing backup spri...
1,4,bulk always less expensive way go products like
2,5,well duracell price happy
3,5,seem work well name brand batteries much bette...
4,5,batteries long lasting price great


In [None]:
y = pd.get_dummies(df.rating, prefix='rating_')
def clean_document(doco):
    punctuation = string.punctuation + '\n\n';
    punc_replace = ''.join([' ' for s in punctuation]);
    doco_clean = doco.replace('-', ' ');
    doco_alphas = re.sub(r'\W +', '', doco_clean)
    trans_table = str.maketrans(punctuation, punc_replace);
    doco_clean = ' '.join([word.translate(trans_table) for word in doco_alphas.split(' ')]);
    doco_clean = doco_clean.split(' ');
    doco_clean = [word.lower() for word in doco_clean if len(word) > 0];
    
    return doco_clean;

# Generate a cleaned reviews array from original review texts
#review_cleans = [clean_document(doc) for doc in reviews];
#sentences = [' '.join(r) for r in review_cleans]
sentences = [' '.join(r) for r in reviews]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(reviews, y.values, test_size=0.20, random_state=42)

tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

vocab_size = len(tokenizer.word_index) + 1

maxlen = 200

X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Conv2D, Flatten, MaxPooling2D
import tensorflow as tf

es = EarlyStopping(monitor='val_loss', mode='min', verbose=1)

model = Sequential()      # initilaizing the Sequential nature for CNN model
model.add(Embedding(5000, 32, input_length=maxlen))
model.add(Conv1D(32, 3, padding='same', activation='relu'))
model.add(MaxPooling1D())
model.add(Flatten())
model.add(Dense(250, activation='relu'))
model.add(Dense(5, activation='sigmoid'))

'''model = Sequential()
model.add(Embedding(5000, 100, input_length=maxlen))
model.add(Conv1D(filters=32, kernel_size=8, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(10, activation='relu'))
model.add(Dense(5, activation='sigmoid'))'''

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=128, verbose=2, callbacks=[es])

Epoch 1/10
178/178 - 10s - loss: 0.3040 - accuracy: 0.7009 - val_loss: 0.2662 - val_accuracy: 0.7238
Epoch 2/10
178/178 - 9s - loss: 0.2352 - accuracy: 0.7492 - val_loss: 0.2394 - val_accuracy: 0.7514
Epoch 3/10
178/178 - 9s - loss: 0.1962 - accuracy: 0.7984 - val_loss: 0.2255 - val_accuracy: 0.7683
Epoch 4/10
178/178 - 9s - loss: 0.1623 - accuracy: 0.8371 - val_loss: 0.2199 - val_accuracy: 0.7867
Epoch 5/10
178/178 - 9s - loss: 0.1312 - accuracy: 0.8739 - val_loss: 0.2208 - val_accuracy: 0.8004
Epoch 00005: early stopping


<tensorflow.python.keras.callbacks.History at 0x7f3ecd6fce10>

In [None]:
score = model.evaluate(X_test, y_test)
print("Test accuracy: %0.4f%%" % (score[1]*100))

Test accuracy: 80.0424%
