In [None]:
import numpy as np
import pandas as pd
from google.colab import files
import io
import json
import random
import nltk
import re
import string
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from numpy import array
from numpy import argmax
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers.core import Dense
from keras.layers import Flatten
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.metrics import r2_score, mean_squared_error
from keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D
from keras.layers.core import Activation, Dropout, Dense
from keras.layers import LSTM

# Data Preparation
Upload the dataset from Kaggle 

https://www.kaggle.com/jkgatt/restaurant-data-with-100-trip-advisor-reviews-each?select=factual_tripadvisor_restaurant_data_all_100_reviews.json

In [None]:
# Colab-only step: files.upload() opens a file picker and returns a mapping
# of uploaded file name -> raw file content (bytes).
DATASET_FILENAME = "factual_tripadvisor_restaurant_data_all_100_reviews.json"

uploaded = files.upload()
if DATASET_FILENAME not in uploaded:
    raise KeyError(
        "Expected an upload named '" + DATASET_FILENAME + "', got: " + str(list(uploaded))
    )

# json.load() no longer accepts an `encoding` keyword (removed in Python 3.9),
# so decode the uploaded bytes explicitly and parse the resulting string.
dataset = json.loads(uploaded[DATASET_FILENAME].decode("utf-8"))

Saving factual_tripadvisor_restaurant_data_all_100_reviews.json to factual_tripadvisor_restaurant_data_all_100_reviews.json


The dataset includes detailed information about 147 restaurants, such as restaurant name, address, and phone number. For this project, we focus only on the text reviews and the numerical rating scores. Therefore, the customer reviews and ratings for each restaurant are extracted and stored in a dataframe.

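Each restaurant has about 100 customer ratings and reviews (the dataframe below has 14,699 rows for 147 restaurants, so not every restaurant has exactly 100). The *Restaurant_id* column in the dataframe *data* indicates which restaurant each review/score belongs to.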

In [None]:
# Flatten the nested JSON into one row per review. Restaurant_id is the
# position of the restaurant in dataset["restaurants"].
restaurants = dataset["restaurants"]
# Derived from the data rather than hard-coded, so the loop can never index
# past the end of the list or silently ignore extra restaurants.
num_of_restaurant = len(restaurants)

reviews = []
ratings = []
restaurant_id = []

for i, restaurant in enumerate(restaurants):
    for review in restaurant["reviews"]:
        reviews.append(review["review_text"])
        ratings.append(review["review_rating"])
        restaurant_id.append(i)

data = pd.DataFrame(list(zip(ratings, reviews, restaurant_id)), columns=['Rating', 'Reviews', 'Restaurant_id'])
data

Unnamed: 0,Rating,Reviews,Restaurant_id
0,4,They have great local craft beers and probably...,0
1,4,We went to the downtown SF location. The resta...,0
2,4,I just came to this place for drinks with an o...,0
3,3,"Mediocre food (not bad, just mediocre, you can...",0
4,4,We headed out for our team dinner to this esta...,0
...,...,...,...
14695,5,Try the chicken and waffles or the biscuit san...,146
14696,3,We have eaten here many times and have always ...,146
14697,5,The restaurant is hard to find unless you know...,146
14698,4,I stayed at the Farmer's Daughter hotel from A...,146


The dataset is divided into a training set and a testing set. The testing set includes the reviews/scores of 30 randomly chosen restaurants. The remaining reviews form the training set, which is used to train the neural network.

In [None]:
# Hold out whole restaurants (not individual reviews) for testing, so every
# review of a test restaurant is unseen during training.
# int(num_of_restaurant*0.21) evaluates to 30 when there are 147 restaurants.
SPLIT_SEED = 42  # fixed seed so the same restaurants are held out on every run
n_test_restaurants = int(num_of_restaurant*0.21)

# A private Random instance keeps the split reproducible without touching the
# state of the global `random` module.
selected_restaurant = random.Random(SPLIT_SEED).sample(
    list(np.unique(data["Restaurant_id"])), n_test_restaurants
)

# One boolean mask replaces the row-by-row DataFrame.append loop: append is
# quadratic and was removed in pandas 2.0, and starting from empty frames
# turned the columns into object dtype. Masking keeps the rows in their
# original (restaurant id) order and keeps the original index labels.
is_test = data["Restaurant_id"].isin(selected_restaurant)
train_data = data.loc[~is_test]
test_data = data.loc[is_test]

The next step is to preprocess the review text. Digits, punctuation, repeated whitespace, and stop words are removed, and all letters are converted to lowercase.

In [None]:
# Fetch the NLTK stop-word corpus, then keep the English list as a set so
# the membership test inside text_preprocess is a hash lookup.
nltk.download('stopwords')
stop_words = set()
stop_words.update(stopwords.words('english'))

def text_preprocess(review):
  """Normalise one raw review into a string of space-separated tokens.

  Processing order:
    1. every run of non-ASCII characters becomes a single space;
    2. digits are deleted and each punctuation character becomes a space;
    3. the text is lower-cased and whitespace runs are collapsed;
    4. tokens found in the module-level ``stop_words`` set are dropped.

  Args:
    review: the raw review text (str).

  Returns:
    The cleaned review as a single str; an empty string if nothing survives.
  """
  # One table does both jobs: the third argument deletes digits, while the
  # first two map every punctuation character to a space.
  cleanup_table = str.maketrans(
      string.punctuation, ' ' * len(string.punctuation), string.digits)

  ascii_only = re.sub(r'[^\x00-\x7F]+', ' ', review)
  normalised = ascii_only.translate(cleanup_table).lower()
  normalised = re.sub(r'\s+', ' ', normalised)

  kept_tokens = filter(lambda token: token not in stop_words, normalised.split())
  return " ".join(kept_tokens)

# Training split: labels, restaurant ids and raw text are pulled out as plain
# lists, then every review is run through text_preprocess.
train_y = list(train_data["Rating"])
train_id = list(train_data["Restaurant_id"])
train_text = list(train_data["Reviews"])
train_x = list(map(text_preprocess, train_text))

# Test split: same treatment, so both splits are cleaned identically.
test_y = list(test_data["Rating"])
test_id = list(test_data["Restaurant_id"])
test_text = list(test_data["Reviews"])
test_x = list(map(text_preprocess, test_text))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


The *Tokenizer* class in Keras is used to convert the text data into numerical data. Each review is then padded or truncated to a fixed length of 100 tokens.

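The *fit_on_texts* method builds the vocabulary from the training set and assigns each word an integer index according to its frequency: the lower the index, the more often the word occurs in the training set. Because *num_words* is set to 5000, only the most frequent words (indices below 5000) are kept when the reviews are converted to sequences.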

Then the *texts_to_sequences* method transforms the training and testing reviews into sequences of integers by replacing each word with its index.

In [None]:
# The vocabulary is learned from the training reviews only; the test reviews
# are encoded with that same vocabulary further down.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_x)

# NOTE(review): `max` shadows the built-in max() for the rest of the notebook.
# The name is kept because the model cells below read it as the input length.
max = 100

# Encode each split as integer sequences, then pad at the end ('post') so
# that every row has exactly `max` entries.
train_x, test_x = (
    pad_sequences(tokenizer.texts_to_sequences(texts), padding='post', maxlen=max)
    for texts in (train_x, test_x)
)

# The Embedding Layer

GloVe stands for Global Vectors for Word Representation. It is an unsupervised learning algorithm for obtaining vector representations of words. GloVe is trained on aggregated global word-word co-occurrence statistics from a corpus. [1]

In this project, pre-trained word vectors are used in the embedding layer. Pre-trained vectors of different dimensions (50d, 100d, 200d, 300d) can be chosen and applied in the neural network, and the prediction performance obtained with each dimension can then be compared.

In [None]:
# -nc (no-clobber): skip the 822 MB download when glove.6B.zip already exists,
# so re-running the notebook does not fetch the archive again.
!wget -nc http://nlp.stanford.edu/data/glove.6B.zip

--2020-08-10 02:38:00--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2020-08-10 02:38:00--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2020-08-10 02:38:00--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2020-0

In [None]:
# -n: never overwrite files that are already extracted. Without it a re-run
# stops at an interactive "replace?" prompt that a notebook cell cannot answer.
!unzip -n glove.6B.zip

Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [None]:
import os

glove_dir = './'

# Maps each GloVe token to its vector (a float32 numpy array).
embeddings_index = {}

# here we choose the 100d pre-trained word vector
glove_path = os.path.join(glove_dir, 'glove.6B.100d.txt')

# The context manager closes the file even if parsing a line fails, and the
# explicit encoding avoids depending on the platform's default encoding.
with open(glove_path, encoding='utf-8') as f:
    for line in f:
        # Each line is parsed as: token, then the vector components.
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [None]:
# Must match the dimensionality of the GloVe file loaded into embeddings_index.
embedding_dim = 100

# One row per tokenizer index, plus one extra row for index 0.
# NOTE(review): Keras word indices start at 1 and 0 is the padding value used
# by pad_sequences, so row 0 is expected to stay all-zero - confirm if the
# tokenizer configuration changes.
num_words = len(tokenizer.word_index) + 1
embedding_matrix = np.zeros((num_words, embedding_dim))

for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    # Words without a pre-trained vector keep their all-zero row.
    # (The former `i < num_words` guard was always true, because num_words is
    # defined as len(word_index) + 1, so it has been dropped.)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Feed Forward Neural Network

In [None]:
# Convert the integer ratings into one-hot vectors.
# num_classes is fixed at 6 to match the 6-unit output layer of the models
# below. Left unset, to_categorical infers the width from the largest label
# present, so a split without the top rating would give a narrower matrix that
# no longer fits the models.
train_y = array(train_y)
encoded_train_y = to_categorical(train_y, num_classes=6)

test_y = array(test_y)
encoded_test_y = to_categorical(test_y, num_classes=6)

Input Layer -> Embedding Layer -> Dense Layer with Sigmoid activation function 

In [None]:
# Feed-forward baseline: frozen GloVe embeddings -> flatten -> 6-unit output.
# NOTE(review): one-hot targets are trained here with sigmoid + MSE; softmax
# with categorical cross-entropy is the usual pairing. Left as-is so the
# reported results stay comparable.
model = Sequential([
    Embedding(num_words, embedding_dim, weights=[embedding_matrix],
              input_length=max, trainable=False),
    Flatten(),
    Dense(6, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['acc'])

# The last 10% of the training rows are held out for validation.
model.fit(train_x, encoded_train_y, batch_size=200, epochs=30, validation_split=0.1)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7ffab8f39630>

In [None]:
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" to the output.
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 100, 100)          1883400   
_________________________________________________________________
flatten_2 (Flatten)          (None, 10000)             0         
_________________________________________________________________
dense_2 (Dense)              (None, 6)                 60006     
Total params: 1,943,406
Trainable params: 60,006
Non-trainable params: 1,883,400
_________________________________________________________________
None


Because this is not a classification problem, *R2* and *mean squared error* are used to measure the rating prediction performance. 

In [None]:
# Predicted rating per review = index of the largest output unit.
encoded_predict_y = model.predict(test_x)
predict_y = np.argmax(encoded_predict_y, axis=1)

# Per-restaurant mean of predicted and actual ratings, ordered by restaurant
# id. Grouping on test_id replaces the former fixed 100-row slices, which
# assumed every restaurant has exactly 100 reviews; the full dataframe has
# 14,699 rows for 147 restaurants, so that assumption does not always hold.
per_review = pd.DataFrame({
    "restaurant_id": test_id,
    "predicted": predict_y,
    "actual": test_y,
})
per_restaurant = per_review.groupby("restaurant_id", sort=True)[["predicted", "actual"]].mean()
predict_restaurant_y = per_restaurant["predicted"].tolist()
actual_restaurant_y = per_restaurant["actual"].tolist()

# The reported metrics are computed per review, not per restaurant.
print("R2: "+ str(r2_score(test_y, predict_y)))
print("mean_squared_error: "+ str(mean_squared_error(test_y, predict_y)))

R2: -0.11385912554017152
mean_squared_error: 1.0773333333333333


# Convolutional Neural Network

Input Layer -> Embedding Layer -> Convolutional Layer -> Pooling Layer -> Dense Layer with Sigmoid activation function 

In [None]:
# CNN: frozen GloVe embeddings -> 1D convolution (kernel size 5) -> global
# max pooling -> dropout -> 6-unit output.
# NOTE(review): one-hot targets are trained here with sigmoid + MSE; softmax
# with categorical cross-entropy is the usual pairing. Left as-is so the
# reported results stay comparable.
model = Sequential([
    Embedding(num_words, embedding_dim, weights=[embedding_matrix],
              input_length=max, trainable=False),
    Conv1D(embedding_dim, 5, activation="relu"),
    GlobalMaxPooling1D(),
    Dropout(0.5),
    Dense(6, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['acc'])

# The last 10% of the training rows are held out for validation.
model.fit(train_x, encoded_train_y, batch_size=200, epochs=30, validation_split=0.1)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7ffab757a550>

In [None]:
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" to the output.
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 100, 100)          1883400   
_________________________________________________________________
conv1d (Conv1D)              (None, 96, 100)           50100     
_________________________________________________________________
global_max_pooling1d (Global (None, 100)               0         
_________________________________________________________________
dropout (Dropout)            (None, 100)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 6)                 606       
Total params: 1,934,106
Trainable params: 50,706
Non-trainable params: 1,883,400
_________________________________________________________________
None


Calculate the prediction performance

In [None]:
# Predicted rating per review = index of the largest output unit.
encoded_predict_y = model.predict(test_x)
predict_y = np.argmax(encoded_predict_y, axis=1)

# Per-restaurant mean of predicted and actual ratings, ordered by restaurant
# id. Grouping on test_id replaces the former fixed 100-row slices, which
# assumed every restaurant has exactly 100 reviews; the full dataframe has
# 14,699 rows for 147 restaurants, so that assumption does not always hold.
per_review = pd.DataFrame({
    "restaurant_id": test_id,
    "predicted": predict_y,
    "actual": test_y,
})
per_restaurant = per_review.groupby("restaurant_id", sort=True)[["predicted", "actual"]].mean()
predict_restaurant_y = per_restaurant["predicted"].tolist()
actual_restaurant_y = per_restaurant["actual"].tolist()

# The reported metrics are computed per review, not per restaurant.
print("R2: "+ str(r2_score(test_y, predict_y)))
print("mean_squared_error: "+ str(mean_squared_error(test_y, predict_y)))

R2: 0.12600657723704356
mean_squared_error: 0.8453333333333334


# Recurrent Neural Network (LSTM)

Input Layer -> Embedding Layer -> LSTM Layer with 100 neurons -> Dense Layer with Sigmoid activation function 

In [None]:
# LSTM: frozen GloVe embeddings -> 100-unit LSTM (dropout 0.2 and recurrent
# dropout 0.2) -> 6-unit output.
# NOTE(review): one-hot targets are trained here with sigmoid + MSE; softmax
# with categorical cross-entropy is the usual pairing. Left as-is so the
# reported results stay comparable.
model = Sequential([
    Embedding(num_words, embedding_dim, weights=[embedding_matrix],
              input_length=max, trainable=False),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(6, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['acc'])

# The last 10% of the training rows are held out for validation.
model.fit(train_x, encoded_train_y, batch_size=200, epochs=30, validation_split=0.1)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7ffab5b7a668>

In [None]:
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" to the output.
model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 100, 100)          1883400   
_________________________________________________________________
lstm (LSTM)                  (None, 100)               80400     
_________________________________________________________________
dense_4 (Dense)              (None, 6)                 606       
Total params: 1,964,406
Trainable params: 81,006
Non-trainable params: 1,883,400
_________________________________________________________________
None


Calculate the prediction performance

In [None]:
# Predicted rating per review = index of the largest output unit.
encoded_predict_y = model.predict(test_x)
predict_y = np.argmax(encoded_predict_y, axis=1)

# Per-restaurant mean of predicted and actual ratings, ordered by restaurant
# id. Grouping on test_id replaces the former fixed 100-row slices, which
# assumed every restaurant has exactly 100 reviews; the full dataframe has
# 14,699 rows for 147 restaurants, so that assumption does not always hold.
per_review = pd.DataFrame({
    "restaurant_id": test_id,
    "predicted": predict_y,
    "actual": test_y,
})
per_restaurant = per_review.groupby("restaurant_id", sort=True)[["predicted", "actual"]].mean()
predict_restaurant_y = per_restaurant["predicted"].tolist()
actual_restaurant_y = per_restaurant["actual"].tolist()

# The reported metrics are computed per review, not per restaurant.
print("R2: "+ str(r2_score(test_y, predict_y)))
print("mean_squared_error: "+ str(mean_squared_error(test_y, predict_y)))

R2: 0.3417478558843664
mean_squared_error: 0.6366666666666667


# Reference
[1] J. Pennington, GloVe: Global Vectors for Word Representation. [Online]. Available: https://nlp.stanford.edu/projects/glove/.