In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, LSTM, Embedding, Flatten, GRU
from keras.utils import to_categorical
%matplotlib inline
from keras.models import Sequential
from sklearn.preprocessing import LabelEncoder
from keras.callbacks import EarlyStopping
from google.colab import drive
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re
import random

In [None]:
# Mount Google Drive inside the Colab VM at /content/drive.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Switch the working directory to the Drive folder; the data files below are
# opened by bare file name, so they are resolved relative to this directory.
%cd ..
%cd /content/drive/MyDrive
%pwd

/
/content/drive/MyDrive


'/content/drive/MyDrive'

In [None]:
# Download the NLTK data packages named 'punkt' and 'wordnet'
# (presumably required by word_tokenize and WordNetLemmatizer used below).
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Open the Yelp business and review dumps (one JSON object per line).
# Passing `chunksize` makes read_json return an iterable reader that yields
# DataFrames of up to 100,000 rows instead of loading the whole file at once.
businesses = pd.read_json("yelp_academic_dataset_business.json", lines=True, orient='columns', chunksize=100000)
reviews = pd.read_json("yelp_academic_dataset_review.json", lines=True, orient='columns', chunksize=100000)

In [None]:
# Take only the first chunk from each reader; the loops break immediately,
# so this cell reads at most 100,000 rows per file.
# NOTE: the loop variables `business` and `review` remain bound after `break`
# and refer to the same objects as `business_chunk` / `review_chunk`.
for business in businesses:
    business_chunk = business
    break

for review in reviews:
    review_chunk = review
    break

In [None]:
# Preview the first two rows of each chunk to check the columns that were loaded.
display(business_chunk.head(2))
display(review_chunk.head(2))

In [None]:
# Keep only businesses whose category string mentions "Restaurant", then keep
# the reviews written for those businesses.
# The mask is built from `business_chunk` itself (not the leftover loop
# variable `business`), and `na=False` treats a missing category as
# "not a restaurant" instead of producing NaN in the mask.
is_restaurant = business_chunk['categories'].str.contains('Restaurant', na=False)
a = business_chunk[is_restaurant]
# .copy() gives `rev` its own data so later column assignments do not write
# into a slice of `review_chunk`.
rev = review_chunk[review_chunk.business_id.isin(a['business_id'])].copy()
rev.shape

(72125, 9)

In [None]:
# Peek at the star ratings of the first few restaurant reviews.
rev['stars'].head()

0    3
2    3
3    5
4    4
5    1
Name: stars, dtype: int64

In [None]:
# Number of reviews drawn for each of the three training sets below.
sample_size = 35000

# Baseline set: a sample across all star ratings.
rev_sample = rev.sample(n=sample_size, random_state=42)

# Split the reviews into "extreme" (1 or 5 stars) and "middle" (2, 3 or 4 stars)
# ratings, then draw a fixed-size, seeded sample from each group.
extreme_mask = rev['stars'].isin([1,5])
middle_mask = rev['stars'].isin([2,3,4])
rev_score1 = rev[extreme_mask]
rev_score2 = rev[middle_mask]
rev_score1_sample = rev_score1.sample(n=sample_size, random_state=12)
rev_score2_sample = rev_score2.sample(n=sample_size, random_state=22)

(rev_score1_sample.shape, rev_score2_sample.shape)

((35000, 9), (35000, 9))

In [None]:
# Length every review is padded/truncated to (passed to pad_sequences as `maxlen`).
maxlen = 500
# Passed to each Tokenizer as `num_words`: despite the name, this caps the
# vocabulary size, not the number of reviews.
review_cnt_limit = 15000

# File holding the pretrained GloVe word vectors (opened relative to the working directory).
embedding_file = 'glove.6B.300d.txt'

# read in embeddings
def load_glove_embeddings(embedding_file_path):
    """Parse a GloVe-style text file into a ``{word: vector}`` dictionary.

    Each non-blank line is expected to hold a token followed by its
    whitespace-separated vector components.

    Args:
        embedding_file_path: Path to the UTF-8 encoded embedding file.

    Returns:
        dict mapping each token (str) to a 1-D ``float32`` numpy array.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a vector component cannot be parsed as a float.
    """
    embeddings_index = {}
    with open(embedding_file_path, 'r', encoding='utf-8') as file:
        for line in file:
            values = line.split()
            if not values:
                # Blank line (e.g. a trailing newline): nothing to parse, and
                # values[0] would otherwise raise IndexError.
                continue
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')
    return embeddings_index

# Load the GloVe vectors into memory as a {word: vector} dictionary.
glove_embeddings = load_glove_embeddings(embedding_file)

In [None]:
# Show how many word vectors were loaded instead of dumping the whole
# dictionary into the cell output.
len(glove_embeddings)

In [None]:
def clean_and_tokenize(string):
    """Strip punctuation from ``string``, tokenize it and lemmatize each token.

    Every character that is neither a word character nor whitespace is removed
    before tokenization. Tokens keep their original casing.

    Args:
        string: Raw review text.

    Returns:
        list of lemmatized tokens, in their original order.
    """
    without_punctuation = re.sub(r"[^\w\s]", "", string)
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token in word_tokenize(without_punctuation):
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

In [None]:
# Keep only the review text and its rating. `.copy()` gives each frame its own
# data, so adding the `lemmatized_text` column afterwards does not write into a
# slice of `rev` (which triggers SettingWithCopyWarning).
rev_sample = rev_sample[['text','stars']].copy()
rev_score1_sample = rev_score1_sample[['text','stars']].copy()
rev_score2_sample = rev_score2_sample[['text','stars']].copy()

In [None]:
# Add a `lemmatized_text` column (list of cleaned, lemmatized tokens) to each
# of the three samples.
for frame in (rev_sample, rev_score1_sample, rev_score2_sample):
    frame["lemmatized_text"] = frame["text"].apply(clean_and_tokenize)

In [None]:
def fit_tokenizer_and_pad(token_lists, vocab_limit, pad_to):
    """Fit a Keras Tokenizer on ``token_lists`` and encode them as padded id sequences.

    Args:
        token_lists: Iterable of token lists (one list per review).
        vocab_limit: Passed to ``Tokenizer(num_words=...)``.
        pad_to: Passed to ``pad_sequences(maxlen=...)``.

    Returns:
        (tokenizer, sequences, padded_sequences)
    """
    fitted = Tokenizer(num_words=vocab_limit)
    fitted.fit_on_texts(token_lists)
    encoded = fitted.texts_to_sequences(token_lists)
    return fitted, encoded, pad_sequences(encoded, maxlen=pad_to)


# Lemmatized data for the baseline model (predicts the original 1-5 star score).
rev_lemmatized = rev_sample[['lemmatized_text', 'stars']]
tokenizer, sequence, padded_sequences = fit_tokenizer_and_pad(
    rev_lemmatized['lemmatized_text'], review_cnt_limit, maxlen)

# Lemmatized data for model 1, the extreme-score model (1 or 5 stars vs. 2, 3 or 4).
# .copy() because a label column is added to this frame in a later cell.
rev_lemmatized_extreme = rev_sample[['lemmatized_text', 'stars']].copy()
tokenizer_extreme, sequence_extreme, padded_sequences_extreme = fit_tokenizer_and_pad(
    rev_lemmatized_extreme['lemmatized_text'], review_cnt_limit, maxlen)

# Lemmatized data for model 2, the 1-star vs. 5-star model.
rev_lemmatized_15 = rev_score1_sample[['lemmatized_text', 'stars']]
tokenizer_15, sequence_15, padded_sequences_15 = fit_tokenizer_and_pad(
    rev_lemmatized_15['lemmatized_text'], review_cnt_limit, maxlen)

# Lemmatized data for model 3, the 2/3/4-star model.
rev_lemmatized_234 = rev_score2_sample[['lemmatized_text', 'stars']]
tokenizer_234, sequence_234, padded_sequences_234 = fit_tokenizer_and_pad(
    rev_lemmatized_234['lemmatized_text'], review_cnt_limit, maxlen)

In [None]:
# Binary label for model 1: 1 when the review has 1 or 5 stars, 0 otherwise.
# The frame is rebuilt from `rev_sample` (the same columns it was originally
# sliced from) with `.assign`, so this cell does not write into a slice and
# can be re-run safely.
rev_lemmatized_extreme = (
    rev_sample[['lemmatized_text', 'stars']]
    .assign(extreme_rate=lambda frame: frame['stars'].isin([1, 5]).astype('int64'))
    [['lemmatized_text', 'extreme_rate']]
)

In [None]:
# Inspect the first rows of the token lists and their extreme / non-extreme label.
rev_lemmatized_extreme.head(20)

Unnamed: 0,lemmatized_text,extreme_rate
22934,"[Hey, kid, did, you, know, that, Vietnam, wa, ...",0
31476,"[If, you, think, you, are, walking, into, a, t...",1
33033,"[This, place, is, pretty, good, A, little, on,...",0
72060,"[Enjoy, a, taste, of, classic, Nashville, but,...",0
41227,"[Not, closed, They, have, temporary, hour, rig...",0
97648,"[An, exquisite, and, delicious, Parisian, esca...",1
37395,"[Solid, 35, star, I, wa, pleasantly, surprised...",0
65991,"[First, impression, bad, restaurant, hostess, ...",1
74883,"[Sweet, and, Sassy, seems, to, be, a, regular,...",1
12681,"[Love, this, place, their, green, passion, smo...",1


In [None]:
# Sanity check: row count and the two remaining columns.
rev_lemmatized_extreme.shape

(35000, 2)

In [None]:
# Label encoding for the four models.
# The ratings are split into two groups, {1, 5} and {2, 3, 4}, via the extra
# `extreme_rate` feature: 1- and 5-star reviews were predicted more accurately,
# so they are separated out and a dedicated model is trained on 2, 3 and 4 only.
# NOTE: one LabelEncoder is re-fitted for every target, so after this cell
# `label_encoder.classes_` reflects only the LAST fit (the 2/3/4 stars).
label_encoder = LabelEncoder()
# Target for the baseline model: the original star rating.
y = label_encoder.fit_transform(rev_lemmatized['stars'])

# Target for the extreme model: 1 for 1- or 5-star reviews, 0 for 2, 3 or 4 stars.
y_extreme = label_encoder.fit_transform(rev_lemmatized_extreme['extreme_rate'])

# Targets for the two models that predict the actual rating within each group.
y_score_15 = label_encoder.fit_transform(rev_lemmatized_15['stars'])
y_score_234 = label_encoder.fit_transform(rev_lemmatized_234['stars'])

In [None]:
# One-hot encode every integer target (the models are compiled with
# categorical cross-entropy below).
y = to_categorical(y)
y_extreme = to_categorical(y_extreme)
y_score_15 = to_categorical(y_score_15)
y_score_234 = to_categorical(y_score_234)

In [None]:
# Hold-out splits, one per model. The baseline and extreme models keep 30% for
# testing; the 1/5 and 2/3/4 models keep 40%. Each split uses its own fixed seed.
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, y, test_size = 0.3, random_state=42)
X_train_extreme, X_test_extreme, y_train_extreme, y_test_extreme = train_test_split(padded_sequences_extreme, y_extreme, test_size = 0.3, random_state=52)
X_train_15, X_test_15, y_train_15, y_test_15 = train_test_split(padded_sequences_15, y_score_15, test_size = 0.4, random_state=100)
X_train_234, X_test_234, y_train_234, y_test_234 = train_test_split(padded_sequences_234, y_score_234, test_size = 0.4, random_state=101)

In [None]:
# Look up the pretrained GloVe vector of every word known to each tokenizer and
# store it in the row matching that word's tokenizer index.
embedding_dimension = 300


def build_embedding_matrix(word_to_index, pretrained, vocab_limit, dimension):
    """Build the weight matrix of an Embedding layer from pretrained vectors.

    Args:
        word_to_index: ``Tokenizer.word_index`` mapping (word -> integer id).
        pretrained: dict mapping word -> vector of length ``dimension``.
        vocab_limit: Ids greater than this are skipped.
        dimension: Length of each embedding vector.

    Returns:
        (row_count, matrix), where ``matrix`` has shape
        ``(row_count, dimension)``. Rows of words that have no pretrained
        vector (and row 0, which no word id uses) stay all-zero.
    """
    row_count = min(vocab_limit, len(word_to_index)) + 1
    matrix = np.zeros((row_count, dimension))
    for word, word_id in word_to_index.items():
        if word_id > vocab_limit:
            continue
        vector = pretrained.get(word)
        if vector is not None:
            matrix[word_id] = vector
    return row_count, matrix


# Original score
word_index = tokenizer.word_index
num_words, embedding_matrix = build_embedding_matrix(
    word_index, glove_embeddings, review_cnt_limit, embedding_dimension)

# Extreme score
word_index_extreme = tokenizer_extreme.word_index
num_words_extreme, embedding_matrix_extreme = build_embedding_matrix(
    word_index_extreme, glove_embeddings, review_cnt_limit, embedding_dimension)

# 1 star & 5 star score
word_index_15 = tokenizer_15.word_index
num_words_15, embedding_matrix_15 = build_embedding_matrix(
    word_index_15, glove_embeddings, review_cnt_limit, embedding_dimension)

# 2, 3, 4 star score
word_index_234 = tokenizer_234.word_index
num_words_234, embedding_matrix_234 = build_embedding_matrix(
    word_index_234, glove_embeddings, review_cnt_limit, embedding_dimension)

In [None]:
# Four networks share one architecture and differ only in inputs and labels:
#   model          - baseline, predicts the star rating directly
#   model_extreme  - predicts whether the rating is extreme (1 or 5) or not
#   model_15       - predicts 1 vs. 5 stars
#   model_234      - predicts 2, 3 or 4 stars
def build_lstm_classifier(vocab_size, pretrained_matrix, n_classes):
    """Frozen-embedding -> LSTM(40) -> Dense(32, relu) -> softmax classifier.

    Args:
        vocab_size: Number of rows of the embedding matrix.
        pretrained_matrix: Embedding weights of shape (vocab_size, embedding_dimension).
        n_classes: Width of the softmax output layer.

    Returns:
        An uncompiled ``Sequential`` model.
    """
    net = Sequential()
    net.add(Embedding(vocab_size, embedding_dimension, input_length=maxlen,
                      weights=[pretrained_matrix], trainable=False))
    net.add(LSTM(40))
    net.add(Dense(32, activation='relu'))
    # Softmax on every head: all four models are trained on one-hot targets
    # with categorical cross-entropy, so the outputs must form a distribution
    # (the extreme model previously used 'sigmoid' here).
    net.add(Dense(n_classes, activation='softmax'))
    return net


# The output width is taken from the one-hot targets themselves rather than
# from hard-coded numbers or the state of the shared LabelEncoder.
model = build_lstm_classifier(num_words, embedding_matrix, y.shape[1])
model_extreme = build_lstm_classifier(num_words_extreme, embedding_matrix_extreme, y_extreme.shape[1])
model_15 = build_lstm_classifier(num_words_15, embedding_matrix_15, y_score_15.shape[1])
model_234 = build_lstm_classifier(num_words_234, embedding_matrix_234, y_score_234.shape[1])

In [None]:
# Early-stopping callback: monitors validation loss with a patience of 3 epochs
# and restores the best weights when training stops.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

In [None]:
# Compile all four networks with the same optimizer, loss and metric.
for network in (model, model_extreme, model_15, model_234):
    network.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )

In [None]:
# Train the baseline 5-class model; 20% of the training split is used for validation.
# NOTE(review): unlike model_extreme, this fit does not pass the early_stopping callback.
model.fit(X_train, y_train, epochs=10, validation_split=0.2, verbose=1, batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7d474eaffd90>

In [None]:
# Performance of the baseline NN model on its held-out test split.
loss, accuracy = model.evaluate(X_test, y_test)
print("Accuracy score is ", accuracy, "Loss is ", loss)

Accuracy score is  0.6060000061988831 Loss is  0.9502753615379333


In [None]:
# Train the extreme / non-extreme classifier, stopping early on stalled validation loss.
model_extreme.fit(X_train_extreme, y_train_extreme, epochs=10, validation_split=0.2, verbose=1, batch_size=64, callbacks=[early_stopping])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10


<keras.src.callbacks.History at 0x7d475c27dcf0>

In [None]:
# Performance of the extreme-score model (not the baseline) on its held-out test split.
loss, accuracy = model_extreme.evaluate(X_test_extreme, y_test_extreme)
print("Accuracy score is ", accuracy, "Loss is ", loss)

Accuracy score is  0.7209523916244507 Loss is  0.5546982884407043


In [None]:
# Baseline comparison: a bag-of-words Naive Bayes classifier on the same
# extreme / non-extreme task. In the recorded run it scored slightly higher
# than the NN model.
# NOTE(review): this split (20% test, seed 42) differs from the NN split
# (30% test, seed 52), so the two accuracies are measured on different test sets.
y_nb = rev_lemmatized_extreme['extreme_rate']
# CountVectorizer expects strings, so the token lists are joined back with spaces.
X_nb = rev_lemmatized_extreme['lemmatized_text'].apply(" ".join)
X_train_nb, X_test_nb, y_train_nb, y_test_nb = train_test_split(
    X_nb,
    y_nb,
    test_size=0.2,
    random_state=42,
)
# Learn the vocabulary on the training texts only, then reuse it for the test texts.
vectorizer = CountVectorizer()
X_train_nb_vector_data = vectorizer.fit_transform(X_train_nb)
X_test_nb_vector_data = vectorizer.transform(X_test_nb)

In [None]:
# Fit a Multinomial Naive Bayes classifier on the word-count features.
model_extreme_nb = MultinomialNB()
model_extreme_nb.fit(X_train_nb_vector_data,y_train_nb)

In [None]:
# Accuracy of the Naive Bayes model on its held-out test split.
y_pred_nb = model_extreme_nb.predict(X_test_nb_vector_data)
accuracy = accuracy_score(y_test_nb, y_pred_nb)
accuracy

0.7364285714285714

In [None]:
# Train the 1-star vs. 5-star classifier; 20% of the training split is used for validation.
model_15.fit(X_train_15, y_train_15, epochs=10, validation_split=0.2, verbose=1, batch_size=64)

In [None]:
# Train the 2/3/4-star classifier; 20% of the training split is used for validation.
model_234.fit(X_train_234, y_train_234, epochs=10, validation_split=0.2, verbose=1, batch_size=64)

In [None]:
# The following cells chain the three neural networks to score a new dataset.
# Although the separate models (model_15, model_234) are more accurate on their
# own rating groups, combining them with the extreme-score network lowers the
# overall accuracy, because the errors of the two stages compound.
# NOTE(review): `rev` also contains the reviews the models were trained on, so
# accuracy measured on it is optimistic.

# Can be replaced with any other review DataFrame that has 'text' and 'stars' columns.
# .copy() so the columns added below do not modify `rev` itself.
df_validate = rev.copy()

In [None]:
# Lemmatize the validation texts and keep only the token lists and true ratings.
# `.assign` builds a new frame instead of adding a column in place (which would
# also modify any frame that `df_validate` aliases); the final `.copy()` lets
# later cells add prediction columns without SettingWithCopyWarning.
df_validate = (
    df_validate
    .assign(lemmatized_text=df_validate["text"].apply(clean_and_tokenize))
    .loc[:, ["lemmatized_text", 'stars']]
    .copy()
)
df_validate.head()

In [None]:
# Encode the validation texts with the SAME tokenizer that produced
# model_extreme's training data. Fitting a fresh Tokenizer here would assign
# different integer ids to words, so the embedding rows looked up by the model
# would no longer correspond to the words in the text.
tokenizer_val = tokenizer_extreme
sequence_val = tokenizer_val.texts_to_sequences(df_validate["lemmatized_text"])
validate_extreme = pad_sequences(sequence_val, maxlen=maxlen)
prediction_val = model_extreme.predict(validate_extreme)

In [None]:
# Probability cut-off used for the 1-vs-5 decision in a later cell.
threshold = 0.5
# Column 1 of the model output is the "extreme" class. The predicted class is
# the larger of the two outputs, which is the same decision rule the
# 'accuracy' metric used during training/evaluation; comparing a single output
# against a fixed cut-off is only equivalent when the two outputs sum to 1.
binary_predictions = np.argmax(prediction_val, axis=1).tolist()

In [None]:
# Attach the stage-1 prediction (1 = extreme, 0 = not extreme) to every review.
# `.assign` returns a new frame, so nothing is written into a slice of another frame.
df_validate = df_validate.assign(Predicted_Extreme_Score=binary_predictions)
df_validate.head()

In [None]:
# Reviews that stage 1 classified as extreme; these go to the 1-vs-5 model.
# .copy() so a prediction column can be added without SettingWithCopyWarning.
df_15 = df_validate[df_validate['Predicted_Extreme_Score'] == 1].copy()
df_15.shape

In [None]:
# Encode with the tokenizer model_15 was trained with; a freshly fitted
# Tokenizer would map words to different ids than the model's embedding rows.
tokenizer_val_15 = tokenizer_15
sequence_val_15 = tokenizer_val_15.texts_to_sequences(df_15["lemmatized_text"])
validate_15 = pad_sequences(sequence_val_15, maxlen=maxlen)
prediction_15 = model_15.predict(validate_15)

In [None]:
# Column 1 of model_15's output is treated as the 5-star class: predict 5 when
# it exceeds the threshold, otherwise 1.
one_five_predictions = [5 if prob[1] > threshold else 1 for prob in prediction_15]
# `.assign` returns a new frame, avoiding a write into a filtered slice of df_validate.
df_15 = df_15.assign(Predicted_Score=one_five_predictions)
df_15.head(20)

In [None]:
# Reviews that stage 1 classified as non-extreme; these go to the 2/3/4 model.
# .copy() so a prediction column can be added without SettingWithCopyWarning.
df_234 = df_validate[df_validate['Predicted_Extreme_Score'] == 0].copy()
df_234.shape

In [None]:
# Encode with the tokenizer model_234 was trained with; a freshly fitted
# Tokenizer would map words to different ids than the model's embedding rows.
tokenizer_val_234 = tokenizer_234
sequence_val_234 = tokenizer_val_234.texts_to_sequences(df_234["lemmatized_text"])
validate_234 = pad_sequences(sequence_val_234, maxlen=maxlen)
prediction_234 = model_234.predict(validate_234)

In [None]:
# Collapse each row of class probabilities to the index of the most likely class.
# NOTE: this overwrites `prediction_234`, so re-running this cell alone fails
# (the array is 1-D after the first run and has no axis 1).
prediction_234 = np.argmax(prediction_234, axis=1)

In [None]:
# Translate class indices back to star ratings: 0 -> 2, 1 -> 3, 2 -> 4.
# A single lookup replaces the chained `.replace` calls, whose correctness
# depended on the order in which they were applied; `.assign` returns a new
# frame instead of writing into a filtered slice of df_validate.
star_for_class = np.array([2, 3, 4])
df_234 = df_234.assign(Predicted_Score=star_for_class[prediction_234])
df_234.head(20)

In [None]:
# Stack the predictions of both second-stage models into one frame
# (the original row index is discarded).
df_result = pd.concat([df_15, df_234], ignore_index=True)
df_result.head(20)

In [None]:
# Overall accuracy of the two-stage pipeline: share of reviews whose predicted
# star rating equals the true one.
is_correct = df_result['Predicted_Score'].eq(df_result['stars'])
total_predictions = df_result.shape[0]
correct_predictions = is_correct.sum()

accuracy = correct_predictions / total_predictions
print(f'Accuracy: {accuracy * 100:.2f}%')

In [None]:
# Evaluate the extreme-score model on ITS OWN held-out split. `X_test` / `y_test`
# belong to the 5-class baseline model, so their one-hot targets do not match
# this model's 2-unit output.
loss, accuracy = model_extreme.evaluate(X_test_extreme, y_test_extreme)
print("Accuracy score is ", accuracy, "Loss is ", loss)