In [1]:
import matplotlib as plt
import numpy as np
import pandas as pd
import seaborn as sns
import spacy
import random


nlp=spacy.load('en_core_web_sm')

In [2]:
df=pd.read_csv('tripadvisor_hotel_reviews.csv')
df.head()

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20491 entries, 0 to 20490
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  20491 non-null  object
 1   Rating  20491 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 320.3+ KB


In [4]:
#sample reviews in df2
df2=df.sample(1000, random_state=42)
df2=df2.reset_index(drop=True)
df2.info()
                                       
                                         

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  1000 non-null   object
 1   Rating  1000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


In [5]:
df2['Rating'].value_counts()

Rating
5    453
4    294
3    110
2     77
1     66
Name: count, dtype: int64

In [6]:
#stratified sampling
df2=df.groupby('Rating', group_keys=False).apply(lambda x: x.sample(min(len(x), 200), random_state=42))
df2=df2.reset_index(drop=True)
df2.info()
df2['Rating'].value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  1000 non-null   object
 1   Rating  1000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


  df2=df.groupby('Rating', group_keys=False).apply(lambda x: x.sample(min(len(x), 200), random_state=42))


Rating
1    200
2    200
3    200
4    200
5    200
Name: count, dtype: int64

In [7]:
#build vocabulary
doc=nlp(' '.join(df2['Review'].astype(str).tolist()))

In [8]:
# steps to build frequency based classifier tf idf
# 1. create a vocab of words
# 2. preprocess the text - lower case, remove stop words, lemmatization, remove punctuations, remove numbers, remove special characters, remove extra spaces
# 3. create a document term matrix
# 4. calculate tf idf
# 5. build a classifier - logistic regression, naive bayes, decision tree, random forest, xgboost

In [9]:
#preprocessing
# 1. lower case
# 2. remove stop words
# 3. lemmatization
# 4. remove punctuations
# 5. remove numbers
# 6. remove special characters
# 7. remove extra spaces

In [10]:
df.head()

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


In [11]:
#lower case
def lower_case(text):
    return text.lower()


In [12]:
#remove stop words
def remove_stop_words(text):
    doc = nlp(text)
    tokens = [token.text for token in doc if not token.is_stop]
    return ' '.join(tokens)


In [13]:
#lemmatization
def lemmatization(text):
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc]
    return ' '.join(tokens)


In [14]:
#remove punctuations
def remove_punctuations(text):
    doc = nlp(text)
    tokens = [token.text for token in doc if not token.is_punct]
    return ' '.join(tokens)


In [15]:
#remove spcial characters and spaces
def remove_special_characters(text):
    doc = nlp(text)
    tokens = [token.text for token in doc if token.is_alpha]
    return ' '.join(tokens)


In [16]:
#single preprocessing function
def preprocess_text(text):
    text = lower_case(text)
    text = remove_stop_words(text)
    text = lemmatization(text)
    text = remove_punctuations(text)
    text = remove_special_characters(text)
    return text

df2['Processed_review'] = df2['Review'].astype(str).apply(preprocess_text)
df.head()

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


In [17]:
#create document term matrix
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
vectorised = vectorizer.fit_transform(df2['Processed_review'])
vectorised.toarray()
vectorised.shape
y = df2['Rating']

In [18]:
#tf idf
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X = tfidf_transformer.fit_transform(vectorised)
X.toarray()
X.shape

(1000, 9000)

In [19]:
#classification
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
df2.head(10)

Unnamed: 0,Review,Rating,Processed_review
0,"old rude blah reason stayed jacuzzi suite, wis...",1,old rude blah reason stay jacuzzi suite wish s...
1,"hated, just got punta cana stayed melia caribe...",1,hate get punta cana stay melia caribe want sit...
2,absolutely horrible stayed 30th august jolly c...,1,absolutely horrible stay august jolly carlton ...
3,"problems way stayed 2 nights, property not bad...",1,problem way stay night property bad stay room ...
4,tourists stay away bad address unless want roc...,1,tourist stay away bad address want rock concer...
5,"3-star service, friends stayed 6 nights nov. p...",1,star service friend stay night nov pretty bad ...
6,"dump, christmas vacation family brother family...",1,dump christmas vacation family brother family ...
7,"think 3 times choose hotel, recently took week...",1,think time choose hotel recently take week lon...
8,"rip, self husband just returned honeymoon even...",1,rip self husband return honeymoon evening wait...
9,"n't stay, youth hostel, boyfriend stayed night...",1,stay youth hostel boyfriend stay night hotel l...


In [20]:
df2.iloc[100]
df2.iloc[100]['Processed_review']


'mediocre good intercontinental material wow dissapointment usually rest assure intercontinental location high order case intercontinental dallas positive entrance impressive location close galleria numerous place eat shop recommend hotel intercontinental ashamed location elevator old stale cigarette smoke odor noticeable hallway clean appear ok look renovated little attention crown molding door look cheap wall paper thin enter room hear peron room door urinate right urinating extremely loud hallway feel like person restroom room mediocre well bed horrible usually intercontinental superb bed featherbed comforter location run hotel bed cheap mattress cheaper bed stay holiday inn expresses own intercontinental hotel nice sumptuous room bed bathroom lackluster old fixture outdate shower disappointing towel horrific kid towel holiday inn express nice location cheap order day intercontinental towel extremely disappointed location stay sound like nitpick complainer price chump book room stra

In [21]:
#random forest classifier
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
y_pred
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

#accuracy
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')



[[27  2  1  1  2]
 [24  2 14  7  1]
 [ 7  1 12 13  4]
 [ 5  3  4 11 15]
 [ 4  1  2  9 28]]
              precision    recall  f1-score   support

           1       0.40      0.82      0.54        33
           2       0.22      0.04      0.07        48
           3       0.36      0.32      0.34        37
           4       0.27      0.29      0.28        38
           5       0.56      0.64      0.60        44

    accuracy                           0.40       200
   macro avg       0.36      0.42      0.37       200
weighted avg       0.36      0.40      0.35       200

Accuracy: 0.40


In [22]:
#naives bayes classifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
nb = GaussianNB()
nb.fit(X_train.toarray(), y_train)
y_pred_nb = nb.predict(X_test.toarray())
y_pred_nb
print(confusion_matrix(y_test, y_pred_nb))
print(classification_report(y_test, y_pred_nb))
accuracy_nb = accuracy_score(y_test, y_pred_nb)
print(f'Accuracy: {accuracy_nb:.2f}')


[[13  9  8  1  2]
 [19  9 13  5  2]
 [ 8  9 13  4  3]
 [ 6  3  9 11  9]
 [ 6  8 12  8 10]]
              precision    recall  f1-score   support

           1       0.25      0.39      0.31        33
           2       0.24      0.19      0.21        48
           3       0.24      0.35      0.28        37
           4       0.38      0.29      0.33        38
           5       0.38      0.23      0.29        44

    accuracy                           0.28       200
   macro avg       0.30      0.29      0.28       200
weighted avg       0.30      0.28      0.28       200

Accuracy: 0.28


In [None]:
new_review = ["The hotel was okay. Staff were friendly. Food was great"]

#use the same preprocessing steps on the new review
new_review = [lower_case(review) for review in new_review]
new_review = [remove_stop_words(review) for review in new_review]
new_review = [lemmatization(review) for review in new_review]
new_review = [remove_punctuations(review) for review in new_review]
new_review = [remove_special_characters(review) for review in new_review]

#predict the rating
new_vectorised = vectorizer.transform(new_review)
new_tfidf = tfidf_transformer.transform(new_vectorised)

print(new_tfidf.toarray())
new_pred = rf.predict(new_tfidf)
print(new_pred)

[5]
