In [16]:
import matplotlib as plt
import numpy as np
import pandas as pd
import seaborn as sns
import spacy
import random

# Load the spaCy 'en_core_web_lg' pipeline once; the preprocessing helpers
# defined in later cells all reuse this module-level `nlp` object.
nlp=spacy.load('en_core_web_lg')

In [17]:
# Read the raw review data from disk and preview the first rows.
reviews_csv = 'tripadvisor_hotel_reviews.csv'
df = pd.read_csv(reviews_csv)
df.head()

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


In [18]:
# Summarize column dtypes and non-null counts of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20491 entries, 0 to 20490
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  20491 non-null  object
 1   Rating  20491 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 320.3+ KB


In [19]:
# Create the doc (and thereby populate the vocab) from a reproducible random
# sample of 1000 reviews.
from spacy.tokens import Doc

random.seed(42)
sample_reviews = random.sample(df['Review'].tolist(), 1000)

# Gluing the whole sample into one huge string and calling nlp() on it once is
# very slow and memory-hungry (the original run had to be interrupted) and may
# run into nlp.max_length. Stream the reviews through nlp.pipe in batches
# instead, then merge the per-review Docs so `doc` is still a single Doc.
sample_docs = list(nlp.pipe(sample_reviews, batch_size=64))
# ensure_whitespace=True separates consecutive reviews with a space, mirroring
# the original ' '.join(...).
doc = Doc.from_docs(sample_docs, ensure_whitespace=True)


KeyboardInterrupt: 

In [None]:
# Steps to build a frequency-based (TF-IDF) classifier:
# 1. Create a vocabulary of words.
# 2. Preprocess the text: lowercase, remove stop words, lemmatize, and remove punctuation, numbers, special characters and extra spaces.
# 3. Create a document-term matrix.
# 4. Calculate the TF-IDF weights.
# 5. Build a classifier (logistic regression, naive Bayes, decision tree, random forest or XGBoost).

In [None]:
# Preprocessing steps:
# 1. Lowercase
# 2. Remove stop words
# 3. Lemmatize
# 4. Remove punctuation
# 5. Remove numbers
# 6. Remove special characters
# 7. Remove extra spaces

In [None]:
# Preview the frame again before the preprocessing cells add a new column.
df.head()

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


In [None]:
# lower case
def lower_case(text: str) -> str:
    """Return ``text`` with every character converted to lowercase."""
    lowered = text.lower()
    return lowered

# Start the working column as a lowercased copy of the raw review text.
lowered_reviews = df['Review'].apply(lower_case)
df['Processed_review'] = lowered_reviews

df.head()

Unnamed: 0,Review,Rating,Processed_review
0,nice hotel expensive parking got good deal sta...,4,nice hotel expensive parking got good deal sta...
1,ok nothing special charge diamond member hilto...,2,ok nothing special charge diamond member hilto...
2,nice rooms not 4* experience hotel monaco seat...,3,nice rooms not 4* experience hotel monaco seat...
3,"unique, great stay, wonderful time hotel monac...",5,"unique, great stay, wonderful time hotel monac..."
4,"great stay great stay, went seahawk game aweso...",5,"great stay great stay, went seahawk game aweso..."


In [None]:
#remove stop words
def remove_stop_words(text):
    """Drop spaCy stop-word tokens from ``text`` and re-join the rest.

    Only the tokenizer is run (``nlp.make_doc``). ``is_stop`` is a lexical
    attribute, so the tagger/parser/NER passes that a full ``nlp(text)`` call
    executes add per-review cost without changing the result.

    NOTE(review): the stop list includes negations such as "not" (the sample
    output shows "nice rooms not 4*" becoming "nice rooms 4 *"). Confirm that
    dropping them is acceptable for rating prediction.

    Parameters
    ----------
    text : str
        Text to filter.

    Returns
    -------
    str
        Texts of the non-stop-word tokens joined by single spaces. Because
        tokens are re-joined with spaces, punctuation ends up space-separated.
    """
    doc = nlp.make_doc(text)
    return ' '.join(token.text for token in doc if not token.is_stop)

# Replace the working column with its stop-word-free version.
without_stop_words = df['Processed_review'].apply(remove_stop_words)
df['Processed_review'] = without_stop_words
df.head()

Unnamed: 0,Review,Rating,Processed_review
0,nice hotel expensive parking got good deal sta...,4,nice hotel expensive parking got good deal sta...
1,ok nothing special charge diamond member hilto...,2,ok special charge diamond member hilton decide...
2,nice rooms not 4* experience hotel monaco seat...,3,nice rooms 4 * experience hotel monaco seattle...
3,"unique, great stay, wonderful time hotel monac...",5,"unique , great stay , wonderful time hotel mon..."
4,"great stay great stay, went seahawk game aweso...",5,"great stay great stay , went seahawk game awes..."


In [None]:
#lemmatization
def lemmatization(text):
    """Replace every token of ``text`` with its lemma.

    The dependency parser and named-entity recognizer are switched off for
    this call. They are not needed to produce ``lemma_`` in the standard
    English pipelines, and skipping them makes each call noticeably cheaper
    when this is applied to every review.

    Parameters
    ----------
    text : str
        Text to lemmatize.

    Returns
    -------
    str
        The tokens' ``lemma_`` values joined by single spaces.
    """
    doc = nlp(text, disable=['parser', 'ner'])
    return ' '.join(token.lemma_ for token in doc)

# Replace the working column with its lemmatized version.
lemmatized_reviews = df['Processed_review'].apply(lemmatization)
df['Processed_review'] = lemmatized_reviews
df.head()

Unnamed: 0,Review,Rating,Processed_review
0,nice hotel expensive parking got good deal sta...,4,nice hotel expensive parking get good deal sta...
1,ok nothing special charge diamond member hilto...,2,ok special charge diamond member hilton decide...
2,nice rooms not 4* experience hotel monaco seat...,3,nice room 4 * experience hotel monaco seattle ...
3,"unique, great stay, wonderful time hotel monac...",5,"unique , great stay , wonderful time hotel mon..."
4,"great stay great stay, went seahawk game aweso...",5,"great stay great stay , go seahawk game awesom..."


In [None]:
#remove punctuations
def remove_punctuations(text):
    """Drop punctuation tokens from ``text`` and re-join the rest.

    Only the tokenizer is run (``nlp.make_doc``). ``is_punct`` is a lexical
    attribute, so running the full pipeline per review only adds cost; the
    cell that applied the full-pipeline version to every review had to be
    interrupted.

    Parameters
    ----------
    text : str
        Text to filter.

    Returns
    -------
    str
        Texts of the non-punctuation tokens joined by single spaces.
    """
    doc = nlp.make_doc(text)
    return ' '.join(token.text for token in doc if not token.is_punct)

# Replace the working column with its punctuation-free version.
without_punctuation = df['Processed_review'].apply(remove_punctuations)
df['Processed_review'] = without_punctuation
df.head()


KeyboardInterrupt: 

In [None]:
#remove special characters and spaces
def remove_special_characters(text):
    """Keep only the purely alphabetic tokens of ``text``.

    Tokens for which ``is_alpha`` is false (numbers, symbols, punctuation,
    mixed alphanumerics) are dropped. Only the tokenizer is run
    (``nlp.make_doc``) because ``is_alpha`` is a lexical attribute and the
    rest of the pipeline would add per-review cost without changing the
    result.

    Parameters
    ----------
    text : str
        Text to filter.

    Returns
    -------
    str
        Texts of the alphabetic tokens joined by single spaces.
    """
    doc = nlp.make_doc(text)
    return ' '.join(token.text for token in doc if token.is_alpha)

# Replace the working column with its alphabetic-tokens-only version.
alphabetic_only = df['Processed_review'].apply(remove_special_characters)
df['Processed_review'] = alphabetic_only
df.head()

In [None]:
#create document term matrix
from sklearn.feature_extraction.text import CountVectorizer

# NOTE(review): the vectorizer is fitted on every review before the train/test
# split, so the held-out rows influence the vocabulary. Consider fitting on the
# training split only.
vectorizer = CountVectorizer()

# Keep the counts sparse. The previous `vectorised.toarray()` call built a
# dense (documents x vocabulary) array for all 20491 reviews and threw it away.
vectorised = vectorizer.fit_transform(df['Processed_review'])

# Target labels for the classifier.
y = df['Rating']

# Last expression of the cell, so the notebook actually displays the shape.
vectorised.shape

In [None]:
#tf idf
from sklearn.feature_extraction.text import TfidfTransformer

# Re-weight the raw term counts with TF-IDF. The result stays sparse: the
# previous `X.toarray()` call only materialized a dense copy that was discarded.
tfidf_transformer = TfidfTransformer()
X = tfidf_transformer.fit_transform(vectorised)

# Displayed as the cell output: (number of reviews, vocabulary size).
X.shape

In [None]:
#classification
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# repeatable between runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts
df.head()

In [None]:
#random forest classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Seed the forest so the reported metrics are repeatable, matching the seeded
# train/test split. n_jobs=-1 only parallelizes tree building across cores.
rf = RandomForestClassifier(random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

# Rows of the confusion matrix are true ratings, columns are predicted ratings.
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))



In [None]:
new_review = ["The hotel was clean and the staff were friendly. The location was perfect for exploring the city."]

# Push the unseen text through the same preprocessing stages, in the same
# order, that were applied to the training reviews.
preprocessing_steps = (
    lower_case,
    remove_stop_words,
    lemmatization,
    remove_punctuations,
    remove_special_characters,
)
for apply_step in preprocessing_steps:
    new_review = list(map(apply_step, new_review))

# Reuse the already-fitted vectorizer and TF-IDF transformer (transform only,
# no re-fitting), then predict the rating.
new_vectorised = vectorizer.transform(new_review)
new_tfidf = tfidf_transformer.transform(new_vectorised)
new_pred = rf.predict(new_tfidf)
print(new_pred)