# NLP Hotel Review


In [89]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import MinMaxScaler    #sets min to zero, max to 1.
from sklearn.preprocessing import StandardScaler  #sets data to Z-scores
from sklearn.preprocessing import RobustScaler    # new, see below


from sklearn.neighbors import KNeighborsClassifier   # non-linear classifier

In [90]:
import re
import string

# SK Packages
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

## Vectorizers
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# NLTK
import nltk

import warnings
warnings.filterwarnings('ignore')

In [91]:
# NLTK Packages
# Run the line below once to download the NLTK data packages; a simple GUI downloader should pop up
# nltk.download()
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [92]:
# Load the hotel-review dataset.
# The CSV location can be overridden with the HOTEL_REVIEWS_CSV environment
# variable so the notebook is not tied to a single machine; when the variable
# is unset, the original local path is used.
import os

df = pd.read_csv(
    os.environ.get(
        'HOTEL_REVIEWS_CSV',
        '/Users/stevekim/Desktop/python/NLP/Hotel_Reviews.csv',
    )
)

In [93]:
# Keep a 10% random sample of the rows to make the rest of the notebook faster.
# random_state pins the draw so that Restart & Run All reproduces the same
# sample (and therefore the same downstream shapes and scores).
# NOTE: this rebinds `df`; re-running only this cell samples 10% of the sample.
df = df.sample(frac=0.1, random_state=42)

In [94]:
# Preview: first five rows, restricted to the 'object'-dtype columns
df.select_dtypes(include=['object']).head(n=5)

Unnamed: 0,Negative_Review,Positive_Review
26940,The trainee reception staff were very inexper...,The grand building no history was offered by ...
489299,Door was inoperative Had to wait two hours to...,No Positive
3143,Bath room too small,Value for money
471676,No Negative,Very good value for money Friendly and helpfu...
258653,roomservice was disappointing,toilet was bigger than the room


In [95]:
# Target is the Reviewer_Score column; every other column stays in the features
y = df.loc[:, 'Reviewer_Score']
X = df.drop('Reviewer_Score', axis=1)

In [96]:
# Hold out 33% of the rows as the test set.
# random_state fixes the shuffle so the split, and every score computed from
# it, is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)


In [97]:
# Bag-of-words features for the positive reviews.
# Vocabulary settings: English stop words removed, a term must occur in at
# least 10 training documents, and the vocabulary is capped at 500 terms.
bagofwords_positive = CountVectorizer(
    stop_words="english",
    min_df=10,
    max_features=500,
)

# Learn the vocabulary from the training positive reviews only and encode them
# in the same step; the test reviews are encoded with that fitted vocabulary.
X_train_transformed_positive = bagofwords_positive.fit_transform(X_train['Positive_Review'])
X_test_transformed_positive = bagofwords_positive.transform(X_test['Positive_Review'])

# Shapes of the train and test count matrices, one per line
print(X_train_transformed_positive.shape, X_test_transformed_positive.shape, sep="\n")


(34554, 500)
(17020, 500)


In [98]:
# Bag-of-words features for the negative reviews.
# Vocabulary settings: English stop words removed, a term must occur in at
# least 10 training documents, and the vocabulary is capped at 500 terms.
bagofwords_negative = CountVectorizer(
    stop_words="english",
    min_df=10,
    max_features=500,
)

# Learn the vocabulary from the training negative reviews only and encode them
# in the same step; the test reviews are encoded with that fitted vocabulary.
X_train_transformed_negative = bagofwords_negative.fit_transform(X_train['Negative_Review'])
X_test_transformed_negative = bagofwords_negative.transform(X_test['Negative_Review'])

# Shapes of the train and test count matrices, one per line
print(X_train_transformed_negative.shape, X_test_transformed_negative.shape, sep="\n")


(34554, 500)
(17020, 500)


In [99]:
# Use pd.DataFrame to add the vectorized positive and negative review features back, then drop the original data


In [100]:

# Convert the sparse training count matrices to dense DataFrames, one column
# per vocabulary term.
# The positive and negative vocabularies are fitted independently, so they may
# share terms (and a term may coincide with an existing column name). Each
# block is prefixed so the concatenated frame has no duplicate column labels
# and every count column can be traced back to its review type.
train_positive = pd.DataFrame(
    X_train_transformed_positive.toarray(),
    columns=bagofwords_positive.get_feature_names_out(),
).add_prefix('pos_')
train_negative = pd.DataFrame(
    X_train_transformed_negative.toarray(),
    columns=bagofwords_negative.get_feature_names_out(),
).add_prefix('neg_')

# The count frames carry a fresh 0..n-1 index, while X_train keeps the row
# labels from the original data; reset X_train's index so that the
# column-wise concat lines the frames up row by row instead of by label.
X_train_add_pos_neg = pd.concat(
    [X_train.reset_index(drop=True), train_positive, train_negative],
    axis=1,
)

print(X_train_add_pos_neg.head())

   Unnamed: 0  Additional_Number_of_Scoring  Average_Score  \
0      307563                           973            8.0   
1       19256                           788            8.5   
2      401841                            71            7.9   
3       17999                           129            9.1   
4       81356                           126            8.6   

                                     Negative_Review  \
0   The cooked breakfast was a slight let down Th...   
1   Very busy pool not the hotel s fault but pool...   
2            The staff at Pura beach were very rude    
3                                           Nothing    
4   Bathroom chilly no heat controls Nowhere to p...   

   Review_Total_Negative_Word_Counts  Total_Number_of_Reviews  \
0                                 16                     4820   
1                                180                     2635   
2                                 10                      503   
3                             

In [101]:
# Logistic regression on the negative-review word counts.
# C=0.1 is passed explicitly to regularize the model; fit() returns the
# estimator itself, so construction and fitting are chained.
logreg = LogisticRegression(C=0.1).fit(X_train_transformed_negative, y_train)

# Report the model's score on the training split and on the held-out split
print(f"Train Score for negative review: {logreg.score(X_train_transformed_negative, y_train)}")
print(f"Test Score for negative review: {logreg.score(X_test_transformed_negative, y_test)}")

Train Score for negative review: 0.7432424610754181
Test Score for negative review: 0.7354876615746181


In [102]:
# Logistic regression on the positive-review word counts (same C=0.1 setting).
# `logreg` is reassigned here, replacing any model previously bound to it;
# fit() returns the estimator itself, so construction and fitting are chained.
logreg = LogisticRegression(C=0.1).fit(X_train_transformed_positive, y_train)

# Report the model's score on the training split and on the held-out split
print(f"Train Score for positive review: {logreg.score(X_train_transformed_positive, y_train)}")
print(f"Test Score for positive review: {logreg.score(X_test_transformed_positive, y_test)}")

Train Score for positive review: 0.6982693754702785
Test Score for positive review: 0.6904230317273795
