# NLP Hotel Review


In [117]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import MinMaxScaler    #sets min to zero, max to 1.
from sklearn.preprocessing import StandardScaler  #sets data to Z-scores
from sklearn.preprocessing import RobustScaler    # new, see below


from sklearn.neighbors import KNeighborsClassifier   # non-linear classifier

In [118]:
import re
import string

# SK Packages
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

## Vectorizers
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# NLTK
import nltk

import warnings
# Installs a blanket 'ignore' filter: no category, message or module is given,
# so warnings raised after this line are suppressed rather than displayed.
# NOTE(review): this may also hide useful library warnings (for example
# convergence or deprecation notices) — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [119]:
# NLTK Packages
# Use the code below to download the NLTK package, a straightforward GUI should pop up
# nltk.download()
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [120]:
# Location of the raw hotel-reviews CSV.
# NOTE: this is an absolute, machine-specific path.
HOTEL_REVIEWS_CSV = '/Users/stevekim/Desktop/python/NLP/Hotel_Reviews.csv'
df = pd.read_csv(HOTEL_REVIEWS_CSV)

In [121]:
# Keep a 10% random subsample of the rows.
# random_state pins the draw so that Restart & Run All selects the same rows
# (and therefore reproduces the same downstream results) every time.
# NOTE: this rebinds `df`; re-running this cell on its own samples the
# already-reduced frame again.
df = df.sample(frac=0.1, random_state=42)

In [122]:
# Preview the first five rows, restricted to the 'object' dtype columns
df.select_dtypes(include='object').head()

Unnamed: 0,Negative_Review,Positive_Review
301676,Nothing,Staff were friendly patient and helpful
293836,Air conditioning was too loud so couldn t hav...,Location is perfect Within easy walking dista...
125050,Drinks were very pricey Probably just London ...,Breakfasts were great Staff friendly and help...
323435,No Negative,Receptionists were friendly and helpful As a ...
467264,Poor customer service from reception Resturan...,It was close to the venue


In [123]:
# Target: the Reviewer_Score column. Features: every remaining column.
y = df['Reviewer_Score']
X = df.drop('Reviewer_Score', axis=1)

In [124]:
# Split the whole (sampled) dataset: 67% for training, 33% held out for testing.
# random_state fixes the shuffle so the reported scores are reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)


In [169]:
stop_words = stopwords.words('english')

# Built once, at definition time, so that every call gets O(1) stop-word
# lookups and reuses one stemmer/lemmatizer instead of constructing a new
# object for each token.
_STOP_WORD_SET = frozenset(stop_words)
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
_STEMMER = PorterStemmer()
_LEMMATIZER = WordNetLemmatizer()


def my_tokenizers(document, lemmatization=False, stemming=True):
    '''
    Function for use in Vectorizer that tokenizes the document.

    Steps: delete every character in string.punctuation, split on whitespace,
    drop tokens found in `stop_words`, then optionally stem or lemmatize.

    Parameters
    ----------
    document : str
        The text to tokenize.
    lemmatization : bool, default False
        Lemmatize each token with WordNetLemmatizer. Only takes effect when
        `stemming` is falsy, because the stemming branch returns first.
    stemming : bool, default True
        Stem each token with PorterStemmer.

    Returns
    -------
    list of str
        The processed tokens, in document order.
    '''
    # remove punctuation in a single pass
    document = document.translate(_PUNCTUATION_TABLE)

    # tokenize - split() with no argument collapses runs of whitespace, so
    # leading/trailing/repeated spaces never produce empty-string tokens
    # (split(' ') did, and '' could then end up in the vocabulary)
    tokenized_document = document.split()

    # remove stopwords before stemming or lemmatization
    tokenized_document = [word for word in tokenized_document if word not in _STOP_WORD_SET]

    if stemming:
        return [_STEMMER.stem(word) for word in tokenized_document]

    if lemmatization:
        return [_LEMMATIZER.lemmatize(word) for word in tokenized_document]

    return tokenized_document


# Singular alias, so that code referring to `my_tokenizer` resolves to the
# same function instead of raising NameError.
my_tokenizer = my_tokenizers

In [170]:
# 1. Instantiate with the custom tokenizer defined above (`my_tokenizers`).
#    min_df=10 drops tokens that appear in fewer than 10 documents;
#    max_features=500 caps the vocabulary at 500 tokens.
bagofwords_positive = CountVectorizer(tokenizer=my_tokenizers, min_df=10, max_features=500)

# 2. Fit on the training positive reviews only, so the vocabulary is learned
#    without looking at the test split
bagofwords_positive.fit(X_train['Positive_Review'])


# 3. Transform X_train and X_test using the fitted CountVectorizer object
X_train_transformed_positive = bagofwords_positive.transform(X_train['Positive_Review'])
X_test_transformed_positive = bagofwords_positive.transform(X_test['Positive_Review'])


# Preview the shape of the transformed matrices
print(X_train_transformed_positive.shape)
print(X_test_transformed_positive.shape)


(34554, 500)
(17020, 500)


In [171]:
# 1. Instantiate with the custom tokenizer defined above (`my_tokenizers`).
#    min_df=10 drops tokens that appear in fewer than 10 documents;
#    max_features=500 caps the vocabulary at 500 tokens.
bagofwords_negative = CountVectorizer(tokenizer=my_tokenizers, min_df=10, max_features=500)

# 2. Fit on the training negative reviews only, so the vocabulary is learned
#    without looking at the test split
bagofwords_negative.fit(X_train['Negative_Review'])


# 3. Transform X_train and X_test using the fitted CountVectorizer object
X_train_transformed_negative = bagofwords_negative.transform(X_train['Negative_Review'])
X_test_transformed_negative = bagofwords_negative.transform(X_test['Negative_Review'])

# Preview the shape of the transformed matrices
print(X_train_transformed_negative.shape)
print(X_test_transformed_negative.shape)


(34554, 500)
(17020, 500)


In [172]:
# Use pd.DataFrame to add back the negative and positive review scores, then drop the original data


In [174]:

# Positive reviews: densify the train/test count matrices into DataFrames that
# keep the row index of their split, with every column prefixed by 'pos_'
positive_feature_names = bagofwords_positive.get_feature_names_out()
train_positive = pd.DataFrame(
    X_train_transformed_positive.toarray(),
    index=X_train.index,
    columns=positive_feature_names,
).add_prefix('pos_')
test_positive = pd.DataFrame(
    X_test_transformed_positive.toarray(),
    index=X_test.index,
    columns=positive_feature_names,
).add_prefix('pos_')

# Negative reviews: same conversion, with every column prefixed by 'neg_'
negative_feature_names = bagofwords_negative.get_feature_names_out()
train_negative = pd.DataFrame(
    X_train_transformed_negative.toarray(),
    index=X_train.index,
    columns=negative_feature_names,
).add_prefix('neg_')
test_negative = pd.DataFrame(
    X_test_transformed_negative.toarray(),
    index=X_test.index,
    columns=negative_feature_names,
).add_prefix('neg_')

# Place the original columns, the positive counts and the negative counts
# side by side (column-wise concat, aligned on the row index)
X_train_add_pos_neg = pd.concat([X_train, train_positive, train_negative], axis=1)
X_test_add_pos_neg = pd.concat([X_test, test_positive, test_negative], axis=1)

print(X_train_add_pos_neg.head())
print(X_test_add_pos_neg.head())

        Unnamed: 0  Additional_Number_of_Scoring  Average_Score  \
93477        93477                           687            8.9   
260905      260905                            36            8.9   
146258      146258                           378            8.7   
273660      273660                           269            8.0   
341587      341587                           141            9.2   

                                          Negative_Review  \
93477                                 Absolutely nothing    
260905                                    bathroom design   
146258   Facilities in room to be able to wake up and ...   
273660       Brakefast needs improvments not good at all    
341587                            slightly old fashioned    

        Review_Total_Negative_Word_Counts  Total_Number_of_Reviews  \
93477                                   4                     2752   
260905                                  3                      303   
146258              

In [175]:
# Negative Review - logistic regression fitted on the negative-review
# bag-of-words counts, with the regularization parameter set to C=0.1
logreg = LogisticRegression(C=0.1)
logreg.fit(X_train_transformed_negative, y_train)

# Score the fitted model on the training split and on the held-out test split
negative_train_score = logreg.score(X_train_transformed_negative, y_train)
negative_test_score = logreg.score(X_test_transformed_negative, y_test)
print(f"Train Score for negative review: {negative_train_score}")
print(f"Test Score for negative review: {negative_test_score}")

Train Score for negative review: 0.7444868900850842
Test Score for negative review: 0.7333725029377204


In [176]:
# Positive Review - logistic regression fitted on the positive-review
# bag-of-words counts, with the regularization parameter set to C=0.1.
# NOTE: this rebinds `logreg`, replacing whatever model the name held before.
logreg = LogisticRegression(C=0.1)
logreg.fit(X_train_transformed_positive, y_train)

# Score the fitted model on the training split and on the held-out test split
positive_train_score = logreg.score(X_train_transformed_positive, y_train)
positive_test_score = logreg.score(X_test_transformed_positive, y_test)
print(f"Train Score for positive review: {positive_train_score}")
print(f"Test Score for positive review: {positive_test_score}")

Train Score for positive review: 0.713983909243503
Test Score for positive review: 0.7024676850763807


Fit a logistic regression model on the data and analyze the test and train accuracy. Find the top 20 words from the positive reviews that are most predictive of a positive sentiment (Reviewer_Score = 1). Similarly, find the top 20 words from the negative reviews that are most predictive of a negative sentiment (Reviewer_Score = 0). What actionable insights can you draw from these?

In [None]:
# Pair every vocabulary token with its logistic-regression coefficient.
# - `logreg` is the fitted estimator in this notebook; it was last fitted on
#   the positive-review counts, so the names come from `bagofwords_positive`.
# - The estimator was fitted on a sparse count matrix rather than a DataFrame,
#   so it carries no `feature_names_in_`; the vectorizer supplies the names.
# - coef_[0] assumes a binary target (a single row of coefficients).
coefDf = pd.DataFrame({'variable': bagofwords_positive.get_feature_names_out(), 'coef': logreg.coef_[0]})