# Natural Language Processing - Hotel Reviews Sentiment Prediction

## Libraries and settings

In [None]:
# Libraries
import os
import re
import numpy as np
import pandas as pd
import random
import string

import matplotlib.pyplot as plt
from wordcloud import WordCloud

import nltk

# Download the NLTK resources (not imports). Once a resource is available
# locally these calls only need to succeed once per environment.
nltk.download('stopwords')  # word list read in the "Removing stopwords" step
nltk.download('punkt')  # presumably the model behind word_tokenize -- confirm
nltk.download('wordnet')  # presumably the data behind WordNetLemmatizer -- confirm
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')  # NOTE(review): no POS-tagger call is visible in this notebook

from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import RocCurveDisplay
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Set seed for NumPy's and Python's global random number generators.
# NOTE(review): the sampling, split and model cells pass their own literal
# random_state values (42, 67, 42) instead of reading seed_value.
seed_value = 42
np.random.seed(seed_value)
random.seed(seed_value)

# Ignore warnings: silences every warning category for the rest of the session
import warnings
warnings.filterwarnings('ignore')

# Current working directory (the CSV below is read relative to this path)
print('Current working directory:', os.getcwd())

## Import hotel reviews
- For a detailed description of the data, see: https://www.kaggle.com/datasets/andrewmvd/trip-advisor-hotel-reviews

In [None]:
# Read the complete review table from the CSV file
data_orig = pd.read_csv(
    'tripadvisor_hotel_reviews.csv',
    sep=",",
    encoding='utf-8',
)

# Draw a reproducible sample of 1000 reviews for each of the ratings 1, 3 and 5
sub_01 = data_orig[data_orig['Rating'] == 1].sample(n=1000, random_state=42)
sub_03 = data_orig[data_orig['Rating'] == 3].sample(n=1000, random_state=42)
sub_05 = data_orig[data_orig['Rating'] == 5].sample(n=1000, random_state=42)

# Stack the three samples into one frame with a fresh 0..n-1 index
data = pd.concat([sub_01, sub_03, sub_05], ignore_index=True)
print(f'Number of reviews in data: {len(data)}')

data

## Check missing values

In [None]:
# Number of missing entries in each column of the sampled data
data.isna().sum()

## Number of hotel reviews per sentiment class in 'data'

In [None]:
# Count how many reviews carry each rating value
df_class = data['Rating'].value_counts()
df_class

## Wordcloud with most frequently used words

In [None]:
# Word cloud helper
def wordCloud_generator(data, title=None):
    """Draw a word cloud built from all texts in ``data``.

    Args:
        data: Object whose ``.values`` is an iterable of strings (the call
            site passes a pandas Series of reviews).
        title: Text placed above the image.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    # All texts merged into one space-separated corpus
    corpus = " ".join(data.values)

    cloud_builder = WordCloud(
        height=300,
        width=600,
        background_color='white',
        min_font_size=8,
    )
    cloud_image = cloud_builder.generate(corpus)

    # Render the cloud as an image without axes
    plt.figure(figsize=(6, 4), facecolor=None)
    plt.imshow(cloud_image, interpolation='bilinear')
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.title(title, fontsize=16)
    plt.show()
    
# Create a word cloud from the raw (not yet preprocessed) review texts
wordCloud_generator(data['Review'], title="Most used words in reviews")

## Define X and y data

In [None]:
# Independent copies, so later processing does not touch `data`:
# X holds the review texts, y the corresponding ratings
X = data['Review'].copy()
y = data['Rating'].copy()

print('Hotel Reviews:')
print(X)

print('\nSentiments:')
print(y)

## Distribution of sentence length

In [None]:
# Calculate length: number of words per review. str.split() without an
# argument splits on any run of whitespace and ignores leading/trailing
# blanks, so consecutive spaces inside a review are not counted as extra
# (empty) words, which split(" ") would do.
length_dist = [len(x.split()) for x in X]

# Plot Histogram
fig = plt.figure(figsize=(7, 4))
n, bins, patches = plt.hist(x=length_dist,
                            bins=30,
                            color='#42AD12',
                            alpha=0.5,
                            rwidth=0.95
                   )
# Plain (non-scientific) tick labels
plt.ticklabel_format(style='plain')

# Set x limits (anything longer than 1000 words falls outside the view)
plt.xlim(0, 1000)

# Set labels
plt.xlabel('length of sentences', fontsize=10, labelpad=10)
plt.ylabel('Frequency', fontsize=10, labelpad=10)
plt.title('Distribution of the length of sentences', fontsize=12, pad=10)

plt.show()

## Text preprocessing
### Steps:
- Removing punctuation
- Text to lowercase
- Tokenization
- Removing stopwords
- Stemming/Lemmatization

### Removing punctuation

In [None]:
# Translation table that deletes every character of string.punctuation;
# built once so each call is a single C-level pass over the text
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


# Defining the function to remove punctuation
def remove_punctuation(text):
    """Return ``text`` with all ``string.punctuation`` characters removed.

    Characters are deleted, not replaced by a space, so words separated only
    by punctuation are joined ("a,b" becomes "ab").

    Args:
        text: The string to clean. Other iterables of strings are still
            accepted and joined into one string, as before.

    Returns:
        The punctuation-free string.
    """
    if isinstance(text, str):
        return text.translate(_PUNCTUATION_TABLE)
    # Original element-wise path, kept for non-str iterables
    return "".join([i for i in text if i not in string.punctuation])

# Punctuation-free version of every review
X_cleaned_01 = X.apply(remove_punctuation)
X_cleaned_01

### Text to lowercase

In [None]:
# Lower-case every punctuation-free review
X_cleaned_02 = X_cleaned_01.apply(str.lower)
X_cleaned_02

### Tokenization

In [None]:
# Tokenizer wrapper
def tokenization(text):
    """Split ``text`` into a list of tokens using NLTK's ``word_tokenize``."""
    return word_tokenize(text)

# Tokenize every lower-cased review
X_cleaned_03 = X_cleaned_02.apply(tokenization)
X_cleaned_03

### Removing stopwords

In [None]:
# Stop words present in the library.
# NOTE: this rebinds the name `stopwords`, which the import section bound to
# nltk.corpus.stopwords; from here on it refers to the list of words.
stopwords = nltk.corpus.stopwords.words('english')

# Hashed copy of the list, built once: membership tests become constant-time
# lookups instead of a linear scan of the list for every token
_stopword_set = frozenset(stopwords)

# Print stopwords
print('English stopwords:')
print(stopwords)

# Defining the function to remove stopwords from tokenized text
def remove_stopwords(text):
    """Return the tokens of ``text`` that are not English stopwords.

    Args:
        text: Iterable of tokens (one tokenized review).

    Returns:
        A new list with the remaining tokens in their original order.
    """
    # NOTE(review): punctuation was stripped in an earlier step, so stopwords
    # that contain an apostrophe can no longer match a token here.
    return [token for token in text if token not in _stopword_set]

# Applying the function
X_cleaned_04 = X_cleaned_03.apply(remove_stopwords)
X_cleaned_04

### Stemming

In [None]:
# One Porter stemmer instance, reused for every token
porter_stemmer = PorterStemmer()

# Stemming helper
def stemming(text):
    """Return a list with the Porter stem of each token in ``text``."""
    return list(map(porter_stemmer.stem, text))

# Stem the stopword-free tokens (input: X_cleaned_04)
X_cleaned_05 = X_cleaned_04.apply(stemming)
X_cleaned_05

### Lemmatization

In [None]:
# One WordNet lemmatizer instance, reused for every token
lemmatizer = WordNetLemmatizer()

# Lemmatization helper
def lemm(text):
    """Return a list with the lemma of each token in ``text``.

    Every token is lemmatized with ``pos='v'``, i.e. treated as a verb.
    """
    lemmas = []
    for token in text:
        lemmas.append(lemmatizer.lemmatize(token, pos='v'))
    return lemmas

# Lemmatize the stopword-free tokens (input: X_cleaned_04, not the stems)
X_cleaned_06 = X_cleaned_04.apply(lemm)
X_cleaned_06

## Document-Term Matrix

In [None]:
# Identity function: passed to the vectorizer below as both `tokenizer` and
# `preprocessor`, so the already tokenized documents are used as they are
def dummy_fun(doc):
    """Return ``doc`` unchanged."""
    return doc

# Count vectorizer for pre-tokenized input: tokenizer and preprocessor are
# identity functions and the regex token pattern is switched off
count = CountVectorizer(analyzer='word',
                        tokenizer=dummy_fun,
                        preprocessor=dummy_fun,
                        token_pattern=None)

# Dense document-term matrix built from the lemmatized tokens ('X_cleaned_06')
X_cleaned_07 = count.fit_transform(X_cleaned_06).toarray()
print('Document-Term Matrix:', X_cleaned_07, sep='\n')

# Matrix dimensions
print('\nShape of the Document-Term Matrix', X_cleaned_07.shape, sep='\n')

# Smallest and largest entry
print('\nSummary statistics')
print(f'min: {X_cleaned_07.min():.4f}')
print(f'max: {X_cleaned_07.max():.4f}')

## Term Frequency - Inverse Document Frequency (TF-IDF) Matrix

In [None]:
# Identity function (same definition as in the document-term matrix cell):
# passed to the vectorizer below as both `tokenizer` and `preprocessor`
def dummy_fun(doc):
    """Return ``doc`` unchanged."""
    return doc

# TF-IDF vectorizer for pre-tokenized input: tokenizer and preprocessor are
# identity functions and the regex token pattern is switched off
tfidf = TfidfVectorizer(analyzer='word',
                        tokenizer=dummy_fun,
                        preprocessor=dummy_fun,
                        token_pattern=None)

# Dense TF-IDF matrix built from the lemmatized tokens ('X_cleaned_06')
X_cleaned_08 = tfidf.fit_transform(X_cleaned_06).toarray()
print('TF-IDF Matrix:', X_cleaned_08, sep='\n')

# Matrix dimensions
print('\nShape of the TF-IDF Matrix', X_cleaned_08.shape, sep='\n')

# Smallest and largest entry
print('\nSummary statistics')
print(f'min: {X_cleaned_08.min():.4f}')
print(f'max: {X_cleaned_08.max():.4f}')

## Split data into train/test

In [None]:
# Train/test samples: 80% of the rows for training, 20% for testing.
# NOTE(review): X_cleaned_08 comes from a TfidfVectorizer fitted on all rows
# before this split, so the vocabulary and IDF weights also reflect the test
# documents. Consider splitting first and fitting on the training part only.
# NOTE(review): random_state=67 is a literal here; seed_value is not used.
X_train, X_test, y_train, y_test = train_test_split(X_cleaned_08, 
                                                    y, 
                                                    test_size=0.20, 
                                                    random_state=67)

# Print X_train
print('X_train:')
print(X_train)

# Print y_train
print('\ny_train:')
print(y_train)

## Model Training

In [None]:
# Build the random forest (200 trees, depth limited to 20, at least 15
# samples per leaf, fixed seed) and train it on the training split.
# fit() returns the estimator itself, so `rfc` is the fitted model.
rfc = RandomForestClassifier(
    max_depth=20,
    n_estimators=200,
    min_samples_leaf=15,
    random_state=42,
).fit(X_train, y_train)

## Model Evaluation

### Create model predictions

In [None]:
# Predict the rating of every review in the test split with the trained model
y_pred = rfc.predict(X_test)
y_pred

### Measuring the Accuracy

In [None]:
# Accuracy: share of test reviews whose predicted rating equals the true one.
# Arguments are given in scikit-learn's (y_true, y_pred) order, the same order
# the confusion matrix and classification report cells use.
print(f'Accuracy: {accuracy_score(y_test, y_pred):.4f}')

### Confusion matrix

In [None]:
# Confusion matrix of true ratings (y_test) against predicted ratings (y_pred);
# per the scikit-learn documentation rows are true classes, columns predictions
print(confusion_matrix(y_test, y_pred))

### Classification Report

In [None]:
# Classification Report: text summary of per-class metrics for the test split,
# computed from the true ratings (y_test) and the predictions (y_pred)
print(classification_report(y_test, 
                            y_pred))

### Jupyter notebook --footer info-- (please always provide this at the end of each submitted notebook)

In [None]:
import os
import platform
import socket
from platform import python_version
from datetime import datetime

# Footer: print the OS name, platform and release, the current local
# timestamp and the Python version between two separator lines
print('-----------------------------------')
print(os.name.upper())
print(platform.system(), '|', platform.release())
print('Datetime:', datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
print('Python Version:', python_version())
print('-----------------------------------')