### Introduction to Text Mining
## Sentiment Analysis
(c) Nuno Antonio 2019-2021

### Initial setup

In [1]:
# Import packages
import csv
import pandas as pd
import numpy as np
import nltk 
import re
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer 

In [2]:
# Load dataset
# Identifier and label columns are read as 'category', rating columns as float64
dtypes = {'RevID':'category','Source':'category','HotelID':'category',
  'HotelType':'category','HotelStars':'category','ObsDateGlobalRating':'float64',
  'Language':'category','RevUserName':'category','RevUserLocation':'category','RevOverallRating':'float64'}
# The file is pipe-separated and uses a comma as decimal mark
read_options = dict(sep="|", dtype=dtypes, decimal=',')
# Rows with an unexpected number of fields are skipped with a warning instead of aborting the load.
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0 in favour of `on_bad_lines`,
# so try the current keyword first and fall back to the legacy one on older pandas.
try:
    ds = pd.read_csv("HotelOnlineReviews.txt", on_bad_lines='warn', **read_options)
except TypeError:
    ds = pd.read_csv("HotelOnlineReviews.txt", error_bad_lines=False, **read_options)

b'Skipping line 12799: expected 21 fields, saw 23\n'
b'Skipping line 37247: expected 21 fields, saw 22\n'


In [3]:
# Keep only the reviews whose language is English
nonEnglish = ds.Language != 'English'
ds = ds.drop(index=ds.index[nonEnglish.values])

### Functions

In [4]:
# Text preprocessing
def textPreProcess(rawText, removeHTML=True, charsToRemove = r'\?|\.|\!|\;|\.|\"|\,|\(|\)|\&|\:|\-', removeNumbers=True, removeLineBreaks=False, specialCharsToRemove = r'[^\x00-\xfd]', convertToLower=True, removeConsecutiveSpaces=True):
    """Clean a collection of texts and return the cleaned strings as a list.

    The steps below are applied to each text, in this order, when enabled:

    1. removeHTML: strip markup with BeautifulSoup's 'html.parser'.
    2. charsToRemove: regex whose matches are replaced by a space
       (pass '' to skip this step).
    3. removeNumbers: replace every run of digits by a space.
    4. removeLineBreaks: replace '\\n' by a space and delete '\\r'.
    5. specialCharsToRemove: regex whose matches are replaced by a space
       (pass '' to skip this step).
    6. convertToLower: lower-case the text.
    7. removeConsecutiveSpaces: collapse runs of spaces into a single space.

    Parameters
    ----------
    rawText : iterable of str
        Texts to clean (e.g. a list or a pandas Series).

    Returns
    -------
    list of str
        Cleaned texts in input order. Texts that end up as the empty string
        are omitted, so the result can be shorter than the input; callers
        that assign the result index-aligned must account for that.
    """
    cleanedText = []
    for x in rawText:

        # Always start from the current item, so that skipping the HTML step
        # neither fails nor silently reuses the previous item's text
        procText = x

        # Remove HTML
        if removeHTML:
            procText = BeautifulSoup(x,'html.parser').get_text()

        # Remove punctuation and other special characters
        if len(charsToRemove)>0:
            procText = re.sub(charsToRemove,' ',procText)

        # Remove numbers
        if removeNumbers:
            procText = re.sub(r'\d+',' ',procText)

        # Remove line breaks
        if removeLineBreaks:
            procText = procText.replace('\n',' ').replace('\r', '')

        # Remove special characters
        if len(specialCharsToRemove)>0:
            procText = re.sub(specialCharsToRemove,' ',procText)

        # Normalize to lower case
        if convertToLower:
            procText = procText.lower()

        # Replace multiple consecutive spaces with just one space
        if removeConsecutiveSpaces:
            procText = re.sub(' +', ' ', procText)

        # If there is a text, add it to the clean text
        if procText != '':
            cleanedText.append(procText)
    return cleanedText

In [5]:
# Tokenize texts
def tokenize_words(texts):
    """Split each text into word tokens with NLTK's word_tokenize.

    Returns a list with one token list per input text, in input order.
    """
    # Every text contributes exactly one (possibly empty) token list
    return [word_tokenize(text) for text in texts[:]]

In [6]:
# Function to recreate text from words
def recreateText(words):
    """Join each list of tokens back into one space-separated string.

    Returns a list with one string per input token list, in input order.
    """
    return [' '.join(tokens) for tokens in words[:]]

In [7]:
# Function to break texts into sentences
def tokenize_sentences(texts):
    """Split each text into sentences with NLTK's sent_tokenize.

    Returns a list with one list of sentences per input text, in input order.
    """
    return [sent_tokenize(text) for text in texts[:]]

In [8]:
# Function to remove stop words
def removeStopWords(texts, stop_words):
    """Drop every token found in stop_words from each token list.

    Returns a list with one filtered token list per input token list,
    in input order; token order inside each list is kept.
    """
    return [
        [token for token in tokens[:] if token not in stop_words]
        for tokens in texts[:]
    ]

### Analysis

In [9]:
# Because a review can express multiple opinions, let's analyze opinions by sentence

# Break each review into its sentences (result: one list of sentences per review)
listOfSentences = tokenize_sentences(ds.RevDescription)

In [10]:
# Build a DataFrame holding only the preprocessed review descriptions
# (punctuation and numbers are kept here; see the keyword arguments)
ppText = textPreProcess(
    ds.RevDescription,
    charsToRemove='',
    removeLineBreaks=False,
    removeNumbers=False,
)
processedReviews = pd.DataFrame(ppText, index=ds.index, columns=['PreProcessedText'])

In [11]:
# Check first review
# Positional access: after dropping the non-English rows the index label 0 is not
# guaranteed to exist, and listOfSentences is indexed by position as well
ds.RevDescription.iloc[0]

'Hotel is  so  centrally located with  bars and restaurants all  within a few minutes walk. Taxis,beach etc. literally on the doorstep. Breakfast excellent. Staff very  friendly and helpful. Will definitely be going  back.þ No  socket  at worktop for electric  kettle so had to place  it on the floor to boil Electric hob not  in use. Shortage of basic  kitchen items to make snack but  we  managed ok.'

In [12]:
# Sentences of the first review (first entry of the per-review sentence lists)
listOfSentences[0]

['Hotel is  so  centrally located with  bars and restaurants all  within a few minutes walk.',
 'Taxis,beach etc.',
 'literally on the doorstep.',
 'Breakfast excellent.',
 'Staff very  friendly and helpful.',
 'Will definitely be going  back.þ No  socket  at worktop for electric  kettle so had to place  it on the floor to boil Electric hob not  in use.',
 'Shortage of basic  kitchen items to make snack but  we  managed ok.']

In [13]:
# Flatten the per-review sentence lists into a DataFrame with one sentence per row
flatSentences = [sentence for reviewSentences in listOfSentences for sentence in reviewSentences]
sentences = pd.DataFrame(flatSentences, columns=['BaseText'])

In [14]:
# Add a column with the review ID: each ID is repeated once per sentence of its review
sentencesPerReview = [len(reviewSentences) for reviewSentences in listOfSentences]
sentences['RevID'] = np.repeat(ds['RevID'].values, sentencesPerReview)

In [15]:
# Preprocess the text of every sentence using textPreProcess's default options
sentences['PreProcessedText'] = textPreProcess(sentences['BaseText'])

  ' Beautiful Soup.' % markup)


In [16]:
# Get words: split each preprocessed sentence into a list of word tokens
sentences['Words'] =  tokenize_words(sentences['PreProcessedText'])

In [17]:
# Remove stopwords, using NLTK's English stop word list
# (kept as a set so each membership check is a hash lookup)
stop_words = set(stopwords.words('english'))
sentences['WordsCleaned'] = removeStopWords(sentences['Words'], stop_words)

In [18]:
# Recreate each sentence, without stopwords, as a single space-separated string
sentences['ProcessedText'] = recreateText(sentences['WordsCleaned'])

In [19]:
# Create the VADER sentiment analysis object used to score the sentences
analyser = SentimentIntensityAnalyzer()

In [20]:
# As a test, score the first sentence of the first review
# Scales:
#   compound: -1 = most extreme negative, 1 = most extreme positive
#     positive: compound >= 0.05
#     neutral: -0.05 < compound < 0.05
#     negative: compound <= -0.05
#   pos, neu, neg: proportion of the text that is positive, neutral or negative
firstSentence = sentences['ProcessedText'][0]
score = analyser.polarity_scores(firstSentence)
print(firstSentence,score)

hotel centrally located bars restaurants within minutes walk {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}


In [21]:
# Score every sentence and keep only the compound value as its sentiment
all_scores = [analyser.polarity_scores(text) for text in sentences['ProcessedText'][:]]
sentences['Sentiment'] = [scores['compound'] for scores in all_scores]

In [22]:
# Compute each review's sentiment as the mean sentiment of its sentences.
# observed=False is stated explicitly: when RevID is categorical, every known
# review ID then gets a row (NaN when it has no sentences), so the lookup below
# cannot miss an ID and does not depend on the pandas default for `observed`.
meanByReview = sentences.groupby('RevID', observed=False)['Sentiment'].mean()

# Consider reviews with no result as neutral (0)
meanByReview = meanByReview.fillna(0)

# Add column Sentiment to the reviews DataFrame (looked up by review ID, in ds row order)
ds['Sentiment'] = meanByReview[ds['RevID']].values

In [23]:
# Assign a qualitative evaluation to the review
# Right-closed bins: (-1.1, -0.05] Negative, (-0.05, 0.05] Neutral, (0.05, 1] Positive
# NOTE(review): a mean of exactly 0.05 therefore lands in Neutral - confirm this is intended
bins = pd.IntervalIndex.from_tuples([(-1.1, -0.05), (-0.05, 0.05), (0.05, 1)], closed='right')
x = pd.cut(ds['Sentiment'].to_list(), bins)
# rename_categories returns a relabelled Categorical; assigning to `.categories`
# directly is no longer supported by current pandas
x = x.rename_categories(['Negative','Neutral','Positive'])
ds['Polarity'] = x

In [24]:
# Analysis examples:
# Mean review sentiment by hotel (one row per HotelID)
ex1 = ds.groupby('HotelID')['Sentiment'].mean().to_frame()
ex1

Unnamed: 0_level_0,Sentiment
HotelID,Unnamed: 1_level_1
1,0.360697
10,0.376747
11,0.323097
12,0.339835
13,0.369670
...,...
65,0.478092
66,0.452380
7,0.240772
8,0.339550


In [25]:
# Analysis examples:
# Mean review sentiment by hotel type and stars (group keys kept as regular columns)
ex2 = ds[['HotelType','HotelStars','Sentiment']].groupby(['HotelType','HotelStars'], as_index=False).mean()
ex2

Unnamed: 0,HotelType,HotelStars,Sentiment
0,City,2,0.236023
1,City,3,0.334447
2,City,4,0.343238
3,City,5,0.324633
4,Resort,2,0.302441
5,Resort,3,0.330928
6,Resort,4,0.406733
7,Resort,5,0.369126
