##### Social Media Analytics
### Introduction to Text Mining
## Keywords extraction (using RAKE method)
(c) Nuno Antonio 2019-2021

### Initial setup

In [30]:
# Import packages
import csv
import pandas as pd
import numpy as np
import nltk
import re
from nltk.tokenize import word_tokenize
import re
from bs4 import BeautifulSoup
from rake_nltk import Rake

In [31]:
# Load dataset
# Identifier-like and descriptive columns are read as categoricals, ratings as floats
dtypes = {'RevID':'category','Source':'category','HotelID':'category',
  'HotelType':'category','HotelStars':'category','ObsDateGlobalRating':'float64',
  'Language':'category','RevUserName':'category','RevUserLocation':'category','RevOverallRating':'float64'}
# The file is pipe-separated and uses ',' as the decimal mark.
# on_bad_lines='warn' skips malformed lines (wrong number of fields) and reports them;
# it replaces error_bad_lines=False, which was deprecated in pandas 1.3 and removed in 2.0.
# read_csv already returns a DataFrame, so no extra pd.DataFrame(...) wrapper is needed.
ds = pd.read_csv("HotelOnlineReviews.txt", sep="|",
  on_bad_lines='warn', dtype=dtypes, decimal=',', index_col='RevID')

b'Skipping line 12799: expected 21 fields, saw 23\n'
b'Skipping line 37247: expected 21 fields, saw 22\n'


In [32]:
# Keep English reviews only: look up the index labels of every review whose
# Language is not 'English' and drop those labels from the dataset
ds = ds.drop(index=ds.loc[ds.Language != 'English'].index)

### Functions

In [33]:
# Text preprocessing
def textPreProcess(rawText, removeHTML=True, charsToRemove = r'\?|\.|\!|\;|\.|\"|\,|\(|\)|\&|\:|\-', removeNumbers=True, removeLineBreaks=False, specialCharsToRemove = r'[^\x00-\xfd]', convertToLower=True, removeConsecutiveSpaces=True):
    """Clean a collection of raw texts and return the non-empty results.

    Each text goes through the enabled steps in this order: HTML removal,
    punctuation removal, number removal, line-break removal, special-character
    removal, lower-casing and collapsing of consecutive spaces.

    Parameters:
        rawText: iterable of texts (e.g. a list or a pandas Series).
        removeHTML: if True, parse the text with BeautifulSoup and keep only its text content.
        charsToRemove: regex of characters replaced by a space; empty string disables the step.
        removeNumbers: if True, every run of digits is replaced by a space.
        removeLineBreaks: if True, '\\n' becomes a space and '\\r' is deleted.
        specialCharsToRemove: regex of characters replaced by a space (by default
            anything above code point 0xFD); empty string disables the step.
        convertToLower: if True, the text is lower-cased.
        removeConsecutiveSpaces: if True, runs of spaces are collapsed into one space.

    Returns:
        List with the cleaned texts. Texts that end up empty, as well as missing
        values (None / NaN), are left out, so the result can be shorter than the input.
    """
    cleanedText = []
    for x in rawText:

        # Missing values carry no text: skip them the same way empty texts are skipped
        # (x != x is only true for NaN)
        if x is None or (isinstance(x, float) and x != x):
            continue

        # Always start from the raw text, so the following steps have a defined input
        # even when removeHTML is False (previously procText was unset in that case)
        procText = x

        # Remove HTML
        if removeHTML:
            procText = BeautifulSoup(procText,'html.parser').get_text()

        # Remove punctuation and other special characters
        if charsToRemove:
            procText = re.sub(charsToRemove,' ',procText)

        # Remove numbers
        if removeNumbers:
            procText = re.sub(r'\d+',' ',procText)

        # Remove line breaks
        if removeLineBreaks:
            procText = procText.replace('\n',' ').replace('\r', '')

        # Remove special characters
        if specialCharsToRemove:
            procText = re.sub(specialCharsToRemove,' ',procText)

        # Normalize to lower case
        if convertToLower:
            procText = procText.lower()

        # Replace multiple consecutive spaces with just one space
        if removeConsecutiveSpaces:
            procText = re.sub(' +', ' ', procText)

        # If there is a text, add it to the clean text
        if procText != '':
            cleanedText.append(procText)
    return cleanedText

In [34]:
# Tokenize texts
def tokenize_words(texts):
    """Split every text into word tokens with NLTK's word_tokenize.

    Parameters:
        texts: iterable of strings (e.g. a list or a pandas Series).

    Returns:
        List with one token list per text that produced at least one token.
        Texts that yield no tokens are left out.
    """
    words_new = []
    for w in texts:
        w_token = word_tokenize(w)
        # word_tokenize returns a list, so check the list itself for emptiness;
        # the previous comparison against '' was always true and filtered nothing
        if w_token:
            words_new.append(w_token)
    return words_new

In [35]:
# Function to recreate text from words
def recreateText(words):
    text_new = []
    for w in (words[:]):
        temp_str = (' ').join(w)
        text_new.append(temp_str)
    return text_new

### Analysis

In [36]:
# Create a dataframe with only the description
# textPreProcess leaves out texts that end up empty, so calling it once on the whole
# column could return fewer items than there are rows and break the alignment with
# ds.index. Each description is therefore processed on its own and an empty result
# is kept as '' (rows with empty text are meant to be removed in a later step).
ppText = []
for description in ds.RevDescription:
    cleaned = textPreProcess([description], charsToRemove ='', removeNumbers=False)
    ppText.append(cleaned[0] if cleaned else '')
processedReviews = pd.DataFrame(data=ppText, index=ds.index, columns=['PreProcessedText'])

In [37]:
# Remove rows with empty text
# First trim leading/trailing whitespace, so texts made only of whitespace become ''
processedReviews['PreProcessedText'] = processedReviews['PreProcessedText'].str.strip()
# Then keep only the rows whose trimmed text is not the empty string
processedReviews = processedReviews.loc[processedReviews['PreProcessedText'] != '']

In [38]:
# Find specific terms
# The terms are combined into one alternation pattern; matching is by substring,
# so e.g. 'cleaned' also matches 'clean'
termsToSearch = ['hygiene', 'clean', 'safe']
searchList = re.compile('|'.join(termsToSearch))

# Split every review into tokens
listOfWords = tokenize_words(processedReviews.PreProcessedText)

# Rebuild each review as a single string with its tokens separated by one space
ppText = recreateText(listOfWords)

# Keep the reviews containing any of the search terms
# (each review is padded with one space on both sides before searching)
ppText_searched = [review for review in ppText if searchList.search(' '+ review +' ')]

In [39]:
# Show the first 3 reviews that matched the search terms
# (terms searched: 'hygiene', 'clean', 'safe')
ppText_searched[:3]

['hotel was clean and the staff helpful and friendly generally noisy , no atmosphere and further from the beach than it originally looked on the photos . the bar and and reception area lacking in any sort of atmosphere .',
 'an excellent hotel lovely breakfast clean towels every day our room was cleaned every day',
 'good clean hotel , in great location . a room overlooking the beach gave a beautiful outlook . ordinary breakfast']

In [40]:
# RAKE method - in English
# Create the Rake extractor used by the following cell
# (presumably `language` selects the stopword list - confirm against the rake_nltk docs)
r = Rake(language='english')

In [41]:
# Keywords extraction
# The whole PreProcessedText column is passed to a single Rake instance in one call,
# so the phrases below come from all reviews pooled together (not one list per review)
r.extract_keywords_from_sentences(processedReviews['PreProcessedText'])
# Last expression of the cell: displays the complete list of extracted phrases
# (presumably ordered from highest to lowest score - confirm against the rake_nltk docs)
r.get_ranked_phrases()

d us therehad',
 'sun sets ). breakfast offers great selection',
 'located beside metro station near city center',
 'lovley hotel .. entertainment really good ..',
 'chose room service .... big mistake !....',
 'incredible afternoon winds kept blowing debris everywhere',
 'well appointed kitchen area .. central pools',
 'minutes walk awayour room 2 floors',
 'el cortes inglez shopping centre',
 'nice location excellent friendly staff decor needs',
 'cheese available every night .. first time',
 '1 pacman machine ...), poor cleaning',
 'one big beds big rooms great staff',
 'brilliant stay would highly recommend ***** nothing',
 'usual hot food plus pancakes cooked fresh',
 'refunded short ... 12 days later !.',
 'lisbon day staff friendly expensive location far',
 'grassy area sun beds worked extremely hard',
 'extremely friendly helpful staff room facing street',
 'location near el cortez ingles',
 'el corté ingles department store',
 'great italian pizzeria within walking distance',
