##### Social Media Analytics
### Introduction to Text Mining
## Text Annotation
(c) Nuno Antonio 2019-2022 v1.02

### Initial setup

In [1]:
# Import packages
import csv
import pandas as pd
import numpy as np
import nltk 
import re
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize

In [2]:
# Load dataset
# Column types applied while reading the pipe-separated reviews file
dtypes = {'RevID':'category','Source':'category','HotelID':'category',
  'HotelType':'category','HotelStars':'category','ObsDateGlobalRating':'float64',
  'Language':'category','RevUserName':'category','RevUserLocation':'category','RevOverallRating':'float64'}
# on_bad_lines='warn' replaces the deprecated error_bad_lines=False (removed in pandas 2.0):
# rows with an unexpected number of fields are skipped and a warning is emitted for each.
# read_csv already returns a DataFrame, so no extra pd.DataFrame(...) wrapper is needed.
ds = pd.read_csv("HotelOnlineReviews.txt", sep="|",
  on_bad_lines='warn', dtype=dtypes, decimal=',', index_col='RevID')



  ds = pd.DataFrame(pd.read_csv("HotelOnlineReviews.txt",sep="|",
b'Skipping line 12799: expected 21 fields, saw 23\n'
b'Skipping line 37247: expected 21 fields, saw 22\n'


### Functions

In [3]:
# Text preprocessing
def textPreProcess(rawText, removeHTML=True, charsToRemove = r'\?|\.|\!|\;|\.|\"|\,|\(|\)|\&|\:|\-', removeNumbers=True, removeLineBreaks=False, specialCharsToRemove = r'[^\x00-\xfd]', convertToLower=True, removeConsecutiveSpaces=True):
    """Clean a piece of raw text through a fixed sequence of optional steps.

    Steps run in this order: HTML stripping, punctuation removal, digit
    removal, line-break removal, special-character removal, lower-casing
    and collapsing of repeated spaces.

    Parameters:
        rawText: the text to clean. Values that are not exactly of type
            str are returned unchanged.
        removeHTML: when true, parse the text with BeautifulSoup
            ('html.parser') and keep only the extracted text.
        charsToRemove: regular expression; every match is replaced by a
            single space. An empty pattern skips this step.
        removeNumbers: when true, each run of digits becomes one space.
        removeLineBreaks: when true, newlines become spaces and carriage
            returns are deleted.
        specialCharsToRemove: regular expression; every match is replaced
            by a single space. An empty pattern skips this step.
        convertToLower: when true, the text is lower-cased.
        removeConsecutiveSpaces: when true, runs of spaces are collapsed
            into one space (leading/trailing spaces are not stripped).

    Returns:
        The cleaned string, or rawText itself when it is not a str.
    """
    # Non-string values are handed back untouched
    if type(rawText) is not str:
        return rawText

    # Step 1: strip HTML markup
    cleaned = BeautifulSoup(rawText, 'html.parser').get_text() if removeHTML else rawText

    # Step 2: punctuation and similar characters -> space
    if len(charsToRemove) != 0:
        cleaned = re.sub(charsToRemove, ' ', cleaned)

    # Step 3: digit runs -> space
    if removeNumbers:
        cleaned = re.sub(r'\d+', ' ', cleaned)

    # Step 4: line breaks
    if removeLineBreaks:
        cleaned = cleaned.replace('\n', ' ')
        cleaned = cleaned.replace('\r', '')

    # Step 5: special characters -> space
    if len(specialCharsToRemove) != 0:
        cleaned = re.sub(specialCharsToRemove, ' ', cleaned)

    # Step 6: case normalisation
    if convertToLower:
        cleaned = cleaned.lower()

    # Step 7: squeeze repeated spaces
    if removeConsecutiveSpaces:
        cleaned = re.sub(' +', ' ', cleaned)

    return cleaned

In [4]:
# Tokenize words
def tokenize_words(words):
    """Split a text into word tokens with NLTK's word_tokenize.

    Parameters:
        words: the text to tokenize.

    Returns:
        The list of tokens, or np.nan when the input is not a string or
        when tokenization yields no tokens (e.g. an empty string).
    """
    if not isinstance(words, str):
        return np.nan
    # Tokenize once; word_tokenize returns a list, so "no tokens" is an empty
    # list (the previous comparison against '' could never be true).
    tokens = word_tokenize(words)
    return tokens if tokens else np.nan

### Analysis

In [5]:
# Build a one-column frame with the preprocessed review descriptions, sharing ds's index
processedReviews = pd.DataFrame(
    {'PreProcessedText': ds['RevDescription'].apply(textPreProcess).values},
    index=ds.index,
)



In [6]:
# Add a 'Words' column holding the token list of each preprocessed text
processedReviews['Words'] = processedReviews['PreProcessedText'].map(tokenize_words)

#### English

In [7]:
# English part-of-speech tagging with NLTK on the tokens of review 'T4617'
tags = nltk.pos_tag(processedReviews['Words'].loc['T4617'])
print(tags)

[('we', 'PRP'), ('stayed', 'VBD'), ('nights', 'NNS'), ('at', 'IN'), ('this', 'DT'), ('resort', 'NN'), ('in', 'IN'), ('july/august', 'NN'), ('we', 'PRP'), ('stayed', 'VBD'), ('in', 'IN'), ('a', 'DT'), ('suitte', 'NN'), ('appartment', 'NN'), ('which', 'WDT'), ('was', 'VBD'), ('very', 'RB'), ('nice', 'JJ'), ('the', 'DT'), ('appartment', 'NN'), ('had', 'VBD'), ('two', 'CD'), ('floors', 'NNS'), ('and', 'CC'), ('everything', 'NN'), ('needed', 'VBN'), ('for', 'IN'), ('a', 'DT'), ('nice', 'JJ'), ('vacation', 'NN'), ('the', 'DT'), ('staff', 'NN'), ('was', 'VBD'), ('friendly', 'RB'), ('and', 'CC'), ('service', 'VB'), ('good', 'JJ'), ('at', 'IN'), ('this', 'DT'), ('buy', 'NN'), ('time', 'NN'), ('of', 'IN'), ('year', 'NN'), ('we', 'PRP'), ('found', 'VBD'), ('the', 'DT'), ('common', 'JJ'), ('area', 'NN'), ('with', 'IN'), ('pools', 'NNS'), ('etc', 'VBP'), ('a', 'DT'), ('little', 'JJ'), ('bit', 'NN'), ('to', 'TO'), ('small', 'JJ'), ('and', 'CC'), ('crowded', 'VBD'), ('the', 'DT'), ('gym', 'NN'), ('co

In [8]:
# Keep only the words tagged as nouns (tags whose first letter is "N")
nouns = [word for word, pos in tags if pos[0] == "N"]
print(nouns)

['nights', 'resort', 'july/august', 'suitte', 'appartment', 'appartment', 'floors', 'everything', 'vacation', 'staff', 'buy', 'time', 'year', 'area', 'pools', 'bit', 'gym', 'gym', 'people', 'crowd', 'resort', 'location', 'quality', 'buildings', 'restaurant', 'restaurants', 'favorites', 'area', 'resort', 'restaurant', 'carvoeiro', 'bon', 'bon', 'restaurant', 'carvoeiro', 'village', 'lot', 'restaurant', 'pictures', 'area', 'algarve']


#### Spanish

In [9]:
# Spanish part-of-speech tagging with spaCy on review 'T7883'
import spacy    # May require installation
# The language model must be downloaded first: python -m spacy download es_core_news_sm
# More models in https://spacy.io/models
nlp = spacy.load("es_core_news_sm")
result = nlp(processedReviews['PreProcessedText'].loc['T7883'])
# One line per token: the token text followed by its coarse POS label
for token in result:
    print(f"{token} {token.pos_}")

el DET
enclave NOUN
del ADP
hotel NOUN
es AUX
tan ADV
espectacular ADJ
que SCONJ
las DET
vistas NOUN
quitan AUX
la DET
respiración NOUN
la DET
piscina NOUN
es AUX
una DET
auténtica ADJ
gozada ADJ
las DET
habitaciones NOUN
son VERB
grandes ADJ
y CCONJ
tranquilas ADJ
lo PRON
que SCONJ
las PRON
hace AUX
cómodas ADJ
pero CCONJ
la DET
decoración NOUN
y CCONJ
los DET
baños NOUN
están VERB
realmente ADV
anticuados ADJ
el DET
desayuno NOUN
nos PRON
resultó VERB
un DET
poco ADV
decepcionante ADJ
no ADV
por ADP
la DET
cantidad NOUN
sino CCONJ
por ADP
la DET
calidad NOUN
todo PRON
es AUX
bastante ADV
corriente ADJ
y CCONJ
sorprende VERB
por ADP
ejemplo NOUN
tener VERB
solo ADV
un DET
tipo NOUN
de ADP
queso NOUN
cuando SCONJ
hasta ADV
en ADP
el DET
supermercado NOUN
te PRON
desborda VERB
la DET
variedad NOUN
de ADP
quesos NOUN
de ADP
la DET
zona NOUN
y CCONJ
patés VERB


In [10]:
# Keep only the tokens whose coarse POS label is ADJ
adjectives = [token for token in result if token.pos_ == "ADJ"]
print(adjectives)

[espectacular, auténtica, gozada, grandes, tranquilas, cómodas, anticuados, decepcionante, corriente]


#### Portuguese

In [11]:
# Portuguese part-of-speech tagging with spaCy on review 'T4914'
# The language model must be downloaded first: python -m spacy download pt_core_news_sm
# More models in https://spacy.io/models
nlp = spacy.load("pt_core_news_sm")
result = nlp(processedReviews['PreProcessedText'].loc['T4914'])
# One line per token: the token text followed by its coarse POS label
for token in result:
    print(f"{token} {token.pos_}")

é AUX
um DET
hotel NOUN
bastante ADV
bom ADJ
para ADP
famílias NOUN
com ADP
crianças NOUN
as DET
moradias NOUN
individuais ADJ
permitem VERB
estar AUX
mais DET
vontade NOUN
e CCONJ
assim ADV
gozar VERB
mais ADV
os DET
espaço NOUN
é AUX
um DET
pouco ADV
afastado VERB
da DET
praia NOUN
mas CCONJ
compensa VERB
pela DET
calma NOUN
e CCONJ
sossego VERB
o DET
pessoal NOUN
é AUX
muito ADV
agradável ADJ
e CCONJ
prestável ADJ
a DET
comida NOUN
é AUX
boa ADJ
e CCONJ
as DET
acomodações NOUN
modernas ADJ
tem VERB
apenas ADV
um NUM
senão ADV
o DET
barulho NOUN
das DET
rãs ADV
durante ADP
a DET
noite NOUN


In [12]:
# Keep only the tokens whose coarse POS label is VERB
verbs = [token for token in result if token.pos_ == "VERB"]
print(verbs)

[permitem, gozar, afastado, compensa, sossego, tem]
