##### Social Media Analytics
### Introduction to Text Mining
## Keywords extraction (using RAKE method)
(c) Nuno Antonio 2019-2022 v1.02

### Initial setup

In [1]:
# Import packages
import csv
import re

import nltk
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from rake_nltk import Rake



In [2]:
# Load dataset
ds = pd.read_parquet("sputnik.parquet.snappy", engine='fastparquet')

# Target dtype for each column
dtypes = {
    'title': 'category',
    'author': 'category',
    'date': 'datetime64[ns]',
    'text': 'category',
    'comments': 'object',
}

# Parse 'date' into datetimes first, then cast all listed columns in one pass
# NOTE(review): later cells read ds.Language and ds.RevDescription, which are not
# among the columns cast here - confirm this is the dataset those cells expect.
ds = ds.assign(date=pd.to_datetime(ds['date'])).astype(dtypes)

FileNotFoundError: [Errno 2] No such file or directory: 'sputnik.parquet.snappy'

In [3]:
# Drop non-English reviews
# A boolean mask selects rows by position, so it stays correct even when the
# index holds repeated labels (dropping by index label would then also remove
# English rows that share a label with a non-English one).
# .copy() yields an independent frame, so later column assignments on ds do
# not trigger SettingWithCopyWarning.
ds = ds.loc[ds.Language == 'English'].copy()

### Functions

In [4]:
# Text preprocessing
_STOP_WORDS_CACHE = {}


def _getStopWords(language='english'):
    """Return the NLTK stop-word list for `language` as a set, loading it at most once."""
    if language not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE[language] = set(stopwords.words(language))
    return _STOP_WORDS_CACHE[language]


def textPreProcess(rawText, removeHTML=True, charsToRemove=r"\?|\.|\!|\;|\.|\"|\,|\(|\)|\&|\:|\|[0-9]|--| [ ] |'s |said|says|also|according|Ukrainian|Ukraine|US|Russian|Russia|would", removeNumbers=True, removeLineBreaks=False, specialCharsToRemove=r'[^\x00-\xfd]', convertToLower=True, removeConsecutiveSpaces=True, removeStopWords=True):
    """Clean a piece of text through a fixed sequence of optional steps.

    The steps run in this order: strip HTML, replace `charsToRemove` matches,
    replace digit runs, flatten line breaks, replace `specialCharsToRemove`
    matches, lower-case, collapse runs of spaces, remove English stop words.

    Parameters
    ----------
    rawText : str or any
        Text to clean. Anything that is not a string is returned unchanged.
    removeHTML : bool
        Extract the plain text with BeautifulSoup's 'html.parser'.
    charsToRemove : str
        Regular expression; every match is replaced by one space. An empty
        value disables the step. It is applied before lower-casing, so the
        word alternatives in the default pattern are case-sensitive; they are
        also not anchored to word boundaries.
    removeNumbers : bool
        Replace every run of digits by one space.
    removeLineBreaks : bool
        Replace newlines by a space and delete carriage returns.
    specialCharsToRemove : str
        Regular expression; every match is replaced by one space. An empty
        value disables the step.
    convertToLower : bool
        Lower-case the text.
    removeConsecutiveSpaces : bool
        Collapse each run of spaces into a single space.
    removeStopWords : bool
        Tokenize the text, drop English stop words and re-join the remaining
        tokens with single spaces. When False the text is returned without
        being tokenized.

    Returns
    -------
    str or any
        The cleaned text, or `rawText` itself when it is not a string.
    """
    # NOTE(review): the default charsToRemove contains `\|[0-9]`, which matches a
    # literal pipe followed by a digit, and lists `\.` twice - confirm the intent.
    if not isinstance(rawText, str):
        return rawText
    procText = rawText

    # Remove HTML
    if removeHTML:
        procText = BeautifulSoup(procText, 'html.parser').get_text()

    # Remove punctuation and other special characters
    # (truthiness check also tolerates None, which len() would reject)
    if charsToRemove:
        procText = re.sub(charsToRemove, ' ', procText)

    # Remove numbers
    if removeNumbers:
        procText = re.sub(r'\d+', ' ', procText)

    # Remove line breaks
    if removeLineBreaks:
        procText = procText.replace('\n', ' ').replace('\r', '')

    # Remove special characters
    if specialCharsToRemove:
        procText = re.sub(specialCharsToRemove, ' ', procText)

    # Normalize to lower case
    if convertToLower:
        procText = procText.lower()

    # Replace multiple consecutive spaces with just one space
    if removeConsecutiveSpaces:
        procText = re.sub(' +', ' ', procText)

    # Remove stop words (the stop-word set is cached, not reloaded per call)
    if removeStopWords:
        stop_words = _getStopWords('english')
        word_tokens = word_tokenize(procText)
        filtered_text = [word for word in word_tokens if word.casefold() not in stop_words]
        procText = ' '.join(filtered_text)

    return procText

In [5]:
# Tokenize words
def tokenize_words(words):
    """Split a text into word tokens.

    Parameters
    ----------
    words : str or any
        Text to tokenize with NLTK's word_tokenize.

    Returns
    -------
    list of str or float
        The token list, or np.nan when `words` is not a string or yields no
        tokens (for example an empty or whitespace-only string).
    """
    if not isinstance(words, str):
        return np.nan
    # Tokenize once; the result is a list, so emptiness is tested on the list
    # itself (comparing it with '' can never be true).
    tokens = word_tokenize(words)
    return tokens if tokens else np.nan

In [6]:
# Function to create text from words
def recreateText(words):
    """Join a list of tokens into one space-separated string.

    Returns np.nan for any input whose type is not exactly `list`.
    """
    if type(words) is not list:
        return np.nan
    return ' '.join(words)

### Analysis

In [7]:
# Create a dataframe with only the pre-processed description.
# charsToRemove='' and removeNumbers=False switch off the punctuation and
# digit removal steps of textPreProcess; its other steps keep their defaults.
processedReviews = pd.DataFrame(
    data=ds.RevDescription.apply(
        textPreProcess,
        charsToRemove='',
        removeNumbers=False,
    ).values,
    index=ds.index,
    columns=['PreProcessedText'],
)



In [8]:
# Trim surrounding whitespace, then discard the rows whose text is empty
processedReviews['PreProcessedText'] = processedReviews['PreProcessedText'].str.strip()
processedReviews = processedReviews[processedReviews['PreProcessedText'].ne('')]

In [9]:
# Find specific terms
termsToSearch = ['hygiene', 'clean', 'safe']
# Escape each term so it is matched literally even if it contains regex metacharacters
searchList = re.compile('|'.join(re.escape(term) for term in termsToSearch))

# Get words from sentences
listOfWords = processedReviews.PreProcessedText.apply(tokenize_words)

# Reconstruct sentences from the tokens
ppText = listOfWords.apply(recreateText)

# Look for search terms in sentences and present them.
# recreateText yields NaN (a float) for rows that could not be tokenized; those
# rows are skipped, because concatenating a float with ' ' raises TypeError.
# Each review is padded with one space on both sides before searching, so a
# term written with surrounding spaces can still match at the start or the
# end of a review.
ppText_searched = [
    review
    for review in ppText
    if isinstance(review, str) and searchList.search(' ' + review + ' ')
]

In [10]:
# Show the first 3 matching reviews
# (search terms: 'hygiene', 'clean', 'safe')
ppText_searched[:3]

['hotel was clean and the staff helpful and friendly generally noisy , no atmosphere and further from the beach than it originally looked on the photos . the bar and and reception area lacking in any sort of atmosphere .',
 'an excellent hotel lovely breakfast clean towels every day our room was cleaned every day',
 'good clean hotel , in great location . a room overlooking the beach gave a beautiful outlook . ordinary breakfast']

In [11]:
# RAKE method - in English
# Create the keyword extractor, passing 'english' as its language setting
r = Rake(language='english')

In [12]:
# Keywords extraction over all reviews: every text is passed in one call, so the
# ranking below covers the whole collection rather than each review separately.
# Only string entries are kept and they are handed over as a plain list, so
# missing values (NaN) in the column cannot reach the extractor.
r.extract_keywords_from_sentences(
    [text for text in processedReviews['PreProcessedText'] if isinstance(text, str)]
)

# Show only the 20 highest-ranked phrases instead of dumping the full list
r.get_ranked_phrases()[:20]

['para um hotel 4 estrelas estão muito mal servidos de louça e têm um grave problema de formigas',
 'categories arecipriani venicevilla cimbrone ravello italycopocabana beach rio de janeirolizard island australiaice hotel kiruna swedenpera palace istanbulso',
 'pao ok ... mas por favor fiambre de peru e otros ... sao de muita muita fraca qualidade',
 'muy muy buenos !!! las camas suplementarias que nos dieron estaban en el salon de la primera planta',
 'por fin un hotel con difusor para el secador ... las que tenéis el pelo rizado',
 'un petit peu de laisser aller du jeune personnel et des toilettes pas nettoyées',
 'accès direct à la plage et grande amabilité du personnel discret et à disposition du client',
 'ainsi si vous voulez vous rendre en ville cela vous coûtera 60 euros',
 'el tamaño de los apartamentos los servicios son limitados para familias con niños',
 'birthday great suprise clean rooms happy staff friendly staff great food exceptional puddings amazing housekeeping large