##### Social Media Analytics
### Introduction to Text Mining
## Keyword extraction (using the RAKE method)
(c) Nuno Antonio 2019-2022 v1.02

### Initial setup

In [1]:
# Import packages
import csv
import pandas as pd
import numpy as np
import nltk
import re
from nltk.tokenize import word_tokenize
import re
from bs4 import BeautifulSoup
from rake_nltk import Rake

In [2]:
# Load the source dataset from the Parquet file using the fastparquet engine
ds = pd.read_parquet(
    "sputnik.parquet.snappy",
    engine="fastparquet",
)

### Functions

In [3]:
# Text preprocessing
def textPreProcess(rawText, removeHTML=True, charsToRemove = r'\?|\.|\!|\;|\.|\"|\,|\(|\)|\&|\:|\-', removeNumbers=True, removeLineBreaks=False, specialCharsToRemove = r'[^\x00-\xfd]', convertToLower=True, removeConsecutiveSpaces=True):
    r"""Clean a raw text value through a fixed sequence of optional steps.

    The steps always run in this order, each one controlled by its own
    argument: HTML stripping, punctuation removal, number removal, line-break
    removal, special-character removal, lower-casing, space collapsing.

    Parameters
    ----------
    rawText : object
        Text to clean. Any value that is not a ``str`` (or ``str`` subclass)
        is returned unchanged, so missing values pass straight through.
    removeHTML : bool
        If True, parse the text with BeautifulSoup ('html.parser') and keep
        only the extracted text.
    charsToRemove : str or None
        Regular expression; every match is replaced by a space. An empty
        string or None skips this step.
    removeNumbers : bool
        If True, every run of digits is replaced by a single space.
    removeLineBreaks : bool
        If True, '\n' is replaced by a space and '\r' is deleted.
    specialCharsToRemove : str or None
        Regular expression; every match is replaced by a space. An empty
        string or None skips this step.
    convertToLower : bool
        If True, the text is lower-cased.
    removeConsecutiveSpaces : bool
        If True, runs of the space character are collapsed to one space
        (other whitespace such as tabs or newlines is not touched).

    Returns
    -------
    object
        The cleaned string, or ``rawText`` itself when it is not a string.
    """
    # isinstance (instead of an exact type comparison) so that str subclasses
    # are cleaned as well rather than silently returned unprocessed.
    if not isinstance(rawText, str):
        return rawText
    procText = rawText

    # Remove HTML
    if removeHTML:
        procText = BeautifulSoup(procText, 'html.parser').get_text()

    # Remove punctuation and other special characters
    # (truthiness test: skips '' and also None, which len() would reject)
    if charsToRemove:
        procText = re.sub(charsToRemove, ' ', procText)

    # Remove numbers
    if removeNumbers:
        procText = re.sub(r'\d+', ' ', procText)

    # Remove line breaks
    if removeLineBreaks:
        procText = procText.replace('\n', ' ').replace('\r', '')

    # Remove special characters
    if specialCharsToRemove:
        procText = re.sub(specialCharsToRemove, ' ', procText)

    # Normalize to lower case
    if convertToLower:
        procText = procText.lower()

    # Replace multiple consecutive spaces with just one space
    if removeConsecutiveSpaces:
        procText = re.sub(' +', ' ', procText)

    return procText

In [4]:
# Tokenize words
def tokenize_words(words):
    """Split a text string into word tokens using NLTK's ``word_tokenize``.

    Parameters
    ----------
    words : object
        Text to tokenize.

    Returns
    -------
    list of str or float
        The list of tokens, or ``np.nan`` when ``words`` is not a string or
        when tokenization produces no tokens (for instance an empty string).
    """
    if not isinstance(words, str):
        return np.nan
    # Tokenize once and reuse the result. word_tokenize returns a list, so
    # "no tokens" has to be detected as an empty list, not by comparing to ''.
    tokens = word_tokenize(words)
    return tokens if tokens else np.nan

In [5]:
# Function to create text from words
def recreateText(words):
    """Join a list of tokens back into one space-separated string.

    Any argument whose type is not exactly ``list`` yields ``np.nan``.
    """
    # Guard clause: only plain lists are joined
    if type(words) != list:
        return np.nan
    return ' '.join(words)

### Analysis

In [6]:
# Build a one-column dataframe ('PreProcessedText') from the pre-processed `text` column of ds,
# keeping the original index. Punctuation and numbers are left in place
# (charsToRemove='' and removeNumbers=False).
processedReviews = pd.DataFrame(
    data=ds.text.apply(textPreProcess, charsToRemove='', removeNumbers=False).values,
    index=ds.index,
    columns=['PreProcessedText'],
)

In [7]:
# Remove rows with empty or missing text
# Strip surrounding whitespace into a new frame (no in-place mutation, so the cell can be re-run safely)
processedReviews = processedReviews.assign(
    PreProcessedText=processedReviews.PreProcessedText.str.strip()
)
# Missing values (NaN) compare as "not equal" to '', so they must be excluded explicitly;
# otherwise they reach the later string concatenation and raise a TypeError.
processedReviews = processedReviews[
    processedReviews.PreProcessedText.notna() & (processedReviews.PreProcessedText != '')
]

In [8]:
# Find specific terms
termsToSearch = ['hygiene', 'clean', 'safe']
# Escape each term so it is matched literally even if it contains regex metacharacters
searchList = re.compile('|'.join(re.escape(term) for term in termsToSearch))

# Get words from sentences
listOfWords = processedReviews.PreProcessedText.apply(tokenize_words)

# Rebuild each review as a single string of space-separated tokens
ppText = listOfWords.apply(recreateText)

# Keep the reviews that contain any of the search terms.
# Terms are matched as substrings (e.g. 'clean' also matches 'cleaning'); each review is
# padded with one space on both sides before searching.
# Non-string entries (NaN produced by tokenize_words / recreateText) are skipped,
# because concatenating them with ' ' would raise a TypeError.
ppText_searched = [
    review
    for review in ppText
    if isinstance(review, str) and searchList.search(' ' + review + ' ')
]

In [9]:
# Preview the first 3 reviews that matched
# termsToSearch = ['hygiene', 'clean', 'safe']
ppText_searched[:3]

["in a statement for the newspaper , the nuclear emergency support team ( nest ) said that the network of atomic sensors was being deployed `` throughout the region '' and would have the ability `` to characterize the size , location and effects of any nuclear explosion . `` the sensors will be able to detect radiation from both a dirty bomb and a nuclear weapon activated in ukraine , the report said . the move would allegedly deny russia any opportunity to use nuclear weapons in ukraine without attribution , the report the statement as saying . the sensors will presumably serve as deterrence because they would make russian decision-makers aware the united states can expose the use of a nuclear weapon as a false-flag operation , the report cited the statement as saying . `` if a nuclear emergency were to occur in ukraine , whether a radiation release from a nuclear reactor or a nuclear weapon detonation , '' the statement said . `` scientific analyses would be rapidly provided to us go