##### Social Media Analytics
### Introduction to Text Mining
## Keywords extraction (using RAKE method)
(c) Nuno Antonio 2019-2022 v1.02

### Initial setup

In [1]:
# Import packages
import csv
import re

import nltk
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from rake_nltk import Rake

In [2]:
# Load the dataset from the parquet file, reading it with the fastparquet engine
ds = pd.read_parquet(path="DailyMail.parquet.snappy", engine="fastparquet")

### Functions

In [3]:
# Text preprocessing
def textPreProcess(
    rawText,
    removeHTML=True,
    charsToRemove=r"\?|\.|\!|\;|\.|\"|\,|\(|\)|\&|\:|\-",
    removeNumbers=True,
    removeLineBreaks=False,
    specialCharsToRemove=r"[^\x00-\xfd]",
    convertToLower=True,
    removeConsecutiveSpaces=True,
):
    """Clean a piece of text through a sequence of optional steps.

    The steps run in this fixed order: HTML removal, punctuation removal,
    number removal, line-break removal, special-character removal,
    lower-casing and collapsing of consecutive spaces.

    Parameters
    ----------
    rawText : str or any
        Text to clean. Values that are not strings (e.g. None or NaN) are
        returned unchanged.
    removeHTML : bool
        Parse the text with BeautifulSoup ("html.parser") and keep only the
        extracted text.
    charsToRemove : str or None
        Regular expression; every match is replaced by a space. An empty
        string or None disables the step.
    removeNumbers : bool
        Replace every run of digits by a space.
    removeLineBreaks : bool
        Replace line feeds by a space and drop carriage returns.
    specialCharsToRemove : str or None
        Regular expression; every match is replaced by a space. The default
        matches any character above U+00FD. An empty string or None disables
        the step.
    convertToLower : bool
        Convert the text to lower case.
    removeConsecutiveSpaces : bool
        Collapse every run of space characters into a single space.

    Returns
    -------
    str or any
        The cleaned text, or ``rawText`` itself when it is not a string.
    """
    # isinstance (rather than an exact type comparison) also accepts str
    # subclasses such as numpy.str_
    if not isinstance(rawText, str):
        return rawText
    procText = rawText

    # Remove HTML
    if removeHTML:
        procText = BeautifulSoup(procText, "html.parser").get_text()

    # Remove punctuation and other special characters
    # (truthiness check: both "" and None switch the step off)
    if charsToRemove:
        procText = re.sub(charsToRemove, " ", procText)

    # Remove numbers
    if removeNumbers:
        procText = re.sub(r"\d+", " ", procText)

    # Remove line breaks
    if removeLineBreaks:
        procText = procText.replace("\n", " ").replace("\r", "")

    # Remove special characters
    if specialCharsToRemove:
        procText = re.sub(specialCharsToRemove, " ", procText)

    # Normalize to lower case
    if convertToLower:
        procText = procText.lower()

    # Replace multiple consecutive spaces with just one space
    if removeConsecutiveSpaces:
        procText = re.sub(" +", " ", procText)

    return procText

In [4]:
# Tokenize words
def tokenize_words(words):
    """Split a text into word tokens with ``word_tokenize``.

    Parameters
    ----------
    words : str or any
        Text to tokenize.

    Returns
    -------
    list or float
        The list of tokens, or ``np.nan`` when ``words`` is not a string or
        when tokenization yields no tokens.
    """
    if not isinstance(words, str):
        return np.nan

    # Tokenize only once and reuse the result
    tokens = word_tokenize(words)

    # word_tokenize returns a list, so "nothing to tokenize" shows up as an
    # empty list (comparing the result with "" can never be true)
    if not tokens:
        return np.nan
    return tokens

In [5]:
# Function to create text from words
def recreateText(words):
    """Join a list of words into one string separated by single spaces.

    Anything whose type is not exactly ``list`` yields ``np.nan``.
    """
    if type(words) is not list:
        return np.nan
    return " ".join(words)

### Analysis

In [6]:
# Build a one-column dataframe with the pre-processed `text` column of ds
# (punctuation and numbers are kept; the remaining textPreProcess defaults apply)
preprocessedText = ds.text.apply(textPreProcess, charsToRemove="", removeNumbers=False)
processedReviews = pd.DataFrame(
    preprocessedText.values,
    ds.index,
    ["PreProcessedText"],
)

In [7]:
# Remove rows with empty or missing text
processedReviews["PreProcessedText"] = processedReviews["PreProcessedText"].str.strip()

# NaN != "" evaluates to True, so missing values must be excluded explicitly;
# otherwise they pass this filter and reach the later string operations as NaN
hasText = processedReviews["PreProcessedText"].notna() & (
    processedReviews["PreProcessedText"] != ""
)
processedReviews = processedReviews[hasText]

In [12]:
# Find specific terms
# The terms are joined as regular-expression alternatives, so any regex
# metacharacter inside a term is interpreted as such
termsToSearch = ["last year"]
searchList = re.compile("|".join(termsToSearch))

# Get words from sentences
listOfWords = processedReviews.PreProcessedText.apply(tokenize_words)

# Reconstruct sentences from the tokens (joined by single spaces)
ppText = listOfWords.apply(recreateText)

# Look for search terms in sentences and collect the sentences that match.
# Each sentence is padded with one leading and one trailing space before the
# search, so a term written with surrounding spaces can match at either end.
ppText_searched = []
for review in ppText:
    # tokenize_words / recreateText return np.nan for non-string input;
    # concatenating that with " " would raise TypeError, so skip it
    if not isinstance(review, str):
        continue
    if searchList.search(" " + review + " "):
        ppText_searched.append(review)

In [13]:
# Preview the first three texts that matched the search terms
ppText_searched[:3]

["wagner group boss yevgeny prigozhin said today his mercenaries would fight on in ukraine if left alone without having to rely on the 'clowns ' running the russian army.prigozhin , who continues to butt heads with russian officials , said at a training camp on thursday that he would pull his forces out 'if the whole chain [ of command ] is 100 per cent failed and will only be led by clowns who turn people into meat . 'the group 's leader , who was celebrating his 62nd birthday , has offered up his forces in brutal human wave strikes into ukraine since the war started , but has clashed with russian commanders over strategy and 'withheld ' supplies needed to sustain his assaults.he also confirmed today that his men would finally leave the eastern city of bakhmut on june 5 after handing it over to the russian army . the wagner group captured the besieged city after months of relentless shelling and surrounding trench warfare.speaking to russian reporters , prigozhin looked on across a ni