##### Social Media Analytics
### Introduction to Text Mining
## Keywords extraction (using RAKE method)
(c) Nuno Antonio 2019-2022 v1.02

### Initial setup

In [1]:
# Import packages
import csv
import pandas as pd
import numpy as np
import nltk
import re
from nltk.tokenize import word_tokenize
import re
from bs4 import BeautifulSoup
from rake_nltk import Rake



In [2]:
# Load dataset
# The text-like columns are loaded with the 'category' dtype and 'date' is parsed as a datetime.
dtypes = {'title':'category','author':'category','text':'category'}
ds = pd.read_csv("CNNArticles.csv", sep=",",
                 # on_bad_lines='warn' skips malformed rows and reports them; it replaces
                 # error_bad_lines=False (deprecated in pandas 1.3, removed in pandas 2.0).
                 on_bad_lines='warn', dtype=dtypes, decimal=',',
                 index_col='Unnamed: 0', parse_dates=['date'])



  ds = pd.read_csv("CNNArticles.csv", sep=",",


### Functions

In [3]:
# Text preprocessing
def textPreProcess(
    rawText,
    removeHTML=True,
    charsToRemove = r'\?|\.|\!|\;|\.|\"|\,|\(|\)|\&|\:|\|[0-9]|--| [ ] ',
    removeNumbers=True,
    removeLineBreaks=False,
    specialCharsToRemove = r'[^\x00-\xfd]',
    convertToLower=True,
    removeConsecutiveSpaces=True,
):
    """Clean a raw text string through a fixed sequence of optional steps.

    The steps run in this order: strip HTML, blank out ``charsToRemove``,
    blank out digit runs, flatten line breaks, blank out
    ``specialCharsToRemove``, lower-case, collapse repeated spaces.

    Parameters
    ----------
    rawText : object
        Text to clean. Anything that is not a ``str`` (e.g. NaN) is returned
        unchanged.
    removeHTML : bool
        Extract the plain text with BeautifulSoup's ``html.parser``.
    charsToRemove : str or None
        Regular expression; every match is replaced by one space.
        ``''`` or ``None`` disables the step.
    removeNumbers : bool
        Replace every run of digits with one space.
    removeLineBreaks : bool
        Replace newlines with a space and drop carriage returns.
    specialCharsToRemove : str or None
        Regular expression; every match is replaced by one space.
        ``''`` or ``None`` disables the step.
    convertToLower : bool
        Lower-case the text.
    removeConsecutiveSpaces : bool
        Collapse runs of spaces into a single space (the result is not stripped).

    Returns
    -------
    str or object
        The cleaned text, or ``rawText`` itself when it is not a string.
    """
    # isinstance (rather than an exact type check) also accepts str subclasses
    if not isinstance(rawText, str):
        return rawText
    procText = rawText

    # Remove HTML
    if removeHTML:
        procText = BeautifulSoup(procText,'html.parser').get_text()

    # Remove punctuation and other special characters
    # (truthiness check: both '' and None switch the step off)
    if charsToRemove:
        procText = re.sub(charsToRemove,' ',procText)

    # Remove numbers
    if removeNumbers:
        procText = re.sub(r'\d+',' ',procText)

    # Remove line breaks
    if removeLineBreaks:
        procText = procText.replace('\n',' ').replace('\r', '')

    # Remove special characters
    if specialCharsToRemove:
        procText = re.sub(specialCharsToRemove,' ',procText)

    # Normalize to lower case
    if convertToLower:
        procText = procText.lower()

    # Replace multiple consecutive spaces with just one space
    if removeConsecutiveSpaces:
        procText = re.sub(' +', ' ', procText)

    return procText

In [4]:
# Tokenize words
def tokenize_words(words):
    """Split a text into word tokens with ``word_tokenize``.

    Parameters
    ----------
    words : object
        Text to tokenize.

    Returns
    -------
    list of str or float
        The token list, or ``np.nan`` when ``words`` is not a string or
        yields no tokens at all (e.g. an empty string).
    """
    if not isinstance(words, str):
        return np.nan
    # Tokenize once. The previous check compared the token list with '' —
    # always False — so empty text never reached the NaN branch.
    tokens = word_tokenize(words)
    return tokens if tokens else np.nan

In [5]:
# Function to create text from words
def recreateText(words):
    """Join a list of tokens back into a single space-separated string.

    Returns ``np.nan`` for any input whose type is not exactly ``list``.
    """
    # Guard clause: only plain lists are joined
    if type(words) is not list:
        return np.nan
    return ' '.join(words)

### Analysis

In [6]:
# Create a dataframe holding only the pre-processed article text.
# charsToRemove='' and removeNumbers=False switch off the punctuation and digit removal steps.
dsprocessedText = pd.DataFrame(
    data=ds['text'].apply(textPreProcess, charsToRemove='', removeNumbers=False).values,
    index=ds.index,
    columns=['PreProcessedText'],
)

In [7]:
# Preview the first rows of the pre-processed text
dsprocessedText.head()

Unnamed: 0,PreProcessedText
0,us ambassador to russia lynne tracy visited pa...
1,the tight ring of security that surrounds the ...
2,"at first glance, it looks like a sci-fi movie...."
3,russia unleashed its worst attacks on kyiv in ...
4,thousands of people are planning to line the s...


In [8]:
# Remove rows with empty or missing text
# Built as one chain so that re-running the cell gives the same result.
# Missing values are dropped as well: they pass a plain `!= ''` test but cannot be
# concatenated with strings later on.
dsprocessedText = (
    dsprocessedText
    .assign(PreProcessedText=lambda df: df.PreProcessedText.str.strip())
    .loc[lambda df: df.PreProcessedText.notna() & (df.PreProcessedText != '')]
)

In [9]:
# Find specific terms
# The terms are joined with '|' into one pattern, so each term is interpreted as a regular expression.
termsToSearch = ['propaganda']
searchList =  re.compile('|'.join(termsToSearch))

# Get words from sentences
listOfWords =  dsprocessedText.PreProcessedText.apply(tokenize_words)

# Reconstruct the texts from their tokens (tokens separated by single spaces)
ppText = listOfWords.apply(recreateText)

# Look for search terms in the texts and keep the ones that match.
# Each text is padded with a leading and trailing space before searching, so a term
# written with surrounding spaces can also match at the very start or end.
# Non-string entries (recreateText returns NaN for them) are skipped instead of
# raising a TypeError on the string concatenation.
ppText_searched = [
    review
    for review in ppText
    if isinstance(review, str) and searchList.search(' ' + review + ' ')
]

In [10]:
# Show the first three matching texts
# (alternative search to try in the previous cell: termsToSearch = ['laboratory','biological'])
ppText_searched[0:3]

["at first glance , it looks like a sci-fi movie . what appear to be two drones , streaking across the night sky in moscow , head straight for the kremlin , on target to hit the historic senate palace , the official residence of vladimir putin . suddenly , just as one passes the russian flag flying atop the building , it explodes , raining fiery shards down on the roof . the video first appeared in the early hours of wednesday on russian social media . the kremlin was slow to react , eventually releasing a statement calling it a `` planned terrorist attack , '' a deliberate attempt by ukraine to assassinate putin , but presenting no evidence . the president was not injured , the kremlin stressed , threatening that `` russia reserves the right to take countermeasures , wherever and whenever it deems appropriate . '' the denial from ukraine 's president volodymyr zelensky was swift : `` we do n't attack putin or moscow ; we fight on our territory . '' a former senior us diplomat called t

In [17]:
# RAKE method - in English
# Create the RAKE keyword extractor, configured with language='english'
r = Rake(language='english')

In [20]:
# Keywords extraction over all pre-processed texts
# The whole PreProcessedText column is passed in a single call, so one combined
# ranking is produced (not one ranking per text).
r.extract_keywords_from_sentences(dsprocessedText['PreProcessedText'])

# NOTE: this result is not good — the top-ranked phrases are very long runs of names
# and titles rather than usable keywords.
# TODO(review): presumably these runs contain nothing RAKE treats as a phrase
# boundary; consider limiting phrase length (e.g. Rake(max_length=...)) — confirm.
# Show only the top of the ranking instead of dumping every phrase.
r.get_ranked_phrases()[:10]

['aeroflot aircompany ikar alrosa air company aurora airlines aviastartu iraero airlines izhavia nordstar airlines nord wind pobeda airlines rossiya airlines rusjet rusline siberia airlines skol airlines smartavia airlines ural airlines utar aviation uvt aero yakutia yamal airlines',
 'canadian prime minister justin trudeau european commission president ursula von der leyen european council president charles michel french president emmanuel macron german chancellor olaf scholz italian prime minister mario draghi japanese prime minister fumio kishida nato secretary general jens stoltenberg polish president andrzej duda romanian president klaus iohannis uk prime minister boris johnson',
 'us president joe biden german chancellor olaf scholz canadian prime minister justin trudeau french president emmanuel macron italian prime minister mario draghi japanese prime minister kishida fumio uk prime minister boris johnson european commission president ursula von der leyen european council presi