# Keywords extraction 

Based on: Social Media Analytics – Introduction to Text Mining – Keywords extraction (using RAKE method)

by (c) Nuno Antonio 2019-2021



### Initial setup

In [14]:
# Import packages
import csv
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
import re
from bs4 import BeautifulSoup
from rake_nltk import Rake
from emot.emo_unicode import UNICODE_EMO, EMOTICONS
import emoji

In [15]:
# Load dataset
# base_path is the data folder prefix (note the trailing slash)
base_path = "Data/"
tweets_file = base_path + "Tweets_cleaned.xlsx"
ds = pd.read_excel(tweets_file)

In [16]:
# Replace emojis and smileys

# Converting emojis to words
# Using both emot and emoji package to cover missing emojis
def convert_emojis(text):
    """Replace emojis in ``text`` with underscore-joined word labels.

    Two passes are made so that an emoji missing from one package can still
    be handled by the other:

    1. Every key of ``UNICODE_EMO`` (from ``emot``) found in ``text`` is
       replaced by its description, with commas and colons removed and
       whitespace-separated words joined by underscores.
    2. The result is passed through ``emoji.demojize`` with empty delimiters.

    Adapted from
    https://towardsdatascience.com/text-preprocessing-for-data-scientist-3d2419c8199d

    Args:
        text: String to convert.

    Returns:
        The converted string.
    """
    for symbol, description in UNICODE_EMO.items():
        label = "_".join(description.replace(",", "").replace(":", "").split())
        text = text.replace(symbol, label)

    # Strings are immutable: demojize returns a new string, so its result has
    # to be assigned back (it used to be discarded, making this pass a no-op).
    text = emoji.demojize(text, delimiters=("", ""))
    return text
# Converting emoticons to words   
# from https://towardsdatascience.com/text-preprocessing-for-data-scientist-3d2419c8199d 
def convert_emoticons(text):
    """Replace text emoticons in ``text`` with underscore-joined word labels.

    Each key of ``EMOTICONS`` is used as a regular-expression pattern (wrapped
    in a capture group); every match is substituted by the corresponding
    description with commas removed and words joined by underscores.

    Args:
        text: String to convert.

    Returns:
        The converted string.
    """
    for pattern, description in EMOTICONS.items():
        label = "_".join(description.replace(",", "").split())
        text = re.sub("(" + pattern + ")", label, text)
    return text

def remove_emoji(text):
    """Return ``text`` with every emoji listed in ``UNICODE_EMO`` deleted."""
    cleaned = text
    for symbol in UNICODE_EMO.keys():
        cleaned = cleaned.replace(symbol, "")
    return cleaned

def remove_emoticon(text):
    """Return ``text`` with every match of an ``EMOTICONS`` pattern deleted.

    The keys of ``EMOTICONS`` are used as regular-expression patterns.
    """
    cleaned = text
    for pattern in EMOTICONS.keys():
        cleaned = re.compile("(" + pattern + ")").sub("", cleaned)
    return cleaned

In [17]:
# Text preprocessing
def _emojis_to_words(text):
    """Convert emojis, then emoticons, in ``text`` to word labels.

    Kept at module level because ``textPreProcess`` has a boolean parameter
    named ``convert_emojis`` that hides the ``convert_emojis`` function inside
    its own body.
    """
    return convert_emoticons(convert_emojis(text))


def textPreProcess(rawText, removeHTML=True, charsToRemove = r'\?|\.|\!|\;|\.|\"|\,|\(|\)|\&|\:|\-', removeNumbers=True, removeLineBreaks=False, specialCharsToRemove = r'[^\x00-\xfd]', convertToLower=True, removeConsecutiveSpaces=True, convert_emojis=False, remove_emojis = False):
    """Clean a collection of raw texts and return them as a list of strings.

    The steps below run in this order; each one is optional:

    1. strip HTML markup (``removeHTML``);
    2. turn emojis/emoticons into words (``convert_emojis``);
    3. delete emojis/emoticons (``remove_emojis``);
    4. replace matches of ``charsToRemove`` with a space;
    5. replace digit runs with a space (``removeNumbers``);
    6. replace ``\\n`` with a space and drop ``\\r`` (``removeLineBreaks``);
    7. replace matches of ``specialCharsToRemove`` with a space;
    8. lower-case (``convertToLower``);
    9. collapse runs of spaces (``removeConsecutiveSpaces``).

    Args:
        rawText: Iterable of texts. Items that are not ``str`` are reported
            on stdout and converted with ``str()``.
        removeHTML: Parse each text with BeautifulSoup and keep only the text.
        charsToRemove: Regex of characters to blank out; empty/None disables.
        removeNumbers: Blank out digit runs.
        removeLineBreaks: Flatten line breaks.
        specialCharsToRemove: Regex of further characters to blank out;
            empty/None disables.
        convertToLower: Lower-case the text.
        removeConsecutiveSpaces: Collapse consecutive spaces into one.
        convert_emojis: Replace emojis and emoticons by word labels.
        remove_emojis: Delete emojis and emoticons.

    Returns:
        A list with exactly one cleaned string per input item, in input order
        (empty results are kept so the list stays aligned with the input).
    """
    cleanedText = []
    for x in rawText:
        if not isinstance(x, str):
            print("Type: ", str(type(x)))
            x = str(x)

        # Start from the raw text so that every later step has a defined
        # input even when removeHTML is False.
        procText = x

        # Remove HTML
        if removeHTML:
            procText = BeautifulSoup(procText, 'html.parser').get_text()

        # The flag `convert_emojis` shadows the function of the same name,
        # so the conversion goes through the module-level helper.
        if convert_emojis:
            procText = _emojis_to_words(procText)

        if remove_emojis:
            procText = remove_emoji(procText)
            procText = remove_emoticon(procText)

        # Remove punctuation and other special characters
        if charsToRemove:
            procText = re.sub(charsToRemove, ' ', procText)

        # Remove numbers
        if removeNumbers:
            procText = re.sub(r'\d+', ' ', procText)

        # Remove line breaks
        if removeLineBreaks:
            procText = procText.replace('\n', ' ').replace('\r', '')

        # Remove special characters
        if specialCharsToRemove:
            procText = re.sub(specialCharsToRemove, ' ', procText)

        # Normalize to lower case
        if convertToLower:
            procText = procText.lower()

        # Replace multiple consecutive spaces with just one space
        if removeConsecutiveSpaces:
            procText = re.sub(' +', ' ', procText)

        # Always append (even empty strings) so the output keeps one entry
        # per input item
        cleanedText.append(procText)

    return cleanedText

In [18]:
# Tokenize texts
def tokenize_words(texts):
    """Tokenize each text into a list of word tokens.

    Args:
        texts: Iterable of strings.

    Returns:
        A list with one token list per input text, in input order. A text
        that produces no tokens contributes an empty list, so the output
        stays aligned with the input.
    """
    # The former `w_token != ''` guard compared a list with a string and was
    # therefore always true; every token list was (and still is) appended.
    return [word_tokenize(text) for text in texts]

In [19]:
# Function to lemmatize words
def lemmatize(words):
    """Lemmatize every token of every tokenized text.

    Args:
        words: Sliceable sequence of sliceable token sequences.

    Returns:
        A list of lists with the lemmatized tokens, same shape as the input.
    """
    wnl = WordNetLemmatizer()
    return [[wnl.lemmatize(token) for token in tokens[:]] for tokens in words[:]]

In [20]:
# Function to remove stop words
def removeStopWords(texts, stop_words):
    """Drop stop words from every tokenized text.

    Args:
        texts: Sliceable sequence of sliceable token sequences.
        stop_words: Container tested with ``in`` for each token.

    Returns:
        A list of token lists with the tokens found in ``stop_words`` removed.
    """
    return [
        [token for token in tokens[:] if token not in stop_words]
        for tokens in texts[:]
    ]

In [21]:
# Function to recreate text from words
def recreateText(words):
    """Join each token list back into a single space-separated string.

    Args:
        words: Sliceable sequence of iterables of string tokens.

    Returns:
        A list with one joined string per token list.
    """
    return [' '.join(tokens) for tokens in words[:]]

### Analysis

In [22]:
# Create a dataframe with only the preprocessed tweet text
# (punctuation and numbers are kept; line breaks are flattened)
ppText = textPreProcess(
    ds.text,
    charsToRemove='',
    removeNumbers=False,
    removeLineBreaks=True,
)
processedTweets = pd.DataFrame(
    data=ppText,
    index=ds.index,
    columns=['PreProcessedText'],
)

Type:  <class 'int'>


In [23]:
# Trim surrounding whitespace, then remove rows with empty text
# (previously the text was only stripped and no row was ever removed)
stripped_text = processedTweets['PreProcessedText'].str.strip()
processedTweets = (
    processedTweets
    .assign(PreProcessedText=stripped_text)
    .loc[stripped_text != '']
)

In [24]:
# RAKE method - in English
# Only `language` is set explicitly; every other Rake option keeps its default.
# NOTE(review): presumably `language` selects the stop-word list used to split
# phrases - confirm against the rake_nltk documentation.
r = Rake(language='english')

In [25]:
# Keywords extraction: each preprocessed tweet is handed to RAKE as one sentence
tweet_sentences = processedTweets['PreProcessedText']
r.extract_keywords_from_sentences(tweet_sentences)
phrases = r.get_ranked_phrases()

In [26]:
# Preview the 20 first ranked phrases
phrases[:20]

['paraguay needs vaccine !!!! paraguay needs vaccine !!!! paraguay needs vaccine !!!! paraguay needs vaccine !!!! paraguay needs vaccine !!!! paraguay needs vaccine !!!! paraguay needs vaccine !!!! paraguay needs vaccine !!!! paraguay needs vaccine !!!! paraguay needs vaccine !!!!',
 'pastor thai jeremiah nguyen tom adsit trang adsit leann adsit tu trang nguyen excuses pastor phu',
 'phu pham thai nguyen chinh nguyen tom trang leann adsit jeremiah nguyen',
 'congressio har jagah badnam krne aa jata hi jese chacha wese',
 'america ga beynun kuran huhdha nudheehuri astrazenaca vaccine ge 60 million doze sending',
 'fatemi khalifatullah mowlana kareem shah al hussayni aga khan iv',
 'profit ... india hai ... bade dil walo ka desh hai',
 'souza interviews k sujatha rao == may 19th 2021 ==== =====',
 'pappua tha jo vaccine ke liy fund de rha tha govt',
 'dawai kare ya yaha par situation co trio mein lye',
 'country muted version 500 million doller withoutloan trapped give economy restoreand

In [87]:
# Save every ranked phrase, one per line.
# `with` closes the file even if a write fails; UTF-8 is set explicitly so the
# output does not depend on the platform's default encoding.
with open(base_path + "phrases.txt", "w", encoding="utf-8") as textfile:
    textfile.writelines(element + "\n" for element in phrases)

In [34]:
# Save the first 300 ranked phrases, one per line.
# `with` closes the file even if a write fails; UTF-8 is set explicitly so the
# output does not depend on the platform's default encoding.
phrases_300 = phrases[:300]
with open(base_path + "phrases300.txt", "w", encoding="utf-8") as textfile:
    textfile.writelines(element + "\n" for element in phrases_300)