# Named Entity Recognition
Based on: Social Media Analytics – Introduction to Text Mining – Named Entity Recognition

by (c) Nuno Antonio 2019-2021


### Initial setup

In [18]:
# Import packages
import csv
import pandas as pd
import numpy as np
import nltk 
import re
from bs4 import BeautifulSoup
import spacy
from spacy import displacy
from collections import Counter
from emot.emo_unicode import UNICODE_EMO, EMOTICONS
import emoji

In [13]:
# Read the cleaned tweets workbook from the data folder
base_path = "Data/"
ds = pd.read_excel(f"{base_path}Tweets_cleaned.xlsx")

### Functions

In [19]:
# Replace emojis and smileys

# Converting emojis to words
# Using both emot and emoji package to cover missing emojis
def convert_emojis(text):
    """Replace emojis found in ``text`` with word descriptions.

    Two passes are made so that emojis missing from one package can still be
    handled by the other:

    1. every key of ``UNICODE_EMO`` (emot package) occurring in the text is
       replaced by its description, with commas and colons stripped and the
       remaining words joined by underscores;
    2. the result is passed through ``emoji.demojize`` with empty delimiters.

    Parameters
    ----------
    text : str
        Text to convert.

    Returns
    -------
    str
        The text after both replacement passes.
    """
    # from https://towardsdatascience.com/text-preprocessing-for-data-scientist-3d2419c8199d
    for emot, description in UNICODE_EMO.items():
        words = description.replace(",", "").replace(":", "").split()
        text = text.replace(emot, "_".join(words))

    # Strings are immutable: demojize returns a new string, so its result has
    # to be assigned back (previously it was computed and thrown away).
    text = emoji.demojize(text, delimiters=("", ""))
    return text
# Converting emoticons to words   
# from https://towardsdatascience.com/text-preprocessing-for-data-scientist-3d2419c8199d 
def convert_emoticons(text):
    """Replace emoticons found in ``text`` with word descriptions.

    Each key of ``EMOTICONS`` is used as a regular-expression fragment; every
    match is substituted by the corresponding description with commas removed
    and its words joined by underscores.

    Parameters
    ----------
    text : str
        Text to convert.

    Returns
    -------
    str
        The text with all matched emoticons replaced.
    """
    for pattern, description in EMOTICONS.items():
        words = description.replace(",", "").split()
        text = re.sub(u'(' + pattern + ')', "_".join(words), text)
    return text

def remove_emoji(text):
    """Return ``text`` with every occurrence of a ``UNICODE_EMO`` key deleted.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        The text without the emojis listed in ``UNICODE_EMO``.
    """
    cleaned = text
    for symbol in UNICODE_EMO:
        cleaned = cleaned.replace(symbol, "")
    return cleaned

def remove_emoticon(text):
    """Return ``text`` with every match of an ``EMOTICONS`` key deleted.

    Each key of ``EMOTICONS`` is used as a regular-expression fragment and
    all of its matches are replaced by the empty string.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        The text without the matched emoticons.
    """
    cleaned = text
    for pattern in EMOTICONS:
        cleaned = re.sub(u'(' + pattern + ')', "", cleaned)
    return cleaned



In [20]:
# Text preprocessing
def textPreProcess(rawText, removeHTML=True, charsToRemove = r'\?|\.|\!|\;|\.|\"|\,|\(|\)|\&|\:|\-', removeNumbers=True, removeLineBreaks=False, specialCharsToRemove = r'[^\x00-\xfd]', convertToLower=True, removeConsecutiveSpaces=True, convert_emojis=False, remove_emojis = False):
    """Clean every text in ``rawText`` and return the cleaned texts as a list.

    The steps below are applied to each item, in this order, when enabled.

    Parameters
    ----------
    rawText : iterable of str
        Texts to clean (for example a list or a pandas Series).
    removeHTML : bool
        Parse the text with BeautifulSoup and keep only its text content.
    charsToRemove : str
        Regular expression; every match is replaced by a space. An empty
        value skips this step.
    removeNumbers : bool
        Replace every run of digits by a space.
    removeLineBreaks : bool
        Replace line feeds by a space and delete carriage returns.
    specialCharsToRemove : str
        Regular expression; every match is replaced by a space. An empty
        value skips this step.
    convertToLower : bool
        Lower-case the text.
    removeConsecutiveSpaces : bool
        Collapse runs of spaces into a single space.
    convert_emojis : bool
        Replace emojis and emoticons by word descriptions.
    remove_emojis : bool
        Delete emojis and emoticons.

    Returns
    -------
    list of str
        One cleaned text per input item, in input order. Items that end up
        empty are kept, so the result stays aligned with the input.
    """
    # The ``convert_emojis`` flag hides the module-level helper of the same
    # name inside this function; calling the flag would raise
    # "TypeError: 'bool' object is not callable". Fetch the helper from the
    # module namespace instead, and only when it is actually needed.
    emojisToWords = globals()['convert_emojis'] if convert_emojis else None

    cleanedText = []
    for x in rawText:
        # Always start from the current item, so that skipping the HTML step
        # neither fails nor silently reuses the previous item's text.
        procText = x

        # Remove HTML
        if removeHTML:
            procText = BeautifulSoup(procText, 'html.parser').get_text()

        # Replace emojis and emoticons by words
        if convert_emojis:
            procText = emojisToWords(procText)
            procText = convert_emoticons(procText)

        # Delete emojis and emoticons
        if remove_emojis:
            procText = remove_emoji(procText)
            procText = remove_emoticon(procText)

        # Remove punctuation and other special characters
        if charsToRemove:
            procText = re.sub(charsToRemove, ' ', procText)

        # Remove numbers
        if removeNumbers:
            procText = re.sub(r'\d+', ' ', procText)

        # Remove line breaks
        if removeLineBreaks:
            procText = procText.replace('\n', ' ').replace('\r', '')

        # Remove special characters
        if specialCharsToRemove:
            procText = re.sub(specialCharsToRemove, ' ', procText)

        # Normalize to lower case
        if convertToLower:
            procText = procText.lower()

        # Replace multiple consecutive spaces with just one space
        if removeConsecutiveSpaces:
            procText = re.sub(' +', ' ', procText)

        # Always append (even an empty result) to stay aligned with the input
        cleanedText.append(procText)

    return cleanedText

### Analysis

In [21]:
# Pre-process the tweet texts (HTML and emojis removed, line breaks flattened;
# punctuation, digits and letter case are kept) and store the result in a
# one-column dataframe that shares the index of the dataset
ppText = textPreProcess(
    ds.text,
    charsToRemove='',
    removeLineBreaks=True,
    removeNumbers=False,
    convertToLower=False,
    remove_emojis=True,
)
processedTweets = pd.DataFrame({'PreProcessedText': ppText}, index=ds.index)

In [22]:
# Download the small English spaCy model only when it is not installed yet,
# so that re-running the notebook does not hit the network every time
if not spacy.util.is_package("en_core_web_sm"):
    spacy.cli.download("en_core_web_sm")

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [23]:
# Instantiate the English spaCy pipeline used for the entity recognition below
model_name = "en_core_web_sm"
nlp = spacy.load(model_name)

In [24]:
# Inspect the entities recognised in the first pre-processed tweet
first_tweet = processedTweets['PreProcessedText'][0]
print(first_tweet)
doc = nlp(first_tweet)
print([(ent.text, ent.label_) for ent in doc.ents])

Doesn't vaccine approval typically take 6 and months? Just asking as I'm not too familiar with their process.
[('6 and months', 'DATE')]


In [25]:
# Tally how many entities of each type were found in the sample tweet
labels = [ent.label_ for ent in doc.ents]
Counter(labels)

Counter({'DATE': 1})

In [26]:
# Show the 3 most frequent entity texts in the sample tweet
# (despite the variable name, these are the entity strings, not their labels)
top_labels = [ent.text for ent in doc.ents]
Counter(top_labels).most_common(3)

[('6 and months', 1)]

In [27]:
# Render the sample tweet inline with its recognised entities highlighted
displacy.render(doc, style='ent', jupyter=True)

In [28]:
# Collect, for every tweet, the entities that spaCy labels as PERSON, NORP, ORG
# or GPE and store them in new columns of the dataset. Several entities of the
# same type in one tweet are joined with ";"; a tweet without any entity of a
# given type gets an empty string in that column.
label_to_column = {'PERSON': 'Person', 'NORP': 'Norp', 'ORG': 'Org', 'GPE': 'Gpe'}
entities = {label: [] for label in label_to_column}

# nlp.pipe processes the texts as a stream, in batches, instead of running the
# pipeline on one tweet at a time
for doc in nlp.pipe(processedTweets['PreProcessedText']):
    found = {label: [] for label in label_to_column}
    for e in doc.ents:
        if e.label_ in found:
            found[e.label_].append(e.text)
    for label, texts in found.items():
        entities[label].append(";".join(texts))

# Keep the per-type lists available under their original names
persons = entities['PERSON']
norps = entities['NORP']
orgs = entities['ORG']
gpes = entities['GPE']

# Add one column per entity type (Person, Norp, Org, Gpe, in that order)
for label, column in label_to_column.items():
    ds[column] = entities[label]


In [29]:
# Display the dataset, which now includes the Person, Norp, Org and Gpe columns
ds

Unnamed: 0,text,user screen name,user followers,url,created at,replies,retweets,likes,mention,hashtag,Person,Norp,Org,Gpe
0,Doesn't vaccine approval typically take 6 and ...,spadesgeek,19,https://mobile.twitter.com,2021-06-10 19:24:26.999999,0,0,0,"MatthewDavidH,EricTopol,TheEconomist,US_FDA,la...",,,,,
1,"You're right, too bad this vaccine doesn't hav...",selirodz,78,http://twitter.com/download/android,2021-06-10 19:24:26.000000,0,0,0,"DharkArk,JoeBiden",,,,,
2,The vaccine is free…❤️,FlowerGirlBaker,1486,http://twitter.com/#!/download/ipad,2021-06-10 19:23:57.000000,0,0,0,"theredshift11,POTUS,studentsfordemo",,,,,
3,. the world needs COVID19 vaccine access ...,DevizesGreens,150,https://mobile.twitter.com,2021-06-10 19:23:49.000000,0,0,0,"BorisJohnson,JustinTrudeau,POTUS,EUCouncil,Reg...","COVID19,G7,EndThePandemic,COVAX",,,,G7
4,But you can get it.\nI remember getting my Mum...,canfixstoopid,355,https://mobile.twitter.com,2021-06-10 19:23:42.000000,0,0,0,"SerendipityOr,Shockwave_Shaun,Ozymandiyaas,Joe...",,Mumps,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22529,The country need to check on people who take o...,LSungoun,90,http://twitter.com/download/android,2021-05-16 04:11:06.000000,0,0,0,"POTUS,HillaryClinton,Jaemyung_Lee",,Joe Biden;Hillary Clinton;Dogisa Jaemyung_Lee,,,
22530,Why can you not go maskless with out vaccine? ...,bcgov115,38,http://twitter.com/download/iphone,2021-05-16 04:11:01.000000,0,0,0,POTUS,,,,,
22531,"You say the immunocomprimised ""should discuss ...",dmdmdtweet,1,http://twitter.com/download/android,2021-05-16 04:08:22.999999,0,0,1,CDCgov,,,,,
22532,"Well, Kate, not everyone is vaccinated and the...",TuffCrusherPlus,83,http://twitter.com/download/iphone,2021-05-16 04:08:03.000000,0,0,0,"50treeK8,OregonGovBrown,CDCgov",,Kate,,,


In [30]:
# Save the dataset to a new workbook, without writing the dataframe index
ds.to_excel(f"{base_path}Tweets_NER.xlsx", index=False)