##### Social Media Analytics
### Introduction to Text Mining
## Named Entity Recognition
(c) Nuno Antonio 2019-2022 v1.02

### Initial setup

In [1]:
# Import packages
import csv
import re
from collections import Counter

import nltk
import numpy as np
import pandas as pd
import spacy
from bs4 import BeautifulSoup
from spacy import displacy

In [2]:
# Load the source dataset from its Parquet file using the fastparquet engine
ds = pd.read_parquet(
    path="sputnik.parquet.snappy",
    engine="fastparquet",
)

### Functions

In [3]:
# Text preprocessing
def textPreProcess(
    rawText,
    removeHTML=True,
    charsToRemove=r"\?|\.|\!|\;|\.|\"|\,|\(|\)|\&|\:|\-",
    removeNumbers=True,
    removeLineBreaks=False,
    specialCharsToRemove=r"[^\x00-\xfd]",
    convertToLower=True,
    removeConsecutiveSpaces=True,
):
    """Clean a raw text string through a fixed sequence of optional steps.

    The steps run in this order: HTML stripping, punctuation removal, digit
    removal, line-break removal, special-character removal, lower-casing and
    collapsing of repeated spaces.

    Parameters
    ----------
    rawText : str or any
        Text to clean. Anything that is not a ``str`` instance (for example
        ``None`` or a float NaN) is returned unchanged.
    removeHTML : bool
        Parse the text with BeautifulSoup's ``html.parser`` and keep only the
        text content.
    charsToRemove : str or None
        Regular expression whose matches are replaced by a single space.
        An empty string or ``None`` skips this step.
    removeNumbers : bool
        Replace every run of digits with a single space.
    removeLineBreaks : bool
        Replace ``"\\n"`` with a space and delete ``"\\r"``.
    specialCharsToRemove : str or None
        Regular expression whose matches are replaced by a single space.
        An empty string or ``None`` skips this step.
    convertToLower : bool
        Lower-case the text.
    removeConsecutiveSpaces : bool
        Collapse runs of two or more space characters into one space.

    Returns
    -------
    str or any
        The cleaned text, or ``rawText`` itself when it is not a string.
    """
    # isinstance (rather than an exact type comparison) so that str subclasses
    # are cleaned as well instead of being silently passed through.
    if not isinstance(rawText, str):
        return rawText
    procText = rawText

    # Remove HTML
    if removeHTML:
        # NOTE(review): get_text() is called without a separator, so the text of
        # adjacent elements is joined directly; this may be why some sentences
        # look fused in the output ("ukraine.in footage") - confirm against the
        # raw data before changing.
        procText = BeautifulSoup(procText, "html.parser").get_text()

    # Remove punctuation and other special characters.
    # A truthiness check treats both "" and None as "skip this step"; the
    # previous len() check raised TypeError on None.
    if charsToRemove:
        procText = re.sub(charsToRemove, " ", procText)

    # Remove numbers
    if removeNumbers:
        procText = re.sub(r"\d+", " ", procText)

    # Remove line breaks
    if removeLineBreaks:
        procText = procText.replace("\n", " ").replace("\r", "")

    # Remove special characters (same None/"" handling as charsToRemove)
    if specialCharsToRemove:
        procText = re.sub(specialCharsToRemove, " ", procText)

    # Normalize to lower case
    if convertToLower:
        procText = procText.lower()

    # Replace multiple consecutive spaces with just one space
    if removeConsecutiveSpaces:
        procText = re.sub(" +", " ", procText)

    return procText

### Analysis

In [4]:
# Build a one-column dataframe with the cleaned version of ds.text, keeping the
# original index. Punctuation and digits are left in place; the remaining
# textPreProcess defaults (HTML stripping, lower-casing, ...) still apply.
# NOTE(review): the text is lower-cased before being passed to the NER model
# further down - confirm this is intended, since casing may affect entity detection.
cleanedText = ds.text.apply(
    textPreProcess,
    charsToRemove="",
    removeNumbers=False,
    removeLineBreaks=False,
)
processedReviews = pd.DataFrame(
    data=cleanedText.values,
    columns=["PreProcessedText"],
    index=ds.index,
)

In [5]:
# Remove rows with empty or missing text.
# .str.strip() yields NaN for missing / non-string entries, and NaN != "" is
# True, so a plain comparison against "" would keep those rows; they are
# dropped explicitly here because downstream cells pass every row to nlp().
strippedText = processedReviews["PreProcessedText"].str.strip()
hasText = strippedText.notna() & (strippedText != "")
processedReviews = processedReviews.assign(PreProcessedText=strippedText).loc[hasText]

In [8]:
# Preview the first rows of the preprocessed text
processedReviews.head()

Unnamed: 0,PreProcessedText
0,the russian ministry of defense (mod) has rele...
1,"""over the past day, the aviation carried out s..."
2,"""as a result of a strike on an echelon at a ra..."
3,the armed forces of ukraine failed in its effo...
4,"""over the past 24 hours, over 480 ukrainian se..."


In [6]:
# Load the spaCy English pipeline "en_core_web_sm"; `nlp` is reused by every
# entity-extraction cell below.
# NOTE(review): the model package presumably has to be installed separately - confirm
nlp = spacy.load("en_core_web_sm")

In [19]:
# Print the text stored under index label 0, then the (text, label) pair of
# every entity spaCy finds in it
reviewText = processedReviews["PreProcessedText"][0]
print(reviewText)
doc = nlp(reviewText)
print([(ent.text, ent.label_) for ent in doc.ents])

the russian ministry of defense (mod) has released a video showing the work of army sappers in the zone of moscow s special military operation in ukraine.in footage published on the mod s telegram page, the servicemen are seen performing a controlled explosion of the projectiles that had apparently been abandoned by ukrainian units and then detected by russian forces in an unspecified area.the mod quoted a demining platoon commander as saying that more than 1,000 shells have already been destroyed."sowing machinery will soon ride across these fields and life will return back to normal there," he added.
[('the russian ministry of defense (mod', 'ORG'), ('moscow', 'GPE'), ('russian', 'NORP'), ('more than 1,000', 'CARDINAL')]


In [10]:
# Print the text stored under index label 1, then the (text, label) pair of
# every entity spaCy finds in it
reviewText = processedReviews["PreProcessedText"][1]
print(reviewText)
doc = nlp(reviewText)
print([(ent.text, ent.label_) for ent in doc.ents])

"over the past day, the aviation carried out six combat sorties in this area. the group's artillery performed 64 fire tasks up to 330 ukrainian soldiers, one tank, four armored fighting vehicles, four cars, and the d-30 howitzer were destroyed in this direction," the defense ministry said. the defense department added that an ammunition depot of the ukrainian armed forces has been eliminated near the village of novoukrainka in the donetsk people's republic.
[('the past day', 'DATE'), ('six', 'CARDINAL'), ('64', 'CARDINAL'), ('up to', 'CARDINAL'), ('330', 'CARDINAL'), ('one', 'CARDINAL'), ('four', 'CARDINAL'), ('four', 'CARDINAL'), ('d-30', 'PERSON'), ('the defense department', 'ORG'), ('the ukrainian armed forces', 'ORG')]


In [11]:
# Print the text stored under index label 2, then the (text, label) pair of
# every entity spaCy finds in it
reviewText = processedReviews["PreProcessedText"][2]
print(reviewText)
doc = nlp(reviewText)
print([(ent.text, ent.label_) for ent in doc.ents])

"as a result of a strike on an echelon at a railway station near the town of kramatorsk of the donetsk people s republic, up to 200 tonnes of uaf s [ukrainian armed forces] ammunition were destroyed," the ministry said. western countries have been supplying kiev with various types of weapon systems, including air defense missiles, multiple launch rocket systems, tanks, self-propelled artillery and anti-aircraft guns, since russia launched its military operation in ukraine over a year ago. the kremlin has repeatedly warned against further arms deliveries to kiev. ukraine has been reportedly preparing to launch a counteroffensive against russia in the spring of this year, with a number of western officials expressing their willingness to help kiev.
[('up to', 'CARDINAL'), ('200 tonnes', 'QUANTITY'), ('russia', 'GPE'), ('a year ago', 'DATE'), ('kremlin', 'ORG'), ('kiev', 'GPE'), ('ukraine', 'GPE'), ('russia', 'GPE'), ('the spring of this year', 'DATE')]


In [12]:
# Tally how many entities of each label type appear in the current `doc`
labels = [entity.label_ for entity in doc.ents]
Counter(labels)

Counter({'CARDINAL': 1, 'QUANTITY': 1, 'GPE': 4, 'DATE': 2, 'ORG': 1})

In [13]:
# Show the 3 most frequent entity texts in the current `doc`
# (despite its name, `top_labels` holds the entity strings, not their label types)
top_labels = [entity.text for entity in doc.ents]
Counter(top_labels).most_common(3)

[('russia', 2), ('up to', 1), ('200 tonnes', 1)]

In [14]:
# Entities visualization: render the entities of the most recently parsed `doc`
# inline in the notebook using displaCy's "ent" style
displacy.render(doc, jupyter=True, style="ent")

In [15]:
# Example of filtering by entity type: collect texts in which spaCy tags at
# least one entity as LANGUAGE. The scan stops as soon as three such texts
# have been found, to keep the demonstration fast.
counter = 0  # number of matching texts found so far
annReviews = []
for review in processedReviews["PreProcessedText"]:
    doc = nlp(review)
    # One LANGUAGE entity is enough; each text is stored at most once
    if any(entity.label_ == "LANGUAGE" for entity in doc.ents):
        annReviews.append(review)
        counter += 1
    if counter >= 3:  # Stop after the first three matching texts
        break

annReviews

['russia is not likely to be behind the leak of the top secret us intelligence assessment of the situation in ukraine, since it would prefer to keep the information under wraps and use it to its advantage, a major chinese outlet speculates. if russia has obtained these classified documents, it would not post them online, because this will make russia lose the source or sources that had provided these documents, an anonymous chinese international security and intelligence expert told the global times. the leak is unlikely caused by russian intelligence agencies, because this does not make sense, the expert said.the source argued that there was no reason for russia to let its enemies know that it has obtained this intelligence, because this will also make its enemies change plans, making the hard-won military intelligence useless. instead, gt noted, the leak goes to show to the world the disunity, distrust and divergences between the us, its allies and kiev, and to demonstrate that washi