##### Social Media Analytics
### Introduction to Text Mining
## Named Entity Recognition
(c) Nuno Antonio 2019-2022 v1.02

### Initial setup

In [2]:
import warnings
warnings.filterwarnings("ignore")
import nltk
nltk.download('punkt')
  

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [8]:
!pip install fastparquet

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
from tqdm import tqdm
from nltk.tokenize import sent_tokenize

In [5]:
# Import packages
import csv
import re
from collections import Counter
import numpy
import nltk
import numpy as np
import pandas as pd
import spacy
from bs4 import BeautifulSoup
from spacy import displacy

In [9]:
ds = pd.read_parquet("DailyMail.parquet (2).snappy", engine="fastparquet")

### Functions

In [10]:
# Text preprocessing
def textPreProcess(
    rawText,
    removeHTML=True,
    charsToRemove=r"\?|\.|\!|\;|\.|\"|\,|\(|\)|\&|\:|\-",
    removeNumbers=True,
    removeLineBreaks=False,
    specialCharsToRemove=r"[^\x00-\xfd]",
    convertToLower=True,
    removeConsecutiveSpaces=True,
):
    if type(rawText) != str:
        return rawText
    procText = rawText

    # Remove HTML
    if removeHTML:
        procText = BeautifulSoup(procText, "html.parser").get_text()

    # Remove punctuation and other special characters
    if len(charsToRemove) > 0:
        procText = re.sub(charsToRemove, " ", procText)

    # Remove numbers
    if removeNumbers:
        procText = re.sub(r"\d+", " ", procText)

    # Remove line breaks
    if removeLineBreaks:
        procText = procText.replace("\n", " ").replace("\r", "")

    # Remove special characters
    if len(specialCharsToRemove) > 0:
        procText = re.sub(specialCharsToRemove, " ", procText)

    # Normalize to lower case
    # Since we want to find the persons in the articles we don' want to normalize to lower case

    # Replace multiple consecutive spaces with just one space
    if removeConsecutiveSpaces:
        procText = re.sub(" +", " ", procText)

    return procText

### Analysis

In [11]:
# Create a dataframe with only the description
processedReviews = pd.DataFrame(
    data=ds.text.apply(
        textPreProcess, charsToRemove="", removeLineBreaks=False, removeNumbers=False
    ).values,
    index=ds.index,
    columns=["PreProcessedText"],
)

In [12]:
# Remove rows with empty text
processedReviews.PreProcessedText = processedReviews.PreProcessedText.str.strip()
processedReviews = processedReviews[processedReviews.PreProcessedText != ""]

In [13]:
processedReviews.head()

Unnamed: 0,PreProcessedText
0,A wave of looting has reportedly hit a border ...
1,"In the dawn hours of May 3, Moscovites awoke t..."
2,Visitors to St. Peter's Basilica got an unexpe...
3,Russian security services have accused Apple o...
4,A foster mum who rescued orphans from war-torn...


In [14]:
sentences = []
for review in processedReviews["PreProcessedText"]:
      sentences = sentences + nltk.tokenize.sent_tokenize(review)

In [15]:
# Load Spacy English model
nlp = spacy.load("en_core_web_sm")

In [16]:
#creating a dataframe with each article and the persons mentioned
persons_df = pd.DataFrame({"id": [], "persons": []})

for i, sentence in enumerate(tqdm(sentences)):
  doc = nlp(sentence)
  entities = [(X.text, X.label_) for X in doc.ents]
  persons = []
  for entity in entities:
    if entity[1] == 'PERSON':
      persons.append(entity[0])

  persons_df.loc[i] = [i, persons]

100%|██████████| 28603/28603 [07:35<00:00, 62.81it/s]


In [17]:
persons_df

Unnamed: 0,id,persons
0,0,[Vladimir Putin's]
1,1,[Putin]
2,2,[]
3,3,[]
4,4,[]
...,...,...
28598,28598,[]
28599,28599,[]
28600,28600,[Putin]
28601,28601,[Putin]


In [18]:
#cleaning the dataframe
persons_df_clean = pd.DataFrame({"id": [], "persons": []})

#taking just the last name from all the entities that has both first name and second name
for i in tqdm(range(len(persons_df))):
  persons = persons_df.loc[i][1]
  cleaned_persons = []
  for person in persons:
    person = person.split(' ')[-1]
    cleaned_persons.append(person)
  
  persons_df_clean.loc[i] = [i, cleaned_persons]



100%|██████████| 28603/28603 [00:51<00:00, 552.98it/s]


In [19]:
final_persons_df = pd.DataFrame({"id": [], "persons": []})

for i in range(len(persons_df_clean)):
   final_persons_df.loc[i] = [i, list(set(persons_df_clean.loc[i][1]))]




In [20]:
persons_df_clean

Unnamed: 0,id,persons
0,0,[Putin's]
1,1,[Putin]
2,2,[]
3,3,[]
4,4,[]
...,...,...
28598,28598,[]
28599,28599,[]
28600,28600,[Putin]
28601,28601,[Putin]


In [21]:
final_persons_df
edge_dataframe =  pd.DataFrame({"Source": [], "Target": []})

for i in tqdm(range(len(final_persons_df))):
  persons = final_persons_df.loc[i][1]
  for j in range(len(persons)-1):
    for k in range(j+1,len(persons)):
      values_to_add = {"Source": persons[j], "Target": persons[k]}
      row_to_add = pd.Series(values_to_add)
      edge_dataframe = edge_dataframe.append(row_to_add,ignore_index=True) 

100%|██████████| 28603/28603 [00:23<00:00, 1202.07it/s]


In [22]:
for i, row in edge_dataframe.iterrows():
  if row[0] == row[1]:
    edge_dataframe = edge_dataframe.drop(i)


In [23]:
edge_dataframe.head(50)

Unnamed: 0,Source,Target
0,Tavolzhanka,Gladkov
1,Solovyov,Putin
2,Gladkov,Belgorod
3,Gladkov,Demidov
4,Gladkov,Belgorod
5,Demidov,Belgorod
6,Coffey,Putin
7,Putin,Great
8,Kyiv,Rogers
9,Kyiv,Putin


In [24]:
#function to check if there is a number in a string
def num_there(s):
    return any(i.isdigit() for i in s)

In [25]:
for i, row in edge_dataframe.iterrows():

  if row[0] == row[1]:
    edge_dataframe = edge_dataframe.drop(i)

  elif row[0] == row[1]:
    edge_dataframe = edge_dataframe.drop(i)

  elif num_there(row[0]) == True or num_there(row[1]) == True:
      edge_dataframe = edge_dataframe.drop(i)

  elif len(row[0]) == 1 or len(row[1]) == 1 :
    edge_dataframe = edge_dataframe.drop(i)


  elif '.' in row[0] or '.' in row[1]:
    edge_dataframe = edge_dataframe.drop(i)

  elif '.' in row[0] or '.' in row[1]:
    edge_dataframe = edge_dataframe.drop(i)

  elif row[0] == 'Kyiv' or row[1] == 'Kyiv':
    edge_dataframe = edge_dataframe.drop(i)

  elif 'Peter' in row[0] or 'Peter' in row[1]:
    edge_dataframe = edge_dataframe.drop(i)

  elif 'Twitter' in row[0] or 'Twitter' in row[1]:
    edge_dataframe = edge_dataframe.drop(i)

  elif '?' in row[0] or '?' in row[1]:
    edge_dataframe = edge_dataframe.drop(i)

  elif '"' in row[0] or '"' in row[1]:
    edge_dataframe = edge_dataframe.drop(i)


In [26]:
edge_dataframe.head(10)

Unnamed: 0,Source,Target
0,Tavolzhanka,Gladkov
1,Solovyov,Putin
2,Gladkov,Belgorod
3,Gladkov,Demidov
4,Gladkov,Belgorod
5,Demidov,Belgorod
6,Coffey,Putin
7,Putin,Great
10,Rogers,Putin
16,Baldachin,Giampaoli


In [27]:
edges_DailyMail =  pd.DataFrame({"Source": [], "Target": []})

for i in tqdm(range(len(edge_dataframe))):

  person1 = edge_dataframe.iloc[i][0]
  person2 = edge_dataframe.iloc[i][1]

  person1 = person1.split('-')[-1]
  person2 = person2.split('-')[-1]

  if "'" in person1:
    person1 = person1.split("'")[0]
  
  if "'" in person2:
    person2 = person2.split("'")[0]
    

  edges_DailyMail.loc[i] = [person1, person2]


100%|██████████| 13185/13185 [00:32<00:00, 401.55it/s]


In [None]:
edges_DailyMail.iloc[515]

In [None]:
edges_DailyMail.to_csv('edges_DailyMail.csv',index = False)