##### Social Media Analytics
### Introduction to Text Mining
## Named Entity Recognition
(c) Nuno Antonio 2019-2022 v1.02

### Initial setup

In [2]:
import warnings
warnings.filterwarnings("ignore")
import nltk
nltk.download('punkt')
!python -m spacy download it_core_news_sm

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
from tqdm import tqdm
from nltk.tokenize import sent_tokenize

In [39]:
# Import packages
import csv
import re
import numpy as np
from collections import Counter
import numpy
import nltk
import numpy as np
import pandas as pd
import spacy
from bs4 import BeautifulSoup
from spacy import displacy

In [7]:
ds = pd.read_parquet("falso-quotidiano.parquet.snappy", engine="fastparquet")

### Functions

In [8]:
# Text preprocessing
def textPreProcess(
    rawText,
    removeHTML=True,
    charsToRemove=r"\?|\.|\!|\;|\.|\"|\,|\(|\)|\&|\:|\-",
    removeNumbers=True,
    removeLineBreaks=False,
    specialCharsToRemove=r"[^\x00-\xfd]",
    convertToLower=True,
    removeConsecutiveSpaces=True,
):
    if type(rawText) != str:
        return rawText
    procText = rawText

    # Remove HTML
    if removeHTML:
        procText = BeautifulSoup(procText, "html.parser").get_text()

    # Remove punctuation and other special characters
    if len(charsToRemove) > 0:
        procText = re.sub(charsToRemove, " ", procText)

    # Remove numbers
    if removeNumbers:
        procText = re.sub(r"\d+", " ", procText)

    # Remove line breaks
    if removeLineBreaks:
        procText = procText.replace("\n", " ").replace("\r", "")

    # Remove special characters
    if len(specialCharsToRemove) > 0:
        procText = re.sub(specialCharsToRemove, " ", procText)

    # Normalize to lower case
    # Since we want to find the persons in the articles we don' want to normalize to lower case

    # Replace multiple consecutive spaces with just one space
    if removeConsecutiveSpaces:
        procText = re.sub(" +", " ", procText)

    return procText

### Analysis

In [9]:
# Create a dataframe with only the description
processedReviews = pd.DataFrame(
    data=ds.text.apply(
        textPreProcess, charsToRemove="", removeLineBreaks=False, removeNumbers=False
    ).values,
    index=ds.index,
    columns=["PreProcessedText"],
)

In [10]:
# Remove rows with empty text
processedReviews.PreProcessedText = processedReviews.PreProcessedText.str.strip()
processedReviews = processedReviews[processedReviews.PreProcessedText != ""]

In [11]:
processedReviews.head()

Unnamed: 0_level_0,PreProcessedText
index,Unnamed: 1_level_1
0,La controffensiva di Kiev? Deve ancora cominci...
1,"Da mesi, ormai, la guerra mostra un sostanzial..."
2,Occupare città russe per guadagnare vantaggio ...
3,Con le armi non si otterrà mai la sicurezza e ...
4,Una chiusura che certifica lo stallo. Le spera...


In [12]:
sentences = []
for review in processedReviews["PreProcessedText"]:

      sentences = sentences + nltk.tokenize.sent_tokenize(str(review))

In [14]:
# Load Spacy Italian model
nlp = spacy.load("it_core_news_sm")

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting it-core-news-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/it_core_news_sm-3.5.0/it_core_news_sm-3.5.0-py3-none-any.whl (13.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m66.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: it-core-news-sm
Successfully installed it-core-news-sm-3.5.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('it_core_news_sm')


In [15]:
#creating a dataframe with each article and the persons mentioned
persons_df = pd.DataFrame({"id": [], "persons": []})

for i, sentence in enumerate(tqdm(sentences)):
  doc = nlp(sentence)
  entities = [(X.text, X.label_) for X in doc.ents]
  persons = []
  for entity in entities:
    if entity[1] == 'PER':
      persons.append(entity[0])

  persons_df.loc[i] = [i, persons]

100%|██████████| 44129/44129 [10:02<00:00, 73.30it/s]


In [16]:
persons_df

Unnamed: 0,id,persons
0,0,[]
1,1,[]
2,2,"[Vittorio Emanuele Parsi, Zelensky, Meloni]"
3,3,[Zelenski]
4,4,[Professor Parsi]
...,...,...
44124,44124,[Putin]
44125,44125,[Mario Draghi]
44126,44126,[Luigi Di Maio]
44127,44127,[]


In [22]:
#cleaning the dataframe
persons_df_clean = pd.DataFrame({"id": [], "persons": []})

#taking just the last name from all the entities that has both first name and second name
for i in tqdm(range(len(persons_df))):
  persons = persons_df.loc[i][1]
  cleaned_persons = []
  for person in persons:

    cleaned_persons.append(person)
  
  persons_df_clean.loc[i] = [i, cleaned_persons]



100%|██████████| 44129/44129 [01:33<00:00, 473.76it/s]


In [23]:
final_persons_df = pd.DataFrame({"id": [], "persons": []})

for i in range(len(persons_df_clean)):
   final_persons_df.loc[i] = [i, list(set(persons_df_clean.loc[i][1]))]




In [24]:
persons_df_clean

Unnamed: 0,id,persons
0,0,[]
1,1,[]
2,2,"[Vittorio Emanuele Parsi, Zelensky, Meloni]"
3,3,[Zelenski]
4,4,[Professor Parsi]
...,...,...
44124,44124,[Putin]
44125,44125,[Mario Draghi]
44126,44126,[Luigi Di Maio]
44127,44127,[]


In [25]:
final_persons_df
edge_dataframe =  pd.DataFrame({"Source": [], "Target": []})

for i in tqdm(range(len(final_persons_df))):
  persons = final_persons_df.loc[i][1]
  for j in range(len(persons)-1):
    for k in range(j+1,len(persons)):
      values_to_add = {"Source": persons[j], "Target": persons[k]}
      row_to_add = pd.Series(values_to_add)
      edge_dataframe = edge_dataframe.append(row_to_add,ignore_index=True) 

100%|██████████| 44129/44129 [00:16<00:00, 2630.40it/s]


In [26]:
edge_dataframe.head(50)

Unnamed: 0,Source,Target
0,Zelensky,Vittorio Emanuele Parsi
1,Zelensky,Meloni
2,Vittorio Emanuele Parsi,Meloni
3,Putin,Zalensky
4,Putin,Zelensky
5,Putin,XI Jinping
6,Zalensky,Zelensky
7,Zalensky,XI Jinping
8,Zelensky,XI Jinping
9,Berlusconi,Salvini


In [27]:
#function to check if there is a number in a string
def num_there(s):
    return any(i.isdigit() for i in s)

In [32]:
for i, row in edge_dataframe.iterrows():


  if num_there(row[0]) == True or num_there(row[1]) == True:
      edge_dataframe = edge_dataframe.drop(i)

  elif len(row[0]) == 1 or len(row[1]) == 1 :
    edge_dataframe = edge_dataframe.drop(i)
  
  elif 'Regina Caeli' in row[0] or 'Regina Caeli' in row[1]: #we drop these row because Regina Coeli is a prison and not a person
    edge_dataframe = edge_dataframe.drop(i)

  elif '.' in row[0] or '.' in row[1]:
    edge_dataframe = edge_dataframe.drop(i)

  elif '.' in row[0] or '.' in row[1]:
    edge_dataframe = edge_dataframe.drop(i)

  elif '"' in row[0] or '"' in row[1]:
    edge_dataframe = edge_dataframe.drop(i)


In [33]:
edges_Fatto =  pd.DataFrame({"Source": [], "Target": []})

for i in tqdm(range(len(edge_dataframe))):

  

  person1 = edge_dataframe.iloc[i][0]
  person2 = edge_dataframe.iloc[i][1]

  if person1 == 'Volodymyr Zelensky':
    person1 = 'Zelensky'

  if person2 == 'Volodymyr Zelensky':
    person2 = 'Zelensky'

  if person1 == 'Papa' or person1 == 'Papa Francesco' or person1 == 'Francesco' or person1 == 'Sua Santità' or person1 == 'Bergoglio':
    person1 = 'Papa Francesco'

  if person2 == 'Papa' or person2 == 'Papa Francesco' or person2 == 'Francesco' or person2 == 'Sua Santità' or person2 == 'Bergoglio':
    person2 = 'Papa Francesco'

  if person1 == 'Luigi Di Maio':
    person1 = 'Di Maio'

  if person2 == 'Luigi Di Maio':
    person2 = 'Di Maio'


  if person1 == 'Vladimir Putin':
    person1 = 'Putin'

  if person2 == 'Vladimir Putin':
    person2 = 'Putin'

  if "'" in person1:
    person1 = person1.split("'")[0]
  
  if "'" in person2:
    person2 = person2.split("'")[0]
    

  edges_Fatto.loc[i] = [person1, person2]

100%|██████████| 8682/8682 [00:14<00:00, 591.78it/s]


In [47]:

tqdm.pandas()

for i, row1 in tqdm(edges_Fatto.iterrows(), total=edges_Fatto.shape[0]):
    person1 = row1[0]
    
    for j, row2 in edges_Fatto.iterrows():
        person2 = row2[1]
        
        if person1 in person2:
            edges_Fatto.at[j, 'Target'] = person1
        elif person2 in person1:
            edges_Fatto.at[i, 'Source'] = person2

100%|██████████| 8682/8682 [56:50<00:00,  2.55it/s]


In [48]:
edges_Fatto.head(50)

Unnamed: 0,Source,Target
0,Zelensky,Guterres
1,Zelensky,Guterres
2,Emanuel,Guterres
3,Putin,Guterres
4,Putin,Guterres
5,Putin,Guterres
6,Zalensky,Guterres
7,Zalensky,Guterres
8,Zelensky,Guterres
9,Berlusconi,Guterres


In [49]:
edge_dataframe.to_csv('edges_Fatto.csv',index = False)