##### Social Media Analytics
### Introduction to Text Mining
## Text Annotation
(c) Nuno Antonio 2019-2022 v1.02

### Initial setup

In [10]:
# Import packages
import csv
import pandas as pd
import numpy as np
import nltk 
import re
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
import nltk

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


[('us', 'PRP'), ('ambassador', 'VBP'), ('to', 'TO'), ('russia', 'VB'), ('lynne', 'JJ'), ('tracy', 'NN'), ('visited', 'VBD'), ('paul', 'JJ'), ('whelan', 'NN'), ('on', 'IN'), ('thursday', 'NN'), ('her', 'PRP$'), ('first', 'JJ'), ('visit', 'NN'), ('to', 'TO'), ('the', 'DT'), ('detained', 'VBN'), ('american', 'JJ'), ('since', 'IN'), ('taking', 'VBG'), ('up', 'RP'), ('the', 'DT'), ('post', 'NN'), ('in', 'IN'), ('moscow', 'NN'), ('earlier', 'RBR'), ('this', 'DT'), ('year', 'NN'), ('his', 'PRP$'), ('release', 'NN'), ('remains', 'VBZ'), ('an', 'DT'), ('absolute', 'NN'), ('priority', 'NN'), ('the', 'DT'), ('us', 'PRP'), ('embassy', 'VBP'), ('in', 'IN'), ('moscow', 'NN'), ('said', 'VBD'), ('on', 'IN'), ('twitter', 'NN'), ('whelan', 'NN'), ('is', 'VBZ'), ('serving', 'VBG'), ('out', 'RP'), ('his', 'PRP$'), ('prison', 'NN'), ('sentence', 'NN'), ('at', 'IN'), ('a', 'DT'), ('prison', 'NN'), ('camp', 'NN'), ('in', 'IN'), ('mordovia', 'NN'), ('an', 'DT'), ('eight-hour', 'JJ'), ('drive', 'NN'), ('from',

In [2]:
# Load dataset
dtypes = {'title':'category','author':'category','text':'category'}
ds = pd.read_csv("CNNArticles.csv", sep=",", 
                 error_bad_lines=False, dtype=dtypes, decimal=',', 
                 index_col='Unnamed: 0', parse_dates=['date'])



  ds = pd.read_csv("CNNArticles.csv", sep=",",


### Functions

In [3]:
# Text preprocessing
def textPreProcess(rawText, removeHTML=True, charsToRemove = r'\?|\.|\!|\;|\.|\"|\,|\(|\)|\&|\:|\|[0-9]|--| [ ] ', removeNumbers=True, removeLineBreaks=False, specialCharsToRemove = r'[^\x00-\xfd]', convertToLower=True, removeConsecutiveSpaces=True):
    if type(rawText) != str:
        return rawText
    procText = rawText
        
    # Remove HTML
    if removeHTML:
        procText = BeautifulSoup(procText,'html.parser').get_text()

    # Remove punctuation and other special characters
    if len(charsToRemove)>0:
        procText = re.sub(charsToRemove,' ',procText)

    # Remove numbers
    if removeNumbers:
        procText = re.sub(r'\d+',' ',procText)

    # Remove line breaks
    if removeLineBreaks:
        procText = procText.replace('\n',' ').replace('\r', '')

    # Remove special characters
    if len(specialCharsToRemove)>0:
        procText = re.sub(specialCharsToRemove,' ',procText)

    # Normalize to lower case
    if convertToLower:
        procText = procText.lower() 

    # Replace multiple consecutive spaces with just one space
    if removeConsecutiveSpaces:
        procText = re.sub(' +', ' ', procText)

    return procText

In [4]:
# Tokenize words
def tokenize_words(words):
    if (type(words) != str) or (word_tokenize(words) == ''):
        return np.nan
    else:
        return word_tokenize(words)

### Analysis

In [5]:
# Create a dataframe with only the description
dsprocessedText =  pd.DataFrame(data=ds.text.apply(textPreProcess).values, index=ds.index, columns=['PreProcessedText']) 

In [7]:
# Tokenize text
dsprocessedText['Words'] =  dsprocessedText['PreProcessedText'].apply(tokenize_words)

#### English

In [11]:

import nltk

# Download the resource
nltk.download('averaged_perceptron_tagger')

# Perform POS tagging
tags = nltk.pos_tag(dsprocessedText.Words[0])
print(tags)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


[('us', 'PRP'), ('ambassador', 'VBP'), ('to', 'TO'), ('russia', 'VB'), ('lynne', 'JJ'), ('tracy', 'NN'), ('visited', 'VBD'), ('paul', 'JJ'), ('whelan', 'NN'), ('on', 'IN'), ('thursday', 'NN'), ('her', 'PRP$'), ('first', 'JJ'), ('visit', 'NN'), ('to', 'TO'), ('the', 'DT'), ('detained', 'VBN'), ('american', 'JJ'), ('since', 'IN'), ('taking', 'VBG'), ('up', 'RP'), ('the', 'DT'), ('post', 'NN'), ('in', 'IN'), ('moscow', 'NN'), ('earlier', 'RBR'), ('this', 'DT'), ('year', 'NN'), ('his', 'PRP$'), ('release', 'NN'), ('remains', 'VBZ'), ('an', 'DT'), ('absolute', 'NN'), ('priority', 'NN'), ('the', 'DT'), ('us', 'PRP'), ('embassy', 'VBP'), ('in', 'IN'), ('moscow', 'NN'), ('said', 'VBD'), ('on', 'IN'), ('twitter', 'NN'), ('whelan', 'NN'), ('is', 'VBZ'), ('serving', 'VBG'), ('out', 'RP'), ('his', 'PRP$'), ('prison', 'NN'), ('sentence', 'NN'), ('at', 'IN'), ('a', 'DT'), ('prison', 'NN'), ('camp', 'NN'), ('in', 'IN'), ('mordovia', 'NN'), ('an', 'DT'), ('eight-hour', 'JJ'), ('drive', 'NN'), ('from',

In [16]:
# Filter only Nouns
nouns = []
for tag in tags:
    if tag[1][0]=="V":  # if if starts with a "N"
        nouns.append(tag[0])
print(nouns)

['ambassador', 'russia', 'visited', 'detained', 'taking', 'remains', 'embassy', 'said', 'is', 'serving', 'holds', 'was', 'detained', 'sentenced', 'denies', 'described', "'s", 'held', 'are', 'was', 'has', 'spoken', 'was', 'secure', 'brought', 'trevor', 'reed', 'detained', 'is', 'has', 'been', 'designated', 'detained', 'was', 'imprisoned', 'read', 'does', 'have', 'needed', 'provide', 'alleged', 'according', "'ve", 'seen', 'deny', 'having', 'engaged', 'do', 'have', 'allow', 'provide', 'said', 'armed', 'hearing', 'confirmed', 'does', 'spend', 'denied', 'has', 'been', 'think', 'are', 'said', 'lt', 'said', 'tried', 'assassinate', 'was', 'cnn', 'showing', 'were', 'flown', 'compound', 'did', 'show', 'was', 'has', 'denied', 'saying', 'strikes', 'is', 'launching', 'has', 'called', 'was', 'involved', 'be', 'mount', 'is', 'according', 'does', 'initiate', 'existing', 'be', 'challenging', 'sustain', 'testified', 'armed', 'has', 'scaled', 'consider', 'consolidate', 'ensuring', 'become', 'added', 'hai

#### English language 

In [22]:
# English POS Tagg - Using Spacy
import spacy    # May require installation
nlp = spacy.load("en_core_web_sm") # Load language model (python -m spacy download es_core_news_sm). More models in https://spacy.io/models
result = nlp(dsprocessedText.PreProcessedText[0])
for token in result:
  print(token, token.pos_)

us PRON
ambassador NOUN
to ADP
russia PROPN
lynne PROPN
tracy PROPN
visited VERB
paul PROPN
whelan PROPN
on ADP
thursday PROPN
her PRON
first ADJ
visit NOUN
to ADP
the DET
detained VERB
american PROPN
since SCONJ
taking VERB
up ADP
the DET
post NOUN
in ADP
moscow PROPN
earlier ADV
this DET
year NOUN
his PRON
release NOUN
remains VERB
an DET
absolute ADJ
priority NOUN
the DET
us PROPN
embassy NOUN
in ADP
moscow PROPN
said VERB
on ADP
twitter NOUN
   SPACE
whelan NOUN
is AUX
serving VERB
out ADP
his PRON
prison NOUN
sentence NOUN
at ADP
a DET
prison NOUN
camp NOUN
in ADP
mordovia NOUN
an DET
eight NUM
- PUNCT
hour NOUN
drive NOUN
from ADP
moscow PROPN
background NOUN
on ADP
whelan PROPN
's PART
case NOUN
the DET
american ADJ
citizen NOUN
who PRON
also ADV
holds VERB
irish ADJ
british ADJ
and CCONJ
canadian ADJ
citizenship NOUN
was AUX
detained VERB
in ADP
russia PROPN
in ADP
december PROPN
and CCONJ
later ADV
sentenced VERB
to ADP
years NOUN
in ADP
prison NOUN
on ADP
an DET
espionage NOU

thursday PROPN
as SCONJ
we PRON
know VERB
in ADP
these DET
military ADJ
aid NOUN
packages NOUN
there PRON
are VERB
often ADV
practical ADJ
obstacles NOUN
but CCONJ
i PRON
can AUX
tell VERB
you PRON
that SCONJ
there PRON
is VERB
not PART
a DET
single ADJ
doubt NOUN
in ADP
any PRON
of ADP
the DET
belgian ADJ
hearts NOUN
and CCONJ
minds NOUN
that SCONJ
we PRON
need VERB
to PART
continue VERB
to PART
support VERB
ukraine PROPN
de PROPN
croo PROPN
told VERB
a DET
news NOUN
conference NOUN
in ADP
the DET
hague NOUN
alongside ADP
ukrainian PROPN
president PROPN
volodymr VERB
zelensky NOUN
and CCONJ
dutch ADJ
prime PROPN
minister PROPN
mark PROPN
rutte PROPN
   SPACE
de X
croo PROPN
also ADV
said VERB
belgium NOUN
was AUX
examining VERB
how SCONJ
seized VERB
russian ADJ
assets NOUN
could AUX
be AUX
fully ADV
used VERB
to PART
support VERB
ukraine PROPN
s PART
war NOUN
effort NOUN
as ADV
well ADV
as ADP
to PART
aid VERB
the DET
reconstruction NOUN
of ADP
the DET
war NOUN
- PUNCT
torn VERB
count

extraordinary ADJ
allegations NOUN
over ADP
the DET
incident NOUN
on ADP
wednesday PROPN
when SCONJ
asked VERB
by ADP
cnn PROPN
if SCONJ
the DET
kremlin PROPN
believed VERB
the DET
us PROPN
was AUX
behind ADP
the DET
attack NOUN
kremlin PROPN
spokesperson PROPN
dmitry PROPN
peskov NOUN
said VERB
undoubtedly ADV
such ADJ
decisions NOUN
the DET
definition NOUN
of ADP
goals NOUN
the DET
definition NOUN
of ADP
means NOUN
all DET
this PRON
is AUX
dictated VERB
to ADP
kyiv NOUN
from ADP
washington PROPN
   SPACE
we PRON
are AUX
well ADV
aware ADJ
of ADP
this PRON
he PRON
added VERB
we PRON
are AUX
well ADV
aware ADJ
that SCONJ
decisions NOUN
on ADP
such ADJ
actions NOUN
and CCONJ
such ADJ
terrorist ADJ
attacks NOUN
are AUX
not PART
made VERB
in ADP
kyiv NOUN
but CCONJ
in ADP
washington PROPN
and CCONJ
kyiv ADJ
is AUX
already ADV
executing VERB
what PRON
it PRON
is AUX
told VERB
to PART
do VERB
peskov NOUN
said VERB
 
  SPACE
such ADJ
attempts NOUN
to PART
disown VERB
this PRON
both CCONJ
in 

's PART
military ADJ
chief NOUN
said VERB

 SPACE
moscow PROPN
batters NOUN
kherson VERB
the DET
death NOUN
toll NOUN
from ADP
russian ADJ
shelling NOUN
in ADP
the DET
southern ADJ
ukrainian ADJ
city NOUN
and CCONJ
its PRON
surrounding VERB
villages NOUN
has AUX
risen VERB
to ADP
at ADP
least ADJ
people NOUN
moscow PROPN
struck VERB
kherson NOUN
at ADP
least ADJ
times NOUN
firing VERB
over ADP
shells NOUN
at ADP
pryvokzalna PROPN
square PROPN
a DET
railway NOUN
station NOUN
and CCONJ
crossing VERB
a DET
gas NOUN
station NOUN
two NUM
stores NOUN
a DET
factory NOUN
and CCONJ
a DET
car NOUN
repair NOUN
shop VERB
the DET
regional ADJ
military ADJ
administration NOUN
said VERB

 SPACE
russian ADJ
oil NOUN
plant NOUN
fire NOUN
a DET
blaze NOUN
broke VERB
out ADP
at ADP
a DET
petroleum NOUN
plant NOUN
in ADP
southwestern ADJ
russia PROPN
on ADP
wednesday PROPN
night NOUN
after SCONJ
a DET
drone NOUN
attack NOUN
according VERB
to ADP
the DET
regional ADJ
governor NOUN
it PRON
was AUX
the DET
t

he PRON
met VERB
with ADP
his PRON
finnish ADJ
and CCONJ
other ADJ
nordic ADJ
counterparts NOUN
on ADP
wednesday PROPN
in ADP
a DET
statement NOUN
ahead ADV
of ADP
that DET
meeting NOUN
the DET
finnish ADJ
president NOUN
said VERB
the DET
prime ADJ
ministers NOUN
of ADP
sweden PROPN
norway NOUN
denmark NOUN
and CCONJ
iceland NOUN
planned VERB
to PART
discuss VERB
the DET
war NOUN
in ADP
ukraine PROPN
and CCONJ
kyiv PROPN
's PART
initiative NOUN
for ADP
a DET
just ADJ
peace NOUN
ukraine NOUN
  SPACE
has AUX
vehemently ADV
denied VERB
russia PROPN
's PART
allegation NOUN
that SCONJ
it PRON
attempted VERB
to PART
assassinate VERB
russian ADJ
president NOUN
  SPACE
vladimir PROPN
putin PROPN
in ADP
a DET
drone NOUN
strike NOUN
at ADP
the DET
kremlin PROPN
overnight ADV
on ADP
wednesday PROPN
moscow PROPN
said VERB
putin PROPN
was AUX
not PART
in ADP
the DET
building NOUN
at ADP
the DET
time NOUN
of ADP
the DET
attack NOUN
meanwhile ADV
a DET
former ADJ
russian ADJ
lawmaker NOUN
linked VERB

tuesday PROPN
into ADP
wednesday PROPN
leaving VERB
a DET
crater NOUN
in ADP
a DET
genteel ADJ
freshly ADV
sculpted VERB
lawn NOUN
nobody PRON
died VERB
in ADP
this DET
last ADJ
strike NOUN
but CCONJ
perhaps ADV
only ADV
because SCONJ
the DET
first ADJ
missile NOUN
sent VERB
two NUM
families NOUN
rushing VERB
for ADP
cover NOUN
before SCONJ
the DET
second ADJ
struck VERB
during ADP
the DET
night NOUN
the DET
city NOUN
of ADP
zaporizhzhia PROPN
was AUX
blasted VERB
with ADP
repeated VERB
air NOUN
raid NOUN
sirens VERB
a DET
familiar ADJ
noise NOUN
in ADP
the DET
past ADJ
months NOUN
but CCONJ
this DET
time NOUN
accompanied VERB
by ADP
explosions NOUN
suggesting VERB
moscow PROPN
s PART
escalation NOUN
as SCONJ
russian ADJ
forces NOUN
apparently ADV
send VERB
s PART
missiles NOUN
into ADP
cities NOUN
according VERB
to ADP
local ADJ
officials NOUN
and CCONJ
accounts NOUN
we PRON
don VERB
t PROPN
often ADV
know VERB
when SCONJ
russia PROPN
hits VERB
a DET
military ADJ
target NOUN
in ADP
uk

In [23]:
# Filter only Adjectives
adjectives = []
for token in result:
    if token.pos_=="ADJ":
        adjectives.append(token)
print(adjectives)

[first, absolute, american, irish, british, canadian, better, most, bad, first, unable, last, other, other, other, more, more, national, ukrainian, independent, russian, much, past, ambiguous, silent, exaggerated, american, likely, early, ukrainian, own, ridiculous, able, significant, offensive, ukrainian, successful, national, mandatory, secure, substantial, third, modest, offensive, near, eastern, southern, likely, political, russian, new, defensive, ukrainian, less, previous, russian, ukrainian, persistent, significant, critical, undersea, undersea, other, critical, western, ukraine, more, international, undersea, worth, financial, economic, significant, own, other, holistic, multi, -, spectrum, multi, -, domain, critical, private, crucial, clear, purported, russian, ludicrous, strategic, individual, latest, extraordinary, aware, such, such, terrorist, kyiv, new, military, belgian, military, practical, single, belgian, dutch, russian, dutch, military, kyiv, f-, unresolved, several, 

In [24]:
# Filter only Verbs
verbs = []
for token in result:
    if token.pos_=="VERB":
        verbs.append(token)
print(verbs)

[visited, detained, taking, remains, said, serving, holds, detained, sentenced, denies, described, held, spoken, secure, brought, detained, trevor, detained, designated, detained, imprisoned, read, have, needed, provide, alleged, according, haines, seen, deny, engaged, have, allow, provide, said, confirmed, spend, denied, think, said, said, tried, assassinate, showing, support, flown, show, denied, saying, strikes, launching, called, involved, mount, according, initiate, existing, challenging, sustain, testified, scaled, consider, consolidate, occupied, ensuring, become, added, said, negotiates, alter, noted, preparing, gained, stands, is, target, allied, including, said, are, heightened, target, disrupt, gain, providing, told, according, transmitted, carry, estimated, said, adding, mapping, said, added, working, developing, testing, monitoring, stated, set, comes, allied, said, denied, accused, lying, tell, lying, mean, had, do, know, happened, said, kirby, assure, had, has, said, end