##### Social Media Analytics
### Introduction to Text Mining
## Text Annotation
(c) Nuno Antonio 2019-2022 v1.02

### Initial setup

In [51]:
# Import packages
import csv
import pandas as pd
import numpy as np
import nltk 
import re
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
import spacy

SyntaxError: invalid syntax (4195836472.py, line 10)

In [15]:
# Load the dataset from the Parquet file, reading it with the fastparquet engine
ds = pd.read_parquet("sputnikSentimentComplete.parquet.snappy", engine="fastparquet")

In [16]:
# Parse the "date" column into datetimes; the format string expects day/month/two-digit year
ds["date"] = pd.to_datetime(ds["date"], format="%d/%m/%y")

In [17]:
# Fetch the 'punkt' tokenizer data that NLTK's word_tokenize relies on
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/henrique/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [26]:
# Fetch the perceptron tagger model that nltk.pos_tag relies on
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/henrique/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

### Functions

In [6]:
# Text preprocessing
def textPreProcess(rawText, removeHTML=True, charsToRemove = r"\?|\.|\!|\;|\.|\"|\,|\(|\)|\&|\:|\-| u |\'s|\/\/t|\[|\]|\*", removeNumbers=True, removeLineBreaks=False, specialCharsToRemove = r'[^\x00-\xfd]', convertToLower=True, removeConsecutiveSpaces=True):
    """Clean a raw text string through a fixed sequence of optional steps.

    The enabled steps run in this order: strip HTML markup, blank out matches
    of ``charsToRemove``, blank out digit runs, flatten line breaks, blank out
    matches of ``specialCharsToRemove``, lower-case, collapse repeated spaces.

    Args:
        rawText: Text to clean. Any value that is not exactly a ``str`` is
            returned untouched.
        removeHTML: Parse the text with BeautifulSoup and keep only its text.
        charsToRemove: Regex whose matches are replaced by a single space;
            an empty string skips the step.
        removeNumbers: Replace every run of digits with a single space.
        removeLineBreaks: Turn newlines into spaces and drop carriage returns.
        specialCharsToRemove: Regex whose matches are replaced by a single
            space; an empty string skips the step.
        convertToLower: Lower-case the text.
        removeConsecutiveSpaces: Collapse runs of spaces into one space
            (leading/trailing spaces are not stripped).

    Returns:
        The cleaned string, or ``rawText`` itself when it is not a ``str``.
    """
    # Non-string values are handed back unchanged
    if type(rawText) is not str:
        return rawText

    cleaned = rawText

    if removeHTML:
        # Keep only the text content of the parsed markup
        cleaned = BeautifulSoup(cleaned, 'html.parser').get_text()

    if len(charsToRemove) > 0:
        # Punctuation and other unwanted fragments become spaces
        cleaned = re.sub(charsToRemove, ' ', cleaned)

    if removeNumbers:
        cleaned = re.sub(r'\d+', ' ', cleaned)

    if removeLineBreaks:
        # "\n" turns into a space, "\r" is removed outright
        cleaned = cleaned.replace('\n', ' ')
        cleaned = cleaned.replace('\r', '')

    if len(specialCharsToRemove) > 0:
        cleaned = re.sub(specialCharsToRemove, ' ', cleaned)

    if convertToLower:
        cleaned = cleaned.lower()

    if removeConsecutiveSpaces:
        cleaned = re.sub(' +', ' ', cleaned)

    return cleaned

In [19]:
# Tokenize words
def tokenize_words(words):
    """Split a text string into word tokens with NLTK's ``word_tokenize``.

    Args:
        words: The text to tokenize.

    Returns:
        The list of tokens for a string input (an empty list when the text
        yields no tokens), or ``np.nan`` when ``words`` is not a string.
    """
    # Non-string values (e.g. missing descriptions) cannot be tokenized
    if not isinstance(words, str):
        return np.nan
    # Tokenize exactly once. The earlier version also compared the token list
    # with '' -- a list never equals a string, so that check was always False
    # and only doubled the tokenization work.
    return word_tokenize(words)

### Analysis

In [5]:
# Build a one-column dataframe with the preprocessed review descriptions,
# keeping the same index as the source dataset
processedReviews = pd.DataFrame(
    data=ds.RevDescription.apply(textPreProcess).values,
    index=ds.index,
    columns=['PreProcessedText'],
)



In [23]:
# Tokenize text: each preprocessed description becomes a list of word tokens
# (tokenize_words returns NaN for values that are not strings)
processedReviews['Words'] =  processedReviews['PreProcessedText'].apply(tokenize_words)

#### English

In [35]:
# Part-of-speech tag every tokenized review with NLTK.
# tokenize_words yields NaN for non-string input and nltk.pos_tag cannot
# iterate over NaN, so rows without tokens are dropped before tagging; the
# remaining rows keep their original index labels.
Tags = processedReviews.Words.dropna().apply(nltk.pos_tag)

In [38]:
# Inspect the (token, tag) pairs produced for the entry at key 0
print(Tags[0])

[('the', 'DT'), ('russian', 'JJ'), ('ministry', 'NN'), ('of', 'IN'), ('defense', 'NN'), ('mod', 'NN'), ('has', 'VBZ'), ('released', 'VBN'), ('a', 'DT'), ('video', 'NN'), ('showing', 'VBG'), ('the', 'DT'), ('work', 'NN'), ('of', 'IN'), ('army', 'NN'), ('sappers', 'NNS'), ('in', 'IN'), ('the', 'DT'), ('zone', 'NN'), ('of', 'IN'), ('moscow', 'NN'), ('s', 'NN'), ('special', 'JJ'), ('military', 'JJ'), ('operation', 'NN'), ('in', 'IN'), ('ukraine', 'NN'), ('in', 'IN'), ('footage', 'NN'), ('published', 'VBN'), ('on', 'IN'), ('the', 'DT'), ('mod', 'NN'), ('s', 'NN'), ('telegram', 'NN'), ('page', 'NN'), ('the', 'DT'), ('servicemen', 'NNS'), ('are', 'VBP'), ('seen', 'VBN'), ('performing', 'VBG'), ('a', 'DT'), ('controlled', 'JJ'), ('explosion', 'NN'), ('of', 'IN'), ('the', 'DT'), ('projectiles', 'NNS'), ('that', 'WDT'), ('had', 'VBD'), ('apparently', 'RB'), ('been', 'VBN'), ('abandoned', 'VBN'), ('by', 'IN'), ('ukrainian', 'JJ'), ('units', 'NNS'), ('and', 'CC'), ('then', 'RB'), ('detected', 'VBN

In [45]:
# Keep only the nouns: tokens whose POS tag begins with "N"
nouns = [
    word
    for tagged_review in Tags
    for word, pos in tagged_review
    if pos[0] == "N"
]
print(nouns)



In [56]:
# Load spaCy's small English pipeline
# (install it with: python -m spacy download en_core_web_sm).
# More models in https://spacy.io/models
nlp = spacy.load("en_core_web_sm")

# Run the pipeline over every preprocessed text
Spacetags = processedReviews.PreProcessedText.apply(nlp)

# Print each token of the entry at key 0 next to its coarse part-of-speech tag
for token in Spacetags[0]:
    print(token, token.pos_)

the DET
russian ADJ
ministry PROPN
of ADP
defense PROPN
mod PROPN
has AUX
released VERB
a DET
video NOUN
showing VERB
the DET
work NOUN
of ADP
army NOUN
sappers NOUN
in ADP
the DET
zone NOUN
of ADP
moscow PROPN
s PART
special ADJ
military ADJ
operation NOUN
in ADP
ukraine PROPN
in ADP
footage NOUN
published VERB
on ADP
the DET
mod PROPN
s PROPN
telegram PROPN
page NOUN
the DET
servicemen NOUN
are AUX
seen VERB
performing VERB
a DET
controlled VERB
explosion NOUN
of ADP
the DET
projectiles NOUN
that PRON
had AUX
apparently ADV
been AUX
abandoned VERB
by ADP
ukrainian ADJ
units NOUN
and CCONJ
then ADV
detected VERB
by ADP
russian ADJ
forces NOUN
in ADP
an DET
unspecified ADJ
area NOUN
the DET
mod PROPN
quoted VERB
a DET
demining VERB
platoon NOUN
commander NOUN
as ADP
saying VERB
that SCONJ
more ADJ
than ADP
shells NOUN
have AUX
already ADV
been AUX
destroyed VERB
sowing VERB
machinery NOUN
will AUX
soon ADV
ride VERB
across ADP
these DET
fields NOUN
and CCONJ
life NOUN
will AUX
return V

In [57]:
# Collect every token that spaCy tagged as an adjective ("ADJ")
adjectives = [
    token
    for doc in Spacetags
    for token in doc
    if token.pos_ == "ADJ"
]
print(adjectives)

[russian, special, military, ukrainian, russian, unspecified, more, normal, past, ukrainian, armored, ukrainian, armed, donetsk, donetsk, ukrainian, armed, western, various, multiple, anti, military, further, western, armed, underground, unnamed, russian, nearby, ukrainian, incendiary, incendiary, intact, small, common, soviet, second, such, small, other, small, complete, special, military, strategic, possible, ukrainian, important, successful, offensive, donetsk, russian, ukrainian, nearby, latest, heavy, underway, western, russian, russian, airborne, northern, southern, ukrainian, past, ukrainian, foreign, armored, russian, armed, ukrainian, temporary, ukraine, donetsk, temporary, mechanized, ukrainian, armed, ukraine, special, ukrainian, past, ukrainian, russian, russian, grad, russian, multiple, ukrainian, front, special, military, several, russian, ukrainian, military, russian, russian, grad, russian, unmanned, aerial, territorial, armed, ukrainian, structural, bryansk, special, m

In [92]:
# Put the adjectives in a Series and convert each entry to a plain string
df = pd.Series(adjectives).astype(str)

In [93]:
# Absolute (non-normalized) number of occurrences of each adjective
bow = df.value_counts(normalize=False)

In [94]:
# Preview the first rows of the adjective Series
df.head()

0      russian
1      special
2     military
3    ukrainian
4      russian
dtype: object

In [97]:
# Show the first 30 entries of the adjective counts
bow.head(30)

russian          4801
ukrainian        3877
military         3053
special          1457
western          1090
other             835
nuclear           758
lugansk           612
more              605
new               574
armed             554
ukraine           549
foreign           518
last              505
donetsk           497
such              469
international     401
dpr               388
former            377
anti              372
same              341
many              337
latest            336
several           328
european          317
civilian          304
possible          301
high              288
humanitarian      281
first             275
Name: count, dtype: int64

In [83]:
from collections import defaultdict

# defaultdict(int) supplies int() == 0 as the value for any key that is missing
my_dict = defaultdict(int)
print(my_dict['nonexistent_key'])  # Accessing a non-existent key

# Output: 0


0


In [84]:

    # Create a defaultdict to store word frequencies
    word_frequency = defaultdict(int)
    
    # Count the frequency of each word
    for word in adjectives
        word_frequency[word] += 1
    
    return word_frequency

# Example usage: count the collected adjectives and print the resulting mapping
frequency = count_word_frequency(adjectives)
print(frequency)


NameError: name 'words' is not defined