##### Social Media Analytics
### Introduction to Text Mining
## Sentiment Analysis
(c) Nuno Antonio 2019-2022 v1.02

### Initial setup

In [1]:
# Import packages
import csv
import pickle
import re

import nltk
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from tqdm import tqdm
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [2]:
ds = pd.read_parquet("sputnik.parquet.snappy", engine="fastparquet")

In [3]:
ds["title"] = ds["title"].astype("string")
ds["author"] = ds["author"].astype("string")
ds["text"] = ds["text"].astype("string")

In [4]:
ds.head()

Unnamed: 0,link,title,author,date,text,comments
0,/20230501/watch-russian-army-sappers-blow-up-a...,Watch Russian Army Sappers Blow Up Abandoned Ammo,Oleg Burunov https://cdn1.img.sputnikglobe.com...,01/05/23,The Russian Ministry of Defense (MoD) has rele...,[]
1,/20230501/kiev-lost-over-300-soldiers-over-pas...,Kiev Lost Over 300 Soldiers Over Past 24 Hours...,Sputnik International,01/05/23,"""Over the past day, the aviation carried out s...",[330 US mercenaries with mostly Ukrainian pass...
2,/20230430/russia-destroys-up-to-200-tonnes-of-...,Russia Destroys Up to 200 Tons of Ukrainian Am...,Sputnik International,30/04/23,"""As a result of a strike on an echelon at a ra...",[Very soon they will only have stones to throw...
3,/20230430/russian-forces-discover-underground-...,Russian Forces Discover Underground Soledar Ar...,Oleg Burunov https://cdn1.img.sputnikglobe.com...,30/04/23,The Armed Forces of Ukraine failed in its effo...,[Kudos to the Russian explosive ordnance dispo...
4,/20230430/ukraine-loses-over-480-military-merc...,"Ukraine Loses Over 480 Military, Mercenaries i...",Sputnik International,30/04/23,"""Over the past 24 hours, over 480 Ukrainian se...",[]


In [5]:
ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1719 entries, 0 to 1718
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   link      1719 non-null   object
 1   title     1719 non-null   string
 2   author    1719 non-null   string
 3   date      1719 non-null   object
 4   text      1719 non-null   string
 5   comments  1719 non-null   object
dtypes: object(3), string(3)
memory usage: 80.7+ KB


In [6]:
ds = ds.dropna(subset=["text"])

### Functions

In [7]:
# Text preprocessing
def textPreProcess(
    rawText,
    removeHTML=True,
    charsToRemove=r"\?|\.|\!|\;|\.|\"|\,|\(|\)|\&|\:|\-",
    removeNumbers=True,
    removeLineBreaks=False,
    specialCharsToRemove=r"[^\x00-\xfd]",
    convertToLower=True,
    removeConsecutiveSpaces=True,
):
    if type(rawText) != str:
        return rawText
    procText = rawText

    # Remove HTML
    if removeHTML:
        procText = BeautifulSoup(procText, "html.parser").get_text()

    # Remove punctuation and other special characters
    if len(charsToRemove) > 0:
        procText = re.sub(charsToRemove, " ", procText)

    # Remove numbers
    if removeNumbers:
        procText = re.sub(r"\d+", " ", procText)

    # Remove line breaks
    if removeLineBreaks:
        procText = procText.replace("\n", " ").replace("\r", "")

    # Remove special characters
    if len(specialCharsToRemove) > 0:
        procText = re.sub(specialCharsToRemove, " ", procText)

    # Normalize to lower case
    if convertToLower:
        procText = procText.lower()

    # Replace multiple consecutive spaces with just one space
    if removeConsecutiveSpaces:
        procText = re.sub(" +", " ", procText)

    return procText

In [8]:
# Tokenize words
def tokenize_words(words):
    if (type(words) != str) or (word_tokenize(words) == ""):
        return np.nan
    else:
        return word_tokenize(words)

In [9]:
# Function to create text from words
def recreateText(words):
    if type(words) == list:
        temp_str = (" ").join(words)
        return temp_str
    else:
        return np.nan

In [10]:
# Function to break texts into sentences
def tokenize_sentences(texts):
    s_token = sent_tokenize(texts)
    return s_token

In [11]:
# Function to remove stop words
def removeStopWords(t, stop_words):
    if type(t) == list:
        return [w for w in t if not w in stop_words]
    else:
        return np.nan

### Analysis

In [12]:
# Because a review can express multiple opinions, let's analyze opinions by sentence

# Break reviews' into a list of lists sentencesc
listOfSentences = ds.text.apply(tokenize_sentences)

In [13]:
# Create a dataframe with only the description
processedReviews = pd.DataFrame(
    data=ds.text.apply(
        textPreProcess, charsToRemove="", removeLineBreaks=False, removeNumbers=False
    ).values,
    index=ds.index,
    columns=["PreProcessedText"],
)

In [14]:
# Check first review
ds.text[0]

'The Russian Ministry of Defense (MoD) has released a video showing the work of army sappers in the zone of Moscow’s special military operation in Ukraine.In footage published on the MoD’s Telegram page, the servicemen are seen performing a controlled explosion of the projectiles that had apparently been abandoned by Ukrainian units and then detected by Russian forces in an unspecified area.The MoD quoted a demining platoon commander as saying that more than 1,000 shells have already been destroyed."Sowing machinery will soon ride across these fields and life will return back to normal there," he added.'

In [15]:
# Sentences of first review
listOfSentences[0]

['The Russian Ministry of Defense (MoD) has released a video showing the work of army sappers in the zone of Moscow’s special military operation in Ukraine.In footage published on the MoD’s Telegram page, the servicemen are seen performing a controlled explosion of the projectiles that had apparently been abandoned by Ukrainian units and then detected by Russian forces in an unspecified area.The MoD quoted a demining platoon commander as saying that more than 1,000 shells have already been destroyed.',
 '"Sowing machinery will soon ride across these fields and life will return back to normal there," he added.']

In [16]:
# Create DataFrame for sentences
sentences = pd.DataFrame(
    data=[item for elem in listOfSentences for item in elem], columns=["BaseText"]
)

In [17]:
# Add a column with the review ID
sentencesPerReview = []
for elem in listOfSentences:
    sentencesPerReview.append(len(elem))
sentences["link"] = np.repeat(ds["link"].values, sentencesPerReview)

In [18]:
# Preprocess text
sentences["PreProcessedText"] = sentences["BaseText"].apply(textPreProcess)



In [19]:
# Get words
sentences["Words"] = sentences["PreProcessedText"].apply(tokenize_words)

In [20]:
# Remove stopwords
stop_words = set(stopwords.words("english"))
sentences["WordsCleaned"] = sentences["Words"].apply(
    removeStopWords, stop_words=stop_words
)

In [21]:
# Recreate sentence without stopwords
sentences["ProcessedText"] = sentences["WordsCleaned"].apply(recreateText)

In [22]:
# Create sentiment analysis object
analyser = SentimentIntensityAnalyzer()

In [23]:
# To test, let's evaluate first sentence of first review
# Scales:
#   compound: -1:most extreme negative, 1:most extreme positive
#     positive: compound >=0.05
#     neutral: -0.05<compound<0.05
#     negative: compound <= -0.05
#   pos, neu, neg: proportion of text that are positive, neutral or negative
score = analyser.polarity_scores(sentences["ProcessedText"][0])
print(sentences["ProcessedText"][0], score)

russian ministry defense mod released video showing work army sappers zone moscow special military operation ukraine footage published mod telegram page servicemen seen performing controlled explosion projectiles apparently abandoned ukrainian units detected russian forces unspecified area mod quoted demining platoon commander saying shells already destroyed {'neg': 0.121, 'neu': 0.798, 'pos': 0.082, 'compound': -0.4588}


In [25]:
# Computing sentences sentiment requires around 40 minutes, using this file we have the saved results.

open_file = open("sentences-sputnik.pkl", "rb")
sentences = pickle.load(open_file)
open_file.close()

In [25]:
"""
# Process sentiment for all sentences
all_scores = []
for t in tqdm(sentences["ProcessedText"]):
    try:
        score = analyser.polarity_scores(t)
        all_scores.append(score)
    except:
        all_scores.append(dict({"neg": 0.0, "neu": 0.0, "pos": 0.0, "compound": 0.0}))
sentences["Sentiment"] = [c["compound"] for c in all_scores]
"""

100%|██████████| 15285/15285 [40:46<00:00,  6.25it/s]  


In [30]:
"""
file_name = "sentences-sputnik.pkl"

open_file = open(file_name, "wb")
pickle.dump(sentences, open_file)
open_file.close()
"""

In [33]:
# this row was excluded by sentiment dataset because we had an error during the generation of the score, so doing everything from scratch can fix it
ds = ds[
    ds["link"]
    != "/20221219/progress-and-results-of-russias-special-military-operation-in-ukraine-1099472571.html"
]

In [34]:
# Compute review's sentiment as the mean sentiment from its sentences
meanByReview = sentences.groupby("link")["Sentiment"].mean()

# Consider sentences with no result as neutral (0)
meanByReview = meanByReview.fillna(0)

# Add column Sentiment to reviews Dataframe
ds["Sentiment"] = meanByReview[ds["link"]].values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ds["Sentiment"] = meanByReview[ds["link"]].values


In [35]:
# Assign a qualitative evaluation to the review
bins = pd.IntervalIndex.from_tuples(
    [(-1.1, -0.05), (-0.05, 0.05), (0.05, 1)], closed="right"
)
x = pd.cut(ds["Sentiment"].to_list(), bins)
x.categories = ["Negative", "Neutral", "Positive"]
ds["Polarity"] = x

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ds["Polarity"] = x


In [37]:
ds.head()

Unnamed: 0,link,title,author,date,text,comments,Sentiment,Polarity
0,/20230501/watch-russian-army-sappers-blow-up-a...,Watch Russian Army Sappers Blow Up Abandoned Ammo,Oleg Burunov https://cdn1.img.sputnikglobe.com...,01/05/23,The Russian Ministry of Defense (MoD) has rele...,[],-0.2294,Negative
1,/20230501/kiev-lost-over-300-soldiers-over-pas...,Kiev Lost Over 300 Soldiers Over Past 24 Hours...,Sputnik International,01/05/23,"""Over the past day, the aviation carried out s...",[330 US mercenaries with mostly Ukrainian pass...,-0.325667,Negative
2,/20230430/russia-destroys-up-to-200-tonnes-of-...,Russia Destroys Up to 200 Tons of Ukrainian Am...,Sputnik International,30/04/23,"""As a result of a strike on an echelon at a ra...",[Very soon they will only have stones to throw...,-0.14535,Negative
3,/20230430/russian-forces-discover-underground-...,Russian Forces Discover Underground Soledar Ar...,Oleg Burunov https://cdn1.img.sputnikglobe.com...,30/04/23,The Armed Forces of Ukraine failed in its effo...,[Kudos to the Russian explosive ordnance dispo...,-0.56968,Negative
4,/20230430/ukraine-loses-over-480-military-merc...,"Ukraine Loses Over 480 Military, Mercenaries i...",Sputnik International,30/04/23,"""Over the past 24 hours, over 480 Ukrainian se...",[],-0.701975,Negative


In [38]:
ds.to_parquet("sputnikWithSentiment.parquet.snappy", engine="fastparquet")