##### Social Media Analytics
### Introduction to Text Mining
## Sentiment Analysis
(c) Nuno Antonio 2019-2022 v1.02

### Initial setup

In [1]:
# Import packages
import csv
import pickle
import re

import nltk
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from deep_translator import GoogleTranslator
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from tqdm import tqdm
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [2]:
ds = pd.read_parquet("sputnikComments.parquet.snappy", engine="fastparquet")

In [4]:
ds.head()

Unnamed: 0,link,comment
0,/20230501/kiev-lost-over-300-soldiers-over-pas...,330 US mercenaries with mostly Ukrainian passp...
1,/20230501/kiev-lost-over-300-soldiers-over-pas...,"330?? figures are coming down, supposed to be ..."
2,/20230430/russia-destroys-up-to-200-tonnes-of-...,Very soon they will only have stones to throw ...
3,/20230430/russia-destroys-up-to-200-tonnes-of-...,why destroy it when you can put it to good use...
4,/20230430/russian-forces-discover-underground-...,Kudos to the Russian explosive ordnance dispos...


In [5]:
ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1045 entries, 0 to 1044
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   link     1045 non-null   object
 1   comment  1045 non-null   object
dtypes: object(2)
memory usage: 16.5+ KB


### Functions

In [6]:
# Text preprocessing
def textPreProcess(
    rawText,
    removeHTML=True,
    charsToRemove=r"\?|\.|\!|\;|\.|\"|\,|\(|\)|\&|\:|\-",
    removeNumbers=True,
    removeLineBreaks=False,
    specialCharsToRemove=r"[^\x00-\xfd]",
    convertToLower=True,
    removeConsecutiveSpaces=True,
):
    if type(rawText) != str:
        return rawText
    procText = rawText

    # Remove HTML
    if removeHTML:
        procText = BeautifulSoup(procText, "html.parser").get_text()

    # Remove punctuation and other special characters
    if len(charsToRemove) > 0:
        procText = re.sub(charsToRemove, " ", procText)

    # Remove numbers
    if removeNumbers:
        procText = re.sub(r"\d+", " ", procText)

    # Remove line breaks
    if removeLineBreaks:
        procText = procText.replace("\n", " ").replace("\r", "")

    # Remove special characters
    if len(specialCharsToRemove) > 0:
        procText = re.sub(specialCharsToRemove, " ", procText)

    # Normalize to lower case
    if convertToLower:
        procText = procText.lower()

    # Replace multiple consecutive spaces with just one space
    if removeConsecutiveSpaces:
        procText = re.sub(" +", " ", procText)

    return procText

In [7]:
# Tokenize words
def tokenize_words(words):
    if (type(words) != str) or (word_tokenize(words) == ""):
        return np.nan
    else:
        return word_tokenize(words)

In [8]:
# Function to create text from words
def recreateText(words):
    if type(words) == list:
        temp_str = (" ").join(words)
        return temp_str
    else:
        return np.nan

In [9]:
# Function to break texts into sentences
def tokenize_sentences(texts):
    s_token = sent_tokenize(texts)
    return s_token

In [10]:
# Function to remove stop words
def removeStopWords(t, stop_words):
    if type(t) == list:
        return [w for w in t if not w in stop_words]
    else:
        return np.nan

### Analysis

In [30]:
# Because a review can express multiple opinions, let's analyze opinions by sentence

# Break reviews' into a list of lists sentencesc
listOfSentences = ds.comment.apply(tokenize_sentences)

In [31]:
# Create a dataframe with only the description
processedReviews = pd.DataFrame(
    data=ds.comment.apply(
        textPreProcess, charsToRemove="", removeLineBreaks=False, removeNumbers=False
    ).values,
    index=ds.index,
    columns=["PreProcessedText"],
)

In [32]:
# Check first review
ds.comment[2]

'Very soon they will only have stones to throw at the advancing Russians.'

In [33]:
# Sentences of first review
listOfSentences[2]

['Very soon they will only have stones to throw at the advancing Russians.']

In [34]:
# Create DataFrame for sentences
sentences = pd.DataFrame(
    data=[item for elem in listOfSentences for item in elem], columns=["BaseText"]
)

In [35]:
# Add a column with the review ID
sentencesPerReview = []
for elem in listOfSentences:
    sentencesPerReview.append(len(elem))
sentences["link"] = np.repeat(ds["link"].values, sentencesPerReview)

In [36]:
# Preprocess text
sentences["PreProcessedText"] = sentences["BaseText"].apply(textPreProcess)

In [37]:
# Get words
sentences["Words"] = sentences["PreProcessedText"].apply(tokenize_words)

In [38]:
# Remove stopwords
stop_words = set(stopwords.words("english"))
sentences["WordsCleaned"] = sentences["Words"].apply(
    removeStopWords, stop_words=stop_words
)

In [39]:
# Recreate sentence without stopwords
sentences["ProcessedText"] = sentences["WordsCleaned"].apply(recreateText)

In [40]:
# Create sentiment analysis object
analyser = SentimentIntensityAnalyzer()

In [41]:
# To test, let's evaluate first sentence of first review
# Scales:
#   compound: -1:most extreme negative, 1:most extreme positive
#     positive: compound >=0.05
#     neutral: -0.05<compound<0.05
#     negative: compound <= -0.05
#   pos, neu, neg: proportion of text that are positive, neutral or negative
score = analyser.polarity_scores(sentences["ProcessedText"][0])
print(sentences["ProcessedText"][0], score)

us mercenaries mostly ukrainian passports {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}


In [42]:
# Computing sentences sentiment requires around 40 minutes, using this file we have the saved results.
"""
open_file = open("comments-sputnik.pkl", "rb")
sentences = pickle.load(open_file)
open_file.close()
"""

'\nopen_file = open("sentences-sputnik.pkl", "rb")\nsentences = pickle.load(open_file)\nopen_file.close()\n'

In [43]:
# Process sentiment for all sentences
all_scores = []
for t in tqdm(sentences["ProcessedText"]):
    try:
        score = analyser.polarity_scores(t)
        all_scores.append(score)
    except:
        all_scores.append(dict({"neg": 0.0, "neu": 0.0, "pos": 0.0, "compound": 0.0}))
sentences["Sentiment"] = [c["compound"] for c in all_scores]

100%|██████████| 2273/2273 [04:44<00:00,  7.98it/s]


In [46]:
file_name = "comments-sputnik.pkl"

open_file = open(file_name, "wb")
pickle.dump(sentences, open_file)
open_file.close()

In [47]:
# Compute review's sentiment as the mean sentiment from its sentences
meanByReview = sentences.groupby("link")["Sentiment"].mean()

# Consider sentences with no result as neutral (0)
meanByReview = meanByReview.fillna(0)

# Add column Sentiment to reviews Dataframe
ds["Sentiment"] = meanByReview[ds["link"]].values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ds["Sentiment"] = meanByReview[ds["link"]].values


In [48]:
# Assign a qualitative evaluation to the review
bins = pd.IntervalIndex.from_tuples(
    [(-1.1, -0.05), (-0.05, 0.05), (0.05, 1)], closed="right"
)
x = pd.cut(ds["Sentiment"].to_list(), bins)
x.categories = ["Negative", "Neutral", "Positive"]
ds["Polarity"] = x

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ds["Polarity"] = x


In [49]:
ds.head()

Unnamed: 0,link,comment,Sentiment,Polarity
0,/20230501/kiev-lost-over-300-soldiers-over-pas...,330 US mercenaries with mostly Ukrainian passp...,-0.14985,Negative
1,/20230501/kiev-lost-over-300-soldiers-over-pas...,"330?? figures are coming down, supposed to be ...",-0.14985,Negative
2,/20230430/russia-destroys-up-to-200-tonnes-of-...,Very soon they will only have stones to throw ...,-0.31845,Negative
3,/20230430/russia-destroys-up-to-200-tonnes-of-...,why destroy it when you can put it to good use...,-0.31845,Negative
4,/20230430/russian-forces-discover-underground-...,Kudos to the Russian explosive ordnance dispos...,0.1319,Positive


In [50]:
ds.to_parquet("sputnikSentimentComment.parquet.snappy", engine="fastparquet")