##### Social Media Analytics
### Introduction to Text Mining
## Sentiment Analysis
(c) Nuno Antonio 2019-2022 v1.02

### Initial setup

In [1]:
# Import packages
import csv
import pickle
import re

import nltk
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from tqdm import tqdm
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [2]:
# Load the Daily Mail dataset from a parquet file, read with the fastparquet engine
ds = pd.read_parquet("DailyMail.parquet.snappy", engine="fastparquet")

In [3]:
# Store the free-text columns with pandas' dedicated "string" dtype
for text_column in ("title", "author", "text"):
    ds[text_column] = ds[text_column].astype("string")

In [4]:
# Preview the first rows of the dataset
ds.head()

Unnamed: 0.1,Unnamed: 0,title,author,date,text,link,comments
0,0,Wave of looting ravages Russian border area am...,"Will Stewart, Christian Oliver",03/06/23,A wave of looting has reportedly hit a border ...,https://www.dailymail.co.uk/news/article-12155...,[Russians don't appear to understand that peop...
1,1,EXCLUSIVE: 'This is the beginning of the end o...,Chris Pleasance,03/06/23,"In the dawn hours of May 3, Moscovites awoke t...",https://www.dailymail.co.uk/news/article-12152...,[Putin: How does it feel when your country is ...
2,2,Naked protest at St Peter's Basilica: Man stri...,Christian Oliver,02/06/23,Visitors to St. Peter's Basilica got an unexpe...,https://www.dailymail.co.uk/news/article-12151...,"[Joe Biden needs to put some clothes on., He c..."
3,3,Russia accuses Apple of 'close cooperation' wi...,"Keith Griffith For Dailymail.com, Reuters",01/06/23,Russian security services have accused Apple o...,https://www.dailymail.co.uk/news/article-12150...,[Russia has turned into a criminal gang/mafia ...
4,4,Foster mother who 'rescued' orphans from Ukrai...,Ed Wight,01/06/23,A foster mum who rescued orphans from war-torn...,https://www.dailymail.co.uk/news/article-12148...,[I cannot believe what I have just read! I fe...


In [5]:
# Show the columns, their non-null counts and their dtypes
ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1277 entries, 0 to 1276
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  1277 non-null   int64 
 1   title       1277 non-null   string
 2   author      1277 non-null   string
 3   date        1277 non-null   object
 4   text        1277 non-null   string
 5   link        1277 non-null   object
 6   comments    1277 non-null   object
dtypes: int64(1), object(3), string(3)
memory usage: 70.0+ KB


In [6]:
# Keep only the rows that have a text (rows with a missing "text" value are dropped)
ds = ds.dropna(subset=["text"])

### Functions

In [7]:
# Text preprocessing
def textPreProcess(
    rawText,
    removeHTML=True,
    charsToRemove=r"\?|\.|\!|\;|\.|\"|\,|\(|\)|\&|\:|\-",
    removeNumbers=True,
    removeLineBreaks=False,
    specialCharsToRemove=r"[^\x00-\xfd]",
    convertToLower=True,
    removeConsecutiveSpaces=True,
):
    """Clean a text through a sequence of optional normalisation steps.

    The steps run in this order: HTML removal, punctuation removal, number
    removal, line-break removal, special-character removal, lower-casing and
    collapsing of repeated spaces.

    Parameters
    ----------
    rawText : str
        Text to clean. Any value that is not a string is returned unchanged.
    removeHTML : bool
        Parse the text with BeautifulSoup ("html.parser") and keep only its text.
    charsToRemove : str or None
        Regular expression; every match is replaced by a space. An empty
        string or None disables this step.
    removeNumbers : bool
        Replace every run of digits by a space.
    removeLineBreaks : bool
        Replace "\\n" by a space and delete "\\r".
    specialCharsToRemove : str or None
        Regular expression; every match is replaced by a space. The default
        matches characters outside the \\x00-\\xfd range. An empty string or
        None disables this step.
    convertToLower : bool
        Convert the text to lower case.
    removeConsecutiveSpaces : bool
        Collapse runs of spaces into a single space (leading/trailing spaces
        are not stripped).

    Returns
    -------
    str
        The cleaned text, or ``rawText`` itself when it is not a string.
    """
    # isinstance (rather than an exact type comparison) also accepts str
    # subclasses such as numpy.str_, which would otherwise be returned uncleaned
    if not isinstance(rawText, str):
        return rawText
    procText = rawText

    # Remove HTML
    if removeHTML:
        procText = BeautifulSoup(procText, "html.parser").get_text()

    # Remove punctuation and other special characters
    # (truthiness check so that None behaves like an empty pattern)
    if charsToRemove:
        procText = re.sub(charsToRemove, " ", procText)

    # Remove numbers
    if removeNumbers:
        procText = re.sub(r"\d+", " ", procText)

    # Remove line breaks
    if removeLineBreaks:
        procText = procText.replace("\n", " ").replace("\r", "")

    # Remove special characters
    if specialCharsToRemove:
        procText = re.sub(specialCharsToRemove, " ", procText)

    # Normalize to lower case
    if convertToLower:
        procText = procText.lower()

    # Replace multiple consecutive spaces with just one space
    if removeConsecutiveSpaces:
        procText = re.sub(" +", " ", procText)

    return procText

In [8]:
# Tokenize words
def tokenize_words(words):
    """Split a text into a list of word tokens using NLTK's ``word_tokenize``.

    Parameters
    ----------
    words : str
        Text to tokenize.

    Returns
    -------
    list of str or float
        The tokens, or ``np.nan`` when the input is not a string or yields
        no tokens at all (e.g. an empty or whitespace-only text).
    """
    if not isinstance(words, str):
        return np.nan
    # Tokenize once and test the result itself: comparing the token list
    # with "" can never be true, so that check could not detect empty input
    tokens = word_tokenize(words)
    if not tokens:
        return np.nan
    return tokens

In [9]:
# Function to create text from words
def recreateText(words):
    """Join a list of words into one space-separated string.

    Returns ``np.nan`` when ``words`` is not a list.
    """
    if type(words) is not list:
        return np.nan
    return " ".join(words)

In [10]:
# Function to break texts into sentences
def tokenize_sentences(texts):
    """Split one text into a list of sentences with NLTK's ``sent_tokenize``.

    Note: sentences that follow each other without a space after the full
    stop (e.g. "region.Homes") are not separated by the tokenizer and stay
    together in one list item.
    """
    return sent_tokenize(texts)

In [11]:
# Function to remove stop words
def removeStopWords(t, stop_words):
    """Return the words of list ``t`` that are not in ``stop_words``.

    The order of the remaining words is preserved. Returns ``np.nan`` when
    ``t`` is not a list.
    """
    if type(t) is not list:
        return np.nan
    keptWords = []
    for word in t:
        if word not in stop_words:
            keptWords.append(word)
    return keptWords

### Analysis

In [12]:
# Because a review can express multiple opinions, let's analyze opinions by sentence

# Break each review into its sentences (the result holds one list of sentences per review)
listOfSentences = ds.text.apply(tokenize_sentences)

In [13]:
# Build a one-column dataframe with the pre-processed text of each review,
# keeping punctuation, numbers and line breaks (only the remaining default steps run)
processedReviews = pd.DataFrame(
    {
        "PreProcessedText": ds.text.apply(
            textPreProcess,
            charsToRemove="",
            removeLineBreaks=False,
            removeNumbers=False,
        ).values
    },
    index=ds.index,
)

In [14]:
# Check the text of the first review (the row with index label 0)
ds.text[0]

"A wave of looting has reportedly hit a border zone in Russia as Vladimir Putin's authorities lose control of the Belgorod Oblast region.Homes and local shops appeared to be targeted by Russian looters amid the chaos as tens of\xa0thousands of Russians evacuating border areas in the Belgorod region as pro-Ukraine forces hit back at Russia.A video from the town of Shebekino showed a building ablaze after a suspected strike. Ukraine's military has not claimed responsibility for the attacks, or denied it was behind them.Shelling has pounded the settlement for several days, forcing\xa0thousands of residents to flee villages near Russia's southwestern border as shelling intensified.Incandescent over the attacks, Kremlin mouthpieces on Russia's state television called for the deployment of tactical nuclear weapons to target major Ukrainian cities.Nearby village Sobolevka - where two were killed - was heavily hit by drone attacks, according to local officials, with six wounded, including two 

In [15]:
# Sentences of the first review, as produced by tokenize_sentences
listOfSentences[0]

["A wave of looting has reportedly hit a border zone in Russia as Vladimir Putin's authorities lose control of the Belgorod Oblast region.Homes and local shops appeared to be targeted by Russian looters amid the chaos as tens of\xa0thousands of Russians evacuating border areas in the Belgorod region as pro-Ukraine forces hit back at Russia.A video from the town of Shebekino showed a building ablaze after a suspected strike.",
 "Ukraine's military has not claimed responsibility for the attacks, or denied it was behind them.Shelling has pounded the settlement for several days, forcing\xa0thousands of residents to flee villages near Russia's southwestern border as shelling intensified.Incandescent over the attacks, Kremlin mouthpieces on Russia's state television called for the deployment of tactical nuclear weapons to target major Ukrainian cities.Nearby village Sobolevka - where two were killed - was heavily hit by drone attacks, according to local officials, with six wounded, including

In [16]:
# Flatten the per-review sentence lists into one dataframe with a sentence per row
sentences = pd.DataFrame(
    {
        "BaseText": [
            sentence
            for reviewSentences in listOfSentences
            for sentence in reviewSentences
        ]
    }
)

In [17]:
# Add a column identifying the review of each sentence: every review's link is
# repeated once for each sentence it contributed
sentencesPerReview = [len(reviewSentences) for reviewSentences in listOfSentences]
sentences["link"] = np.repeat(ds["link"].values, sentencesPerReview)

In [18]:
# Preprocess each sentence with the default textPreProcess settings
sentences["PreProcessedText"] = sentences["BaseText"].apply(textPreProcess)



In [19]:
# Split each pre-processed sentence into a list of words
sentences["Words"] = sentences["PreProcessedText"].apply(tokenize_words)

In [20]:
# Drop English stop words (NLTK list) from the word list of every sentence
stop_words = set(stopwords.words("english"))
sentences["WordsCleaned"] = sentences["Words"].apply(
    lambda sentenceWords: removeStopWords(sentenceWords, stop_words)
)

In [21]:
# Rebuild each sentence as a single string from its words without stop words
sentences["ProcessedText"] = sentences["WordsCleaned"].apply(recreateText)

In [22]:
# Create the sentiment analysis object (vaderSentiment's SentimentIntensityAnalyzer)
analyser = SentimentIntensityAnalyzer()

In [23]:
# To test, let's evaluate the first sentence of the first review
# Scales:
#   compound: -1 = most extreme negative, 1 = most extreme positive
#     positive: compound >= 0.05
#     neutral: -0.05 < compound < 0.05
#     negative: compound <= -0.05
#   pos, neu, neg: proportions of the text that are positive, neutral or negative
score = analyser.polarity_scores(sentences["ProcessedText"][0])
print(sentences["ProcessedText"][0], score)

wave looting reportedly hit border zone russia vladimir putin 's authorities lose control belgorod oblast region homes local shops appeared targeted russian looters amid chaos tens thousands russians evacuating border areas belgorod region pro ukraine forces hit back russia video town shebekino showed building ablaze suspected strike {'neg': 0.186, 'neu': 0.814, 'pos': 0.0, 'compound': -0.8316}


In [25]:
# Computing the sentiment of all sentences is slow (around 40 minutes or more), so the
# previously saved results can be loaded from a pickle file instead of being recomputed.
# The loading code is disabled: it sits inside a string literal and is therefore not executed.
# NOTE(review): pickle.load can run arbitrary code, so only load a file you created yourself.
"""
open_file = open("sentences-dailymail.pkl", "rb")
sentences = pickle.load(open_file)
open_file.close()
"""

In [24]:
# Process sentiment for all sentences
# NOTE(review): the scored text is lower-cased and has punctuation and stop words
# removed; VADER's rules presumably rely on some of those cues (e.g. negations,
# capitalisation), so scoring "BaseText" may be more accurate - TODO confirm.
all_scores = []
for t in tqdm(sentences["ProcessedText"]):
    try:
        score = analyser.polarity_scores(t)
    except Exception:
        # Best effort: entries that cannot be scored (e.g. NaN placeholders for
        # sentences without words) count as neutral. Catching Exception rather than
        # using a bare except keeps KeyboardInterrupt able to stop this long loop.
        score = {"neg": 0.0, "neu": 0.0, "pos": 0.0, "compound": 0.0}
    all_scores.append(score)
# Keep only the compound score as the sentiment of each sentence
sentences["Sentiment"] = [c["compound"] for c in all_scores]

100%|██████████| 28069/28069 [55:20<00:00,  8.45it/s]  


In [25]:
# Save the scored sentences so the slow sentiment computation does not have to be repeated
file_name = "sentences-dailymail.pkl"

# The context manager closes the file even when pickling fails
with open(file_name, "wb") as open_file:
    pickle.dump(sentences, open_file)

In [26]:
# Compute each review's sentiment as the mean sentiment of its sentences
meanByReview = sentences.groupby("link")["Sentiment"].mean()

# Consider reviews whose sentences have no result as neutral (0)
meanByReview = meanByReview.fillna(0)

# Add column Sentiment to the reviews dataframe. Looking the links up with map()
# gives NaN (instead of raising a KeyError) for a review that produced no
# sentences at all, e.g. an empty text; such reviews are treated as neutral too.
ds["Sentiment"] = ds["link"].map(meanByReview).fillna(0)

In [27]:
# Assign a qualitative evaluation to the review
# Right-closed intervals: (-1.1, -0.05] Negative, (-0.05, 0.05] Neutral, (0.05, 1] Positive
# (so a mean of exactly 0.05 is labelled Neutral)
bins = pd.IntervalIndex.from_tuples(
    [(-1.1, -0.05), (-0.05, 0.05), (0.05, 1)], closed="right"
)
x = pd.cut(ds["Sentiment"].to_list(), bins)
# rename_categories replaces the interval labels; assigning to `x.categories`
# directly is no longer supported by recent pandas versions
x = x.rename_categories(["Negative", "Neutral", "Positive"])
ds["Polarity"] = x

In [28]:
# Preview the dataset with the new Sentiment and Polarity columns
ds.head()

Unnamed: 0.1,Unnamed: 0,title,author,date,text,link,comments,Sentiment,Polarity
0,0,Wave of looting ravages Russian border area am...,"Will Stewart, Christian Oliver",03/06/23,A wave of looting has reportedly hit a border ...,https://www.dailymail.co.uk/news/article-12155...,[Russians don't appear to understand that peop...,-0.269744,Negative
1,1,EXCLUSIVE: 'This is the beginning of the end o...,Chris Pleasance,03/06/23,"In the dawn hours of May 3, Moscovites awoke t...",https://www.dailymail.co.uk/news/article-12152...,[Putin: How does it feel when your country is ...,-0.207373,Negative
2,2,Naked protest at St Peter's Basilica: Man stri...,Christian Oliver,02/06/23,Visitors to St. Peter's Basilica got an unexpe...,https://www.dailymail.co.uk/news/article-12151...,"[Joe Biden needs to put some clothes on., He c...",-0.54635,Negative
3,3,Russia accuses Apple of 'close cooperation' wi...,"Keith Griffith For Dailymail.com, Reuters",01/06/23,Russian security services have accused Apple o...,https://www.dailymail.co.uk/news/article-12150...,[Russia has turned into a criminal gang/mafia ...,0.10595,Positive
4,4,Foster mother who 'rescued' orphans from Ukrai...,Ed Wight,01/06/23,A foster mum who rescued orphans from war-torn...,https://www.dailymail.co.uk/news/article-12148...,[I cannot believe what I have just read! I fe...,-0.199058,Negative


In [29]:
# Save the dataset, including Sentiment and Polarity, to a parquet file (fastparquet engine)
ds.to_parquet("dailymailWithSentiment.parquet.snappy", engine="fastparquet")