##### Social Media Analytics
### Introduction to Text Mining
## Sentiment Analysis
(c) Nuno Antonio 2019-2022 v1.02

### Initial setup

In [1]:
# Import packages
import csv
import pickle
import re

import nltk
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from deep_translator import GoogleTranslator
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from tqdm import tqdm
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [2]:
ds = pd.read_parquet("DailyMailComments.parquet.snappy", engine="fastparquet")

In [3]:
ds.head()

Unnamed: 0,link,comment
0,https://www.dailymail.co.uk/news/article-12155...,Russians don't appear to understand that peopl...
1,https://www.dailymail.co.uk/news/article-12155...,The Russians dont like it up em do they? They ...
2,https://www.dailymail.co.uk/news/article-12155...,"well, russians wanted war."
3,https://www.dailymail.co.uk/news/article-12155...,wait till the war gets to Moscow ....
4,https://www.dailymail.co.uk/news/article-12155...,Russians hate Britain\nAnd what they particula...


In [4]:
ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12029 entries, 0 to 12028
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   link     12029 non-null  object
 1   comment  12029 non-null  object
dtypes: object(2)
memory usage: 188.1+ KB


### Functions

In [5]:
# Text preprocessing
def textPreProcess(
    rawText,
    removeHTML=True,
    charsToRemove=r"\?|\.|\!|\;|\.|\"|\,|\(|\)|\&|\:|\-",
    removeNumbers=True,
    removeLineBreaks=False,
    specialCharsToRemove=r"[^\x00-\xfd]",
    convertToLower=True,
    removeConsecutiveSpaces=True,
):
    if type(rawText) != str:
        return rawText
    procText = rawText

    # Remove HTML
    if removeHTML:
        procText = BeautifulSoup(procText, "html.parser").get_text()

    # Remove punctuation and other special characters
    if len(charsToRemove) > 0:
        procText = re.sub(charsToRemove, " ", procText)

    # Remove numbers
    if removeNumbers:
        procText = re.sub(r"\d+", " ", procText)

    # Remove line breaks
    if removeLineBreaks:
        procText = procText.replace("\n", " ").replace("\r", "")

    # Remove special characters
    if len(specialCharsToRemove) > 0:
        procText = re.sub(specialCharsToRemove, " ", procText)

    # Normalize to lower case
    if convertToLower:
        procText = procText.lower()

    # Replace multiple consecutive spaces with just one space
    if removeConsecutiveSpaces:
        procText = re.sub(" +", " ", procText)

    return procText

In [6]:
# Tokenize words
def tokenize_words(words):
    if (type(words) != str) or (word_tokenize(words) == ""):
        return np.nan
    else:
        return word_tokenize(words)

In [7]:
# Function to create text from words
def recreateText(words):
    if type(words) == list:
        temp_str = (" ").join(words)
        return temp_str
    else:
        return np.nan

In [8]:
# Function to break texts into sentences
def tokenize_sentences(texts):
    s_token = sent_tokenize(texts)
    return s_token

In [9]:
# Function to remove stop words
def removeStopWords(t, stop_words):
    if type(t) == list:
        return [w for w in t if not w in stop_words]
    else:
        return np.nan

### Analysis

In [10]:
# Because a review can express multiple opinions, let's analyze opinions by sentence

# Break reviews' into a list of lists sentencesc
listOfSentences = ds.comment.apply(tokenize_sentences)

In [11]:
# Create a dataframe with only the description
processedReviews = pd.DataFrame(
    data=ds.comment.apply(
        textPreProcess, charsToRemove="", removeLineBreaks=False, removeNumbers=False
    ).values,
    index=ds.index,
    columns=["PreProcessedText"],
)



In [12]:
# Check first review
ds.comment[2]

'well, russians wanted war.'

In [13]:
# Sentences of first review
listOfSentences[2]

['well, russians wanted war.']

In [14]:
# Create DataFrame for sentences
sentences = pd.DataFrame(
    data=[item for elem in listOfSentences for item in elem], columns=["BaseText"]
)

In [15]:
# Add a column with the review ID
sentencesPerReview = []
for elem in listOfSentences:
    sentencesPerReview.append(len(elem))
sentences["link"] = np.repeat(ds["link"].values, sentencesPerReview)

In [16]:
# Preprocess text
sentences["PreProcessedText"] = sentences["BaseText"].apply(textPreProcess)

In [17]:
# Get words
sentences["Words"] = sentences["PreProcessedText"].apply(tokenize_words)

In [18]:
# Remove stopwords
stop_words = set(stopwords.words("english"))
sentences["WordsCleaned"] = sentences["Words"].apply(
    removeStopWords, stop_words=stop_words
)

In [19]:
# Recreate sentence without stopwords
sentences["ProcessedText"] = sentences["WordsCleaned"].apply(recreateText)

In [20]:
# Create sentiment analysis object
analyser = SentimentIntensityAnalyzer()

In [21]:
# To test, let's evaluate first sentence of first review
# Scales:
#   compound: -1:most extreme negative, 1:most extreme positive
#     positive: compound >=0.05
#     neutral: -0.05<compound<0.05
#     negative: compound <= -0.05
#   pos, neu, neg: proportion of text that are positive, neutral or negative
score = analyser.polarity_scores(sentences["ProcessedText"][0])
print(sentences["ProcessedText"][0], score)

russians n't appear understand people tend retaliate attacked {'neg': 0.3, 'neu': 0.7, 'pos': 0.0, 'compound': -0.4588}


In [22]:
# Computing sentences sentiment requires around 40 minutes, using this file we have the saved results.
"""
open_file = open("sentimentCommentsdaily.pkl", "rb")
sentences = pickle.load(open_file)
open_file.close()
"""

'\nopen_file = open("comments-dailymail.pkl", "rb")\nsentences = pickle.load(open_file)\nopen_file.close()\n'

In [23]:
# Process sentiment for all sentences
all_scores = []
for t in tqdm(sentences["ProcessedText"]):
    try:
        score = analyser.polarity_scores(t)
        all_scores.append(score)
    except:
        all_scores.append(dict({"neg": 0.0, "neu": 0.0, "pos": 0.0, "compound": 0.0}))
sentences["Sentiment"] = [c["compound"] for c in all_scores]

100%|██████████| 22386/22386 [50:26<00:00,  7.40it/s]  


In [24]:
file_name = "sentimentCommentsdaily.pkl"

open_file = open(file_name, "wb")
pickle.dump(sentences, open_file)
open_file.close()

In [25]:
# Compute review's sentiment as the mean sentiment from its sentences
meanByReview = sentences.groupby("link")["Sentiment"].mean()

# Consider sentences with no result as neutral (0)
meanByReview = meanByReview.fillna(0)

# Add column Sentiment to reviews Dataframe
ds["Sentiment"] = meanByReview[ds["link"]].values

In [26]:
# Assign a qualitative evaluation to the review
bins = pd.IntervalIndex.from_tuples(
    [(-1.1, -0.05), (-0.05, 0.05), (0.05, 1)], closed="right"
)
x = pd.cut(ds["Sentiment"].to_list(), bins)
x.categories = ["Negative", "Neutral", "Positive"]
ds["Polarity"] = x

In [27]:
ds.head()

Unnamed: 0,link,comment,Sentiment,Polarity
0,https://www.dailymail.co.uk/news/article-12155...,Russians don't appear to understand that peopl...,-0.183453,Negative
1,https://www.dailymail.co.uk/news/article-12155...,The Russians dont like it up em do they? They ...,-0.183453,Negative
2,https://www.dailymail.co.uk/news/article-12155...,"well, russians wanted war.",-0.183453,Negative
3,https://www.dailymail.co.uk/news/article-12155...,wait till the war gets to Moscow ....,-0.183453,Negative
4,https://www.dailymail.co.uk/news/article-12155...,Russians hate Britain\nAnd what they particula...,-0.183453,Negative


In [28]:
ds.to_parquet("dailyMailSentimentComment.parquet.snappy", engine="fastparquet")