##### Social Media Analytics
### Introduction to Text Mining
## Sentiment Analysis
(c) Nuno Antonio 2019-2022 v1.02

### Initial setup

In [16]:
# Import packages
import csv
import re

import nltk
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from feel_it import SentimentClassifier
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from tqdm import tqdm

In [17]:
ds = pd.read_parquet("fattoComments.parquet.snappy", engine="fastparquet")

In [18]:
ds.head()

Unnamed: 0,link,comment
0,https://www.ilfattoquotidiano.it/2023/05/17/da...,"Articolo di una supericialita'imbarazzante,che..."
1,https://www.ilfattoquotidiano.it/2023/05/17/da...,"E’ grazie a menti come queste , che vivono n..."
2,https://www.ilfattoquotidiano.it/2023/05/17/da...,"Già fotografato, il tizio. Quando parla di res..."
3,https://www.ilfattoquotidiano.it/2023/05/17/da...,Diciamo pure che quanto a corruzione l'Ucraina...
4,https://www.ilfattoquotidiano.it/2023/05/17/da...,Oggi ci dicono che Putin è debole e malato e c...


In [19]:
ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22234 entries, 0 to 22233
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   link     22234 non-null  object
 1   comment  22234 non-null  object
dtypes: object(2)
memory usage: 347.5+ KB


In [20]:
ds = ds.dropna(subset=["comment"])

### Functions

In [21]:
# Text preprocessing
def textPreProcess(
    rawText,
    removeHTML=True,
    charsToRemove=r"\?|\.|\!|\;|\.|\"|\,|\(|\)|\&|\:|\-",
    removeNumbers=True,
    removeLineBreaks=False,
    specialCharsToRemove=r"[^\x00-\xfd]",
    convertToLower=True,
    removeConsecutiveSpaces=True,
):
    if type(rawText) != str:
        return rawText
    procText = rawText

    # Remove HTML
    if removeHTML:
        procText = BeautifulSoup(procText, "html.parser").get_text()

    # Remove punctuation and other special characters
    if len(charsToRemove) > 0:
        procText = re.sub(charsToRemove, " ", procText)

    # Remove numbers
    if removeNumbers:
        procText = re.sub(r"\d+", " ", procText)

    # Remove line breaks
    if removeLineBreaks:
        procText = procText.replace("\n", " ").replace("\r", "")

    # Remove special characters
    if len(specialCharsToRemove) > 0:
        procText = re.sub(specialCharsToRemove, " ", procText)

    # Normalize to lower case
    if convertToLower:
        procText = procText.lower()

    # Replace multiple consecutive spaces with just one space
    if removeConsecutiveSpaces:
        procText = re.sub(" +", " ", procText)

    return procText

In [22]:
# Function to break texts into sentences
def tokenize_sentences(texts):
    s_token = sent_tokenize(texts)
    return s_token

### Analysis

In [23]:
sentiment = pd.DataFrame(columns=["link", "text", "sentiment"])

In [24]:
sentiment.link = ds.link
sentiment.text = ds.comment

In [25]:
sentiment.text = sentiment.text.apply(
    textPreProcess, charsToRemove="", removeLineBreaks=False, removeNumbers=False
)



In [26]:
stop_words = set(stopwords.words("italian"))

In [27]:
def removeStop(article):
    words = article.split()
    new_string = ""
    for word in words:
        isAStopWord = True  # if we have a stop word the boolean value change and we will not add it to the string
        for stop in stop_words:
            if word == stop:
                isAStopWord = False
        if isAStopWord:
            new_string = new_string + " " + word

    return new_string

In [28]:
# Remove stopwords
tqdm.pandas()

stop_words = set(stopwords.words("italian"))

sentiment.text = sentiment.text.progress_apply(removeStop)

100%|██████████| 22234/22234 [00:11<00:00, 1916.80it/s]


In [29]:
# Create sentiment analysis object
sentiment_classifier = SentimentClassifier()

In [30]:
# Process sentiment for all sentences
tqdm.pandas()

sentiment.sentiment = sentiment.text.progress_apply(
    lambda x: sentiment_classifier.predict([x])
)

100%|██████████| 22234/22234 [45:21<00:00,  8.17it/s]     


In [31]:
ds.sentiment = sentiment.sentiment

  ds.sentiment = sentiment.sentiment


In [32]:
ds.head()

Unnamed: 0,link,comment
0,https://www.ilfattoquotidiano.it/2023/05/17/da...,"Articolo di una supericialita'imbarazzante,che..."
1,https://www.ilfattoquotidiano.it/2023/05/17/da...,"E’ grazie a menti come queste , che vivono n..."
2,https://www.ilfattoquotidiano.it/2023/05/17/da...,"Già fotografato, il tizio. Quando parla di res..."
3,https://www.ilfattoquotidiano.it/2023/05/17/da...,Diciamo pure che quanto a corruzione l'Ucraina...
4,https://www.ilfattoquotidiano.it/2023/05/17/da...,Oggi ci dicono che Putin è debole e malato e c...


In [33]:
ds.to_parquet("FattoSentimentComments.parquet.snappy", engine="fastparquet")