##### Social Media Analytics
### Introduction to Text Mining
## Sentiment Analysis
(c) Nuno Antonio 2019-2022 v1.02

### Initial setup

In [1]:
# Import packages
import csv
import pandas as pd
import numpy as np
import nltk 
import re
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

#nltk.download('punkt')

In [2]:
# Load dataset
ds = pd.read_parquet("sputnik.parquet.snappy", engine='fastparquet')

# Define the desired data types
#dtypes = {'title': 'category', 'author': 'category', 'date': 'datetime64[ns]', 'text': 'category', 'comments': 'object'}

# Convert columns to specified data types
ds['date'] = pd.to_datetime(ds['date'])  # Convert 'date' column to datetime

#ds = ds.astype(dtypes)

In [3]:
ds.head()

Unnamed: 0,link,title,author,date,text,comments
0,/20230501/watch-russian-army-sappers-blow-up-a...,Watch Russian Army Sappers Blow Up Abandoned Ammo,Oleg Burunov https://cdn1.img.sputnikglobe.com...,2023-01-05,The Russian Ministry of Defense (MoD) has rele...,[]
1,/20230501/kiev-lost-over-300-soldiers-over-pas...,Kiev Lost Over 300 Soldiers Over Past 24 Hours...,Sputnik International,2023-01-05,"""Over the past day, the aviation carried out s...",[330 US mercenaries with mostly Ukrainian pass...
2,/20230430/russia-destroys-up-to-200-tonnes-of-...,Russia Destroys Up to 200 Tons of Ukrainian Am...,Sputnik International,2023-04-30,"""As a result of a strike on an echelon at a ra...",[Very soon they will only have stones to throw...
3,/20230430/russian-forces-discover-underground-...,Russian Forces Discover Underground Soledar Ar...,Oleg Burunov https://cdn1.img.sputnikglobe.com...,2023-04-30,The Armed Forces of Ukraine failed in its effo...,[Kudos to the Russian explosive ordnance dispo...
4,/20230430/ukraine-loses-over-480-military-merc...,"Ukraine Loses Over 480 Military, Mercenaries i...",Sputnik International,2023-04-30,"""Over the past 24 hours, over 480 Ukrainian se...",[]


In [4]:
sample_data = ds.head(10)  # Retrieves the first 10 rows
print(sample_data)

                                                link  \
0  /20230501/watch-russian-army-sappers-blow-up-a...   
1  /20230501/kiev-lost-over-300-soldiers-over-pas...   
2  /20230430/russia-destroys-up-to-200-tonnes-of-...   
3  /20230430/russian-forces-discover-underground-...   
4  /20230430/ukraine-loses-over-480-military-merc...   
5  /20230430/watch-russian-multiple-launch-rocket...   
6  /20230430/russian-forces-hit-ukrainian-drone-c...   
7  /20230429/watch-russian-armed-forces-fire-crui...   
8  /20230429/kievs-counteroffensive-bluster-all-a...   
9  /20230429/moscow-ukraine-loses-nearly-600-mili...   

                                               title  \
0  Watch Russian Army Sappers Blow Up Abandoned Ammo   
1  Kiev Lost Over 300 Soldiers Over Past 24 Hours...   
2  Russia Destroys Up to 200 Tons of Ukrainian Am...   
3  Russian Forces Discover Underground Soledar Ar...   
4  Ukraine Loses Over 480 Military, Mercenaries i...   
5  Watch Russian Multiple Launch Rocket System 

### Functions

In [5]:
# Text preprocessing
def textPreProcess(rawText, removeHTML=True, charsToRemove = r"\?|\.|\!|\;|\.|\"|\,|\(|\)|\&|\:|\|[0-9]|--| [ ] |'s |said|says|also|according|Ukrainian|Ukraine|US|Russian|Russia|would", removeNumbers=True, removeLineBreaks=False, specialCharsToRemove = r'[^\x00-\xfd]', convertToLower=True, removeConsecutiveSpaces=True):
    if type(rawText) != str:
        return rawText
    procText = rawText
        
    # Remove HTML
    if removeHTML:
        procText = BeautifulSoup(procText,'html.parser').get_text()

    # Remove punctuation and other special characters
    if len(charsToRemove)>0:
        procText = re.sub(charsToRemove,' ',procText)

    # Remove numbers
    if removeNumbers:
        procText = re.sub(r'\d+',' ',procText)

    # Remove line breaks
    if removeLineBreaks:
        procText = procText.replace('\n',' ').replace('\r', '')

    # Remove special characters
    if len(specialCharsToRemove)>0:
        procText = re.sub(specialCharsToRemove,' ',procText)

    # Normalize to lower case
    if convertToLower:
        procText = procText.lower() 

    # Replace multiple consecutive spaces with just one space
    if removeConsecutiveSpaces:
        procText = re.sub(' +', ' ', procText)
        

    return procText

In [6]:
# Tokenize words
def tokenize_words(words):
    if (type(words) != str) or (word_tokenize(words) == ''):
        return np.nan
    else:
        return word_tokenize(words)

In [7]:
# Function to create text from words
def recreateText(words):
    if type(words) == list:
        temp_str = (' ').join(words)
        return temp_str
    else:
        return np.nan

In [8]:
# Function to break texts into sentences
def tokenize_sentences(texts):
    s_token = sent_tokenize(texts)
    return s_token

In [9]:
# Function to remove stop words
def removeStopWords(t, stop_words):
    if type(t) == list:
        return [w for w in t if not w in stop_words]
    else:
        return np.nan

### Analysis

In [10]:
# Break reviews' into a list of lists sentences
listOfSentences = ds.text.apply(tokenize_sentences)

In [11]:
# Sentences of first review
listOfSentences[0]

['The Russian Ministry of Defense (MoD) has released a video showing the work of army sappers in the zone of Moscow’s special military operation in Ukraine.In footage published on the MoD’s Telegram page, the servicemen are seen performing a controlled explosion of the projectiles that had apparently been abandoned by Ukrainian units and then detected by Russian forces in an unspecified area.The MoD quoted a demining platoon commander as saying that more than 1,000 shells have already been destroyed.',
 '"Sowing machinery will soon ride across these fields and life will return back to normal there," he added.']

In [12]:
# Create a dataframe with only the description
dsprocessedText = pd.DataFrame(data=ds.text.apply(textPreProcess,charsToRemove ='', removeLineBreaks=False, removeNumbers=False).values, index=ds.index, columns=['PreProcessedText'])

In [13]:
# Check first review
ds.text[0]

'The Russian Ministry of Defense (MoD) has released a video showing the work of army sappers in the zone of Moscow’s special military operation in Ukraine.In footage published on the MoD’s Telegram page, the servicemen are seen performing a controlled explosion of the projectiles that had apparently been abandoned by Ukrainian units and then detected by Russian forces in an unspecified area.The MoD quoted a demining platoon commander as saying that more than 1,000 shells have already been destroyed."Sowing machinery will soon ride across these fields and life will return back to normal there," he added.'

In [14]:
# Create DataFrame for sentences
#sentences = pd.DataFrame(data=[item for elem in listOfSentences for item in elem], columns=['BaseText'])
sentences = pd.DataFrame(data=[item for elem in listOfSentences for item in elem if pd.notnull(item)], columns=['BaseText'])

In [15]:
# Add a column with the review ID
sentencesPerReview = []
for elem in listOfSentences:
  sentencesPerReview.append(len(elem))
sentences['RevID'] = np.repeat(ds['RevID'].values,sentencesPerReview)

KeyError: 'RevID'

In [None]:
# Preprocess text 
sentences['PreProcessedText'] = sentences['BaseText'].apply(textPreProcess)

In [16]:
# Get words
sentences['Words'] =  sentences['PreProcessedText'].apply(tokenize_words)

In [17]:
# Remove stopwords
stop_words = set(stopwords.words('english'))
sentences['WordsCleaned'] = sentences['Words'].apply(removeStopWords,stop_words=stop_words)

In [18]:
# Recreate sentence without stopwords
sentences['ProcessedText'] = sentences['WordsCleaned'].apply(recreateText)

In [19]:
# Create sentiment analysis object
analyser = SentimentIntensityAnalyzer()

In [20]:
# To test, let's evaluate first sentence of first review
# Scales:
#   compound: -1:most extreme negative, 1:most extreme positive
#     positive: compound >=0.05
#     neutral: -0.05<compound<0.05
#     negative: compound <= -0.05
#   pos, neu, neg: proportion of text that are positive, neutral or negative
score = analyser.polarity_scores(sentences['ProcessedText'][0])
print(sentences['ProcessedText'][0],score)

hotel centrally located bars restaurants within minutes walk {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}


In [21]:
# Process sentiment for all sentences
all_scores = []
for t in (sentences['ProcessedText'][:]):
  score = analyser.polarity_scores(t)
  all_scores.append(score)
sentences['Sentiment'] = [c['compound'] for c in all_scores]

In [22]:
# Compute review's sentiment as the mean sentiment from its sentences
meanByReview = sentences.groupby('RevID')['Sentiment'].mean()

# Consider sentences with no result as neutral (0)
meanByReview = meanByReview.fillna(0)

# Add column Sentiment to reviews Dataframe
ds['Sentiment'] = meanByReview[ds['RevID']].values

In [23]:
# Assign a qualitative evaluation to the review
bins = pd.IntervalIndex.from_tuples([(-1.1, -0.05), (-0.05, 0.05), (0.05, 1)], closed='right')
x = pd.cut(ds['Sentiment'].to_list(), bins)
x.categories = ['Negative','Neutral','Positive']
ds['Polarity'] = x

In [24]:
# Analysis examples:
# Mean by hotel 
ex1 = ds.groupby('HotelID')['Sentiment'].mean().to_frame()
ex1

Unnamed: 0_level_0,Sentiment
HotelID,Unnamed: 1_level_1
1,0.361068
10,0.376767
11,0.323291
12,0.339835
13,0.369670
...,...
65,0.478573
66,0.452675
7,0.240772
8,0.339550


In [25]:
# Analysis examples:
# Mean by hotel stars and type
ex2 = ds[['HotelType','HotelStars','Sentiment']].groupby(['HotelType','HotelStars'], as_index=False).mean()
ex2

Unnamed: 0,HotelType,HotelStars,Sentiment
0,City,2,0.236023
1,City,3,0.334388
2,City,4,0.343244
3,City,5,0.324848
4,Resort,2,0.302441
5,Resort,3,0.331119
6,Resort,4,0.406917
7,Resort,5,0.369157
