##### Social Media Analytics
### Introduction to Text Mining
## Sentiment Analysis
(c) Nuno Antonio 2019-2022 v1.02

### Initial setup

In [39]:
# Import packages
import csv
import pandas as pd
import numpy as np
import nltk 
import re
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer 

In [47]:
# Load dataset
ds = pd.read_csv("CNNArticles.csv", sep=",",
                 error_bad_lines=False,
                 parse_dates=['date'])




  ds = pd.read_csv("CNNArticles.csv", sep=",",


In [48]:
ds.head()

Unnamed: 0.1,Unnamed: 0,title,author,date,text
0,0,Russia's war in Ukraine,"['Jessie Yeung', 'Sana Noor Haq', 'Ivana Kotta...",2023-03-05,US Ambassador to Russia Lynne Tracy visited Pa...
1,1,What we know about the murky drone attack on t...,"['Rob Picheta', 'Anna Chernova', 'Allegra Good...",2023-04-05,The tight ring of security that surrounds the ...
2,2,How the Kremlin drone attack hands Russia an o...,['Jill Dougherty'],2023-04-05,"At first glance, it looks like a sci-fi movie...."
3,3,Wave of Russian attacks on Kyiv worst in a yea...,"['Josh Pennington ', 'Olga Voitovych', 'Helen ...",2023-04-05,Russia unleashed its worst attacks on Kyiv in ...
4,4,"5 things to know for May 4: Atlanta shooting, ...",['Alexandra Meeks'],2023-04-05,Thousands of people are planning to line the s...


### Functions

In [49]:
# Text preprocessing
def textPreProcess(rawText, removeHTML=True, charsToRemove = r'\?|\.|\!|\;|\.|\"|\,|\(|\)|\&|\:|\|[0-9]|--| [ ] ', removeNumbers=True, removeLineBreaks=False, specialCharsToRemove = r'[^\x00-\xfd]', convertToLower=True, removeConsecutiveSpaces=True):
    if type(rawText) != str:
        return rawText
    procText = rawText
        
    # Remove HTML
    if removeHTML:
        procText = BeautifulSoup(procText,'html.parser').get_text()

    # Remove punctuation and other special characters
    if len(charsToRemove)>0:
        procText = re.sub(charsToRemove,' ',procText)

    # Remove numbers
    if removeNumbers:
        procText = re.sub(r'\d+',' ',procText)

    # Remove line breaks
    if removeLineBreaks:
        procText = procText.replace('\n',' ').replace('\r', '')

    # Remove special characters
    if len(specialCharsToRemove)>0:
        procText = re.sub(specialCharsToRemove,' ',procText)

    # Normalize to lower case
    if convertToLower:
        procText = procText.lower() 

    # Replace multiple consecutive spaces with just one space
    if removeConsecutiveSpaces:
        procText = re.sub(' +', ' ', procText)

    return procText

In [50]:
# Tokenize words
def tokenize_words(words):
    if (type(words) != str) or (word_tokenize(words) == ''):
        return np.nan
    else:
        return word_tokenize(words)

In [29]:
# Function to create text from words
def recreateText(words):
    if type(words) == list:
        temp_str = (' ').join(words)
        return temp_str
    else:
        return np.nan

In [30]:
# Function to break texts into sentences
def tokenize_sentences(texts):
    s_token = sent_tokenize(texts)
    return s_token

In [31]:
# Function to remove stop words
def removeStopWords(t, stop_words):
    if type(t) == list:
        return [w for w in t if not w in stop_words]
    else:
        return np.nan

### Analysis

In [32]:
# Because a review can express multiple opinions, let's analyze opinions by sentence

# Break a text into a list of lists sentences
#listOfSentences = ds.text.apply(tokenize_sentences)
#listOfSentences = ds.text.apply(lambda x: tokenize_sentences(x))
#listOfSentences = ds.text.apply(lambda x: [tokenize_sentences(sentence) for sentence in x])
listOfSentences = ds.text.apply(tokenize_sentences)

In [33]:
# Create a dataframe with only the description
dsprocessedText = pd.DataFrame(data=ds.text.apply(textPreProcess,charsToRemove ='', removeLineBreaks=False, removeNumbers=False).values, index=ds.index, columns=['PreProcessedText'])

In [34]:
# Check first review
ds.text[0]

'US Ambassador to Russia Lynne Tracy visited Paul Whelan on Thursday— her first visit to the detained American since taking up the post in Moscow earlier this year.  "His release remains an absolute priority," the US Embassy in Moscow said on Twitter.\xa0 Whelan is serving out his prison sentence at a prison camp in Mordovia, an eight-hour drive from Moscow. Background on Whelan\'s case: The American citizen, who also holds Irish, British and Canadian citizenship, was detained in Russia in December 2018 and later sentenced to 16 years in prison on an espionage charge, which he strongly denies.\xa0 In an interview with CNN in December, Whelan described the prison camp\xa0as "better than most in Russia because it\'s mostly foreigners held here, but the conditions are extremely bad." Although Thursday was Tracy\'s first in-person visit, she has spoken by phone with Whelan in the past. The US government was unable to secure Whelan\'s release last year when they brought home two other wrong

In [35]:
# Sentences of first review
listOfSentences[0]

['US Ambassador to Russia Lynne Tracy visited Paul Whelan on Thursday— her first visit to the detained American since taking up the post in Moscow earlier this year.',
 '"His release remains an absolute priority," the US Embassy in Moscow said on Twitter.',
 'Whelan is serving out his prison sentence at a prison camp in Mordovia, an eight-hour drive from Moscow.',
 "Background on Whelan's case: The American citizen, who also holds Irish, British and Canadian citizenship, was detained in Russia in December 2018 and later sentenced to 16 years in prison on an espionage charge, which he strongly denies.",
 'In an interview with CNN in December, Whelan described the prison camp\xa0as "better than most in Russia because it\'s mostly foreigners held here, but the conditions are extremely bad."',
 "Although Thursday was Tracy's first in-person visit, she has spoken by phone with Whelan in the past.",
 "The US government was unable to secure Whelan's release last year when they brought home tw

In [36]:
# Create DataFrame for sentences
sentences = pd.DataFrame(data=[item for elem in listOfSentences for item in elem], columns=['BaseText'])

In [37]:
# Add a column with the review ID

sentencesPerReview = []
for elem in listOfSentences:
    sentencesPerReview.append(len(elem))
sentences['Unnamed:0'] = np.repeat(ds['Unnamed:0'].values, sentencesPerReview)

KeyError: 'Unnamed:0'

In [15]:
# Preprocess text 
sentences['PreProcessedText'] = sentences['BaseText'].apply(textPreProcess)



In [16]:
# Get words
sentences['Words'] =  sentences['PreProcessedText'].apply(tokenize_words)

In [17]:
# Remove stopwords
stop_words = set(stopwords.words('english'))
sentences['WordsCleaned'] = sentences['Words'].apply(removeStopWords,stop_words=stop_words)

In [18]:
# Recreate sentence without stopwords
sentences['ProcessedText'] = sentences['WordsCleaned'].apply(recreateText)

In [19]:
# Create sentiment analysis object
analyser = SentimentIntensityAnalyzer()

In [20]:
# To test, let's evaluate first sentence of first review
# Scales:
#   compound: -1:most extreme negative, 1:most extreme positive
#     positive: compound >=0.05
#     neutral: -0.05<compound<0.05
#     negative: compound <= -0.05
#   pos, neu, neg: proportion of text that are positive, neutral or negative
score = analyser.polarity_scores(sentences['ProcessedText'][0])
print(sentences['ProcessedText'][0],score)

hotel centrally located bars restaurants within minutes walk {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}


In [21]:
# Process sentiment for all sentences
all_scores = []
for t in (sentences['ProcessedText'][:]):
  score = analyser.polarity_scores(t)
  all_scores.append(score)
sentences['Sentiment'] = [c['compound'] for c in all_scores]

In [22]:
# Compute review's sentiment as the mean sentiment from its sentences
meanByReview = sentences.groupby('RevID')['Sentiment'].mean()

# Consider sentences with no result as neutral (0)
meanByReview = meanByReview.fillna(0)

# Add column Sentiment to reviews Dataframe
ds['Sentiment'] = meanByReview[ds['RevID']].values

In [23]:
# Assign a qualitative evaluation to the review
bins = pd.IntervalIndex.from_tuples([(-1.1, -0.05), (-0.05, 0.05), (0.05, 1)], closed='right')
x = pd.cut(ds['Sentiment'].to_list(), bins)
x.categories = ['Negative','Neutral','Positive']
ds['Polarity'] = x

In [24]:
# Analysis examples:
# Mean by hotel 
ex1 = ds.groupby('HotelID')['Sentiment'].mean().to_frame()
ex1

Unnamed: 0_level_0,Sentiment
HotelID,Unnamed: 1_level_1
1,0.361068
10,0.376767
11,0.323291
12,0.339835
13,0.369670
...,...
65,0.478573
66,0.452675
7,0.240772
8,0.339550


In [25]:
# Analysis examples:
# Mean by hotel stars and type
ex2 = ds[['HotelType','HotelStars','Sentiment']].groupby(['HotelType','HotelStars'], as_index=False).mean()
ex2

Unnamed: 0,HotelType,HotelStars,Sentiment
0,City,2,0.236023
1,City,3,0.334388
2,City,4,0.343244
3,City,5,0.324848
4,Resort,2,0.302441
5,Resort,3,0.331119
6,Resort,4,0.406917
7,Resort,5,0.369157
