# Sentiment Analysis
Based on: Social Media Analytics – Introduction to Text Mining – Sentiment Analysis

by (c) Nuno Antonio 2019-2021

### Initial setup

In [3]:
# Import packages
import csv
import pandas as pd
import numpy as np
import nltk 
import re
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer 
from emot.emo_unicode import UNICODE_EMO, EMOTICONS
import emoji

In [4]:
# Read the cleaned tweets workbook from the data folder
base_path = "Data/"
ds = pd.read_excel(f"{base_path}Tweets_cleaned.xlsx")

In [6]:
# Replace emojis and smileys

# Converting emojis to words
# Using both emot and emoji package to cover missing emojis
def convert_emojis(text):
    """Replace emoji characters in ``text`` with word descriptions.

    Two passes are applied so that emojis missing from one package can still
    be caught by the other:

    1. every key of ``UNICODE_EMO`` found in ``text`` is replaced by its
       description with commas and colons removed and words joined by ``_``;
    2. ``emoji.demojize`` is run on the result with empty delimiters.

    Parameters
    ----------
    text : str
        Text that may contain emoji characters.

    Returns
    -------
    str
        The text after both replacement passes.
    """
    # from https://towardsdatascience.com/text-preprocessing-for-data-scientist-3d2419c8199d
    for emot in UNICODE_EMO:
        text = text.replace(emot, "_".join(UNICODE_EMO[emot].replace(",","").replace(":","").split()))

    # Strings are immutable: demojize returns a new string, so the result has
    # to be assigned back, otherwise this second pass has no effect at all.
    text = emoji.demojize(text, delimiters=("", ""))
    return text
# Converting emoticons to words   
# from https://towardsdatascience.com/text-preprocessing-for-data-scientist-3d2419c8199d 
def convert_emoticons(text):
    """Replace text emoticons in ``text`` with underscore-joined descriptions.

    Each key of ``EMOTICONS`` is used as a regular-expression fragment; every
    match is substituted by the matching description with commas removed and
    its words joined by ``_``.
    """
    for pattern, description in EMOTICONS.items():
        label = "_".join(description.replace(",","").split())
        text = re.sub(u'('+pattern+')', label, text)
    return text

def remove_emoji(text):
    """Return ``text`` with every occurrence of a ``UNICODE_EMO`` key deleted."""
    stripped = text
    for symbol in UNICODE_EMO:
        stripped = stripped.replace(symbol, "")
    return stripped

def remove_emoticon(text):
    """Return ``text`` with every match of an ``EMOTICONS`` key pattern deleted."""
    stripped = text
    for pattern in EMOTICONS:
        stripped = re.sub(u'('+pattern+')', "", stripped)
    return stripped



In [5]:
# Text preprocessing
def textPreProcess(rawText, removeHTML=True, charsToRemove = r'\?|\.|\!|\;|\.|\"|\,|\(|\)|\&|\:|\-', removeNumbers=True, removeLineBreaks=False, specialCharsToRemove = r'[^\x00-\xfd]', convertToLower=True, removeConsecutiveSpaces=True, convert_emojis=False, remove_emojis = False):
    """Clean a collection of raw texts and return the cleaned strings as a list.

    The steps run in a fixed order and each one is controlled by its own
    argument: HTML stripping, emoji/emoticon conversion, emoji/emoticon
    removal, punctuation removal, number removal, line-break removal,
    special-character removal, lower-casing, collapsing repeated spaces.

    Parameters
    ----------
    rawText : sliceable collection of texts (e.g. list or pandas Series)
        Items to clean. Non-string items are reported with a printed message
        and converted with ``str()``.
    removeHTML : bool
        Parse each text with BeautifulSoup and keep only the visible text.
    charsToRemove : str
        Regular expression; every match is replaced by a space. An empty
        string disables the step.
    removeNumbers : bool
        Replace every run of digits by a space.
    removeLineBreaks : bool
        Replace newlines by a space and drop carriage returns.
    specialCharsToRemove : str
        Regular expression; every match is replaced by a space. An empty
        string disables the step.
    convertToLower : bool
        Lower-case the text.
    removeConsecutiveSpaces : bool
        Collapse runs of spaces into a single space.
    convert_emojis : bool
        Convert emojis and emoticons to words with the module-level
        ``convert_emojis`` and ``convert_emoticons`` helpers.
    remove_emojis : bool
        Delete emojis and emoticons with ``remove_emoji`` and
        ``remove_emoticon``.

    Returns
    -------
    list of str
        One cleaned string per input item, in input order.
    """
    # The boolean parameter ``convert_emojis`` hides the module-level helper of
    # the same name inside this function, so the helper is fetched from the
    # module namespace instead of being called through the boolean local.
    emojis_to_words = globals()['convert_emojis'] if convert_emojis else None

    cleanedText = []
    for x in (rawText[:]):

        if not isinstance(x, str):
            print("Type: ", str(type(x)))
            x = str(x)

        # Always start from the current item, so that skipping the HTML step
        # neither fails nor reuses the text of the previous item.
        procText = x

        # Remove HTML
        if removeHTML:
            procText = BeautifulSoup(procText,'html.parser').get_text()

        # Convert emojis and emoticons to words
        if convert_emojis:
            procText = emojis_to_words(procText)
            procText = convert_emoticons(procText)

        # Remove emojis and emoticons
        if remove_emojis:
            procText = remove_emoji(procText)
            procText = remove_emoticon(procText)

        # Remove punctuation and other special characters
        if charsToRemove:
            procText = re.sub(charsToRemove,' ',procText)

        # Remove numbers
        if removeNumbers:
            procText = re.sub(r'\d+',' ',procText)

        # Remove line breaks
        if removeLineBreaks:
            procText = procText.replace('\n',' ').replace('\r', '')

        # Remove special characters
        if specialCharsToRemove:
            procText = re.sub(specialCharsToRemove,' ',procText)

        # Normalize to lower case
        if convertToLower:
            procText = procText.lower()

        # Replace multiple consecutive spaces with just one space
        if removeConsecutiveSpaces:
            procText = re.sub(' +', ' ', procText)

        cleanedText.append(procText)

    return cleanedText

In [7]:
# Tokenize texts
def tokenize_words(texts):
    """Split every text into word tokens with ``word_tokenize``.

    Parameters
    ----------
    texts : sliceable collection of str
        Texts to tokenize.

    Returns
    -------
    list
        One tokenizer result per input text, in input order, so the output
        stays aligned with ``texts``.
    """
    # The previous ``!= ''`` guard compared the tokenizer result (a token
    # list) with a string, which is always unequal, so it never filtered
    # anything; every result is kept, exactly as before.
    return [word_tokenize(text) for text in texts[:]]

In [8]:
# Function to recreate text from words
def recreateText(words):
    """Rebuild texts from token sequences.

    Each element of ``words`` is a sequence of string tokens; the tokens are
    joined with a single space. Returns one string per element, in order.
    """
    return [' '.join(tokens) for tokens in words[:]]

In [9]:
# Function to break texts into sentences
def tokenize_sentences(texts):
    """Split every text into sentences with ``sent_tokenize``.

    Returns a list with one tokenizer result per input text, in input order.
    """
    return [sent_tokenize(document) for document in texts[:]]

In [10]:
# Function to remove stop words
def removeStopWords(texts, stop_words):
    """Drop stop words from tokenized texts.

    ``texts`` is a collection of token sequences; every token that is
    contained in ``stop_words`` is removed. Returns a list with one filtered
    token list per input sequence, in input order.
    """
    return [
        [token for token in tokens[:] if token not in stop_words]
        for tokens in texts[:]
    ]

### Analysis

In [11]:
# Build a one-column dataframe holding the preprocessed tweet text.
# Punctuation is kept and the text is not lower-cased, because both can make a difference in sentiment.
ppText = textPreProcess(
    ds.text,
    removeLineBreaks=True,
    charsToRemove="",
    removeNumbers=True,
    convertToLower=False,
    convert_emojis=False,
    remove_emojis=False,
)
processedTweets = pd.DataFrame({'PreProcessedText': ppText}, index=ds.index)

In [12]:
# Display the preprocessed tweets (the notebook renders a truncated view of the frame)
processedTweets

Unnamed: 0,PreProcessedText
0,Doesn't vaccine approval typically take and mo...
1,"You're right, too bad this vaccine doesn't hav..."
2,The vaccine is free
3,. the world needs COVID vaccine access now. Th...
4,But you can get it. I remember getting my Mump...
...,...
22529,The country need to check on people who take o...
22530,Why can you not go maskless with out vaccine? ...
22531,"You say the immunocomprimised ""should discuss ..."
22532,"Well, Kate, not everyone is vaccinated and the..."


In [13]:
# Create the VADER sentiment analyser object used to score the tweets in the cells below
analyser = SentimentIntensityAnalyzer()

In [14]:
# Sanity check: score the first preprocessed tweet and print it next to its scores
# Scales:
#   compound: -1:most extreme negative, 1:most extreme positive
#     positive: compound >=0.05
#     neutral: -0.05<compound<0.05
#     negative: compound <= -0.05
#   pos, neu, neg: proportion of text that are positive, neutral or negative
first_tweet = processedTweets['PreProcessedText'][0]
score = analyser.polarity_scores(first_tweet)
print(first_tweet, score)

Doesn't vaccine approval typically take and months? Just asking as I'm not too familiar with their process. {'neg': 0.138, 'neu': 0.862, 'pos': 0.0, 'compound': -0.3724}


In [15]:
# Score every preprocessed tweet, then store only the compound value as the tweet's sentiment
all_scores = [analyser.polarity_scores(tweet) for tweet in processedTweets['PreProcessedText'][:]]
ds['Sentiment'] = [tweet_scores['compound'] for tweet_scores in all_scores]

In [16]:
# Display the dataset, which now includes the Sentiment column added in the previous cell
ds

Unnamed: 0,text,user screen name,user followers,url,created at,replies,retweets,likes,mention,hashtag,Sentiment
0,Doesn't vaccine approval typically take 6 and ...,spadesgeek,19,https://mobile.twitter.com,2021-06-10 19:24:26.999999,0,0,0,"MatthewDavidH,EricTopol,TheEconomist,US_FDA,la...",,-0.3724
1,"You're right, too bad this vaccine doesn't hav...",selirodz,78,http://twitter.com/download/android,2021-06-10 19:24:26.000000,0,0,0,"DharkArk,JoeBiden",,-0.7353
2,The vaccine is free…❤️,FlowerGirlBaker,1486,http://twitter.com/#!/download/ipad,2021-06-10 19:23:57.000000,0,0,0,"theredshift11,POTUS,studentsfordemo",,0.5106
3,. the world needs COVID19 vaccine access ...,DevizesGreens,150,https://mobile.twitter.com,2021-06-10 19:23:49.000000,0,0,0,"BorisJohnson,JustinTrudeau,POTUS,EUCouncil,Reg...","COVID19,G7,EndThePandemic,COVAX",0.4215
4,But you can get it.\nI remember getting my Mum...,canfixstoopid,355,https://mobile.twitter.com,2021-06-10 19:23:42.000000,0,0,0,"SerendipityOr,Shockwave_Shaun,Ozymandiyaas,Joe...",,0.0000
...,...,...,...,...,...,...,...,...,...,...,...
22529,The country need to check on people who take o...,LSungoun,90,http://twitter.com/download/android,2021-05-16 04:11:06.000000,0,0,0,"POTUS,HillaryClinton,Jaemyung_Lee",,0.6898
22530,Why can you not go maskless with out vaccine? ...,bcgov115,38,http://twitter.com/download/iphone,2021-05-16 04:11:01.000000,0,0,0,POTUS,,0.0000
22531,"You say the immunocomprimised ""should discuss ...",dmdmdtweet,1,http://twitter.com/download/android,2021-05-16 04:08:22.999999,0,0,1,CDCgov,,-0.3987
22532,"Well, Kate, not everyone is vaccinated and the...",TuffCrusherPlus,83,http://twitter.com/download/iphone,2021-05-16 04:08:03.000000,0,0,0,"50treeK8,OregonGovBrown,CDCgov",,-0.4318


In [17]:
# NOTE(review): this cell only evaluates an empty list and defines nothing; looks like a leftover scratch cell, candidate for removal
[]

[]

In [18]:
# Assign a qualitative label to each tweet from its compound score, following
# the VADER thresholds: Negative <= -0.05 < Neutral < 0.05 <= Positive.
# pd.cut intervals are closed on the right, so the Neutral/Positive edge is
# placed one float step below 0.05; a score of exactly 0.05 is then labelled
# Positive, while -0.05 still falls in the Negative interval.
bins = [-1.1, -0.05, np.nextafter(0.05, -np.inf), 1]
# Passing labels to pd.cut names the categories directly, so no assignment to
# the .categories attribute of the result is needed (newer pandas rejects it).
x = pd.cut(ds['Sentiment'].to_list(), bins=bins, labels=['Negative','Neutral','Positive'])
ds['Polarity'] = x

In [19]:
# Display the dataset, which now includes the Polarity column added in the previous cell
ds

Unnamed: 0,text,user screen name,user followers,url,created at,replies,retweets,likes,mention,hashtag,Sentiment,Polarity
0,Doesn't vaccine approval typically take 6 and ...,spadesgeek,19,https://mobile.twitter.com,2021-06-10 19:24:26.999999,0,0,0,"MatthewDavidH,EricTopol,TheEconomist,US_FDA,la...",,-0.3724,Negative
1,"You're right, too bad this vaccine doesn't hav...",selirodz,78,http://twitter.com/download/android,2021-06-10 19:24:26.000000,0,0,0,"DharkArk,JoeBiden",,-0.7353,Negative
2,The vaccine is free…❤️,FlowerGirlBaker,1486,http://twitter.com/#!/download/ipad,2021-06-10 19:23:57.000000,0,0,0,"theredshift11,POTUS,studentsfordemo",,0.5106,Positive
3,. the world needs COVID19 vaccine access ...,DevizesGreens,150,https://mobile.twitter.com,2021-06-10 19:23:49.000000,0,0,0,"BorisJohnson,JustinTrudeau,POTUS,EUCouncil,Reg...","COVID19,G7,EndThePandemic,COVAX",0.4215,Positive
4,But you can get it.\nI remember getting my Mum...,canfixstoopid,355,https://mobile.twitter.com,2021-06-10 19:23:42.000000,0,0,0,"SerendipityOr,Shockwave_Shaun,Ozymandiyaas,Joe...",,0.0000,Neutral
...,...,...,...,...,...,...,...,...,...,...,...,...
22529,The country need to check on people who take o...,LSungoun,90,http://twitter.com/download/android,2021-05-16 04:11:06.000000,0,0,0,"POTUS,HillaryClinton,Jaemyung_Lee",,0.6898,Positive
22530,Why can you not go maskless with out vaccine? ...,bcgov115,38,http://twitter.com/download/iphone,2021-05-16 04:11:01.000000,0,0,0,POTUS,,0.0000,Neutral
22531,"You say the immunocomprimised ""should discuss ...",dmdmdtweet,1,http://twitter.com/download/android,2021-05-16 04:08:22.999999,0,0,1,CDCgov,,-0.3987,Negative
22532,"Well, Kate, not everyone is vaccinated and the...",TuffCrusherPlus,83,http://twitter.com/download/iphone,2021-05-16 04:08:03.000000,0,0,0,"50treeK8,OregonGovBrown,CDCgov",,-0.4318,Negative


In [20]:
# Save the dataset with its sentiment columns to a new workbook in the data folder (row index not written)
ds.to_excel(f"{base_path}Tweets_Sentiment.xlsx", index=False)