# Lexicons

In [None]:
library(tidytext)
library(tm)
library(dplyr)
library(textdata)
library("ggplot2")

In [None]:
# Load functions from a file
source(here::here('functions.R'))

In [None]:
# Fetch the Bing sentiment lexicon through tidytext.
# Other lexicon names that can be requested: "afinn", "nrc", "loughran"
bing <- get_sentiments("bing")
head(bing, n = 6L)

In [None]:
# Word whose polarity we want to look up
term <- "good"

# Select the lexicon rows for that word and show only their sentiment label
bing[bing[["word"]] == term, "sentiment"]

# News sentiment analysis

In [None]:
# Read the news sample into a data frame and preview its first two rows
news <- read.csv(file = "data/NewsSample.csv")
head(news, n = 2)

In [None]:
# Custom stop words passed to the cleaning step
stops <- c("and", "or", "but", "not", "is", "are", "the", "a", "an",
           "this", "that", "it", "its", "he", "she", "they", "them",
           "we", "us", "you", "your")

# Load and organize the news data into a term-frequency document-term matrix
# built from the `text` column.
# NOTE(review): cleanMatrix() is not defined in this file; presumably it comes
# from functions.R -- confirm the meaning of its arguments there.
txtDTM <- cleanMatrix("data/NewsSample.csv",
                      "text",
                      collapse        = FALSE,
                      customStopwords = stops,
                      type            = "DTM",
                      wgt             = "weightTf")

# Turn to a matrix & examine
txtDTMmatrix <- as.matrix(txtDTM)
# Preview at most the first 10 term columns: a hard-coded 1:10 raises
# "subscript out of bounds" when the vocabulary has fewer than 10 terms.
txtDTMmatrix[, seq_len(min(10L, ncol(txtDTMmatrix))), drop = FALSE]
dim(txtDTMmatrix)

In [None]:
# Reshape the document-term matrix into a tidy (long) table
tidyCorp <- txtDTM %>% tidy()
tidyCorp
# Number of rows and columns of the tidy table
c(nrow(tidyCorp), ncol(tidyCorp))

In [None]:
# Inner join: keep only the corpus terms that also occur in the Bing lexicon,
# matching the corpus column `term` against the lexicon column `word`
bingSent <- tidyCorp %>%
  inner_join(bing, by = c(term = "word"))
bingSent

In [None]:
# Bar chart of sentiment for each document (one facet per document).
# `weight = count` makes each bar the summed term frequency, so the bars agree
# with the totals reported by aggregate(count ~ sentiment, bingSent, sum).
# Without a weight, geom_bar() only counts rows, i.e. distinct matched terms.
ggplot(bingSent, aes(x = sentiment, weight = count)) +
  geom_bar() +
  facet_wrap(~document) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(title = "Sentiment Analysis of News Sample",
       x = "Sentiment",
       y = "Count")

In [None]:
# Total term count per sentiment class, summed over all documents
aggregate(count ~ sentiment, data = bingSent, FUN = sum)

In [None]:
# Final sentiment / our custom polarity
# (positive - negative) / (positive + negative)

# Task: fill in the numbers


# Calculate sentiment per document

In [None]:
# Summarise bingSent by document: summed positive and negative term counts,
# plus the custom polarity (positive - negative) / (positive + negative)
bingSentDoc <- bingSent %>%
  group_by(document) %>%
  summarise(pos = sum(count[sentiment == "positive"]),
            neg = sum(count[sentiment == "negative"]),
            net = (pos - neg) / (pos + neg))

# Convert the document id to a numeric value so sorting is numeric, not
# alphabetical, then sort bingSentDoc by document
bingSentDoc$document <- as.numeric(bingSentDoc$document)
bingSentDoc <- bingSentDoc[order(bingSentDoc$document), ]

# Add sentiment values to the news dataset, aligned by document id instead of
# by position: a document without any Bing term has no row in bingSentDoc, so
# a positional assignment would fail on the length mismatch.
# NOTE(review): assumes document ids are the row numbers of `news` -- confirm
# against cleanMatrix().
docRow <- match(seq_len(nrow(news)), bingSentDoc$document)
news$bingpos <- bingSentDoc$pos[docRow]
news$bingneg <- bingSentDoc$neg[docRow]
# Documents with no lexicon match have zero positive and zero negative terms
news$bingpos[is.na(news$bingpos)] <- 0
news$bingneg[is.na(news$bingneg)] <- 0
# Polarity is undefined when no sentiment term was found; report NA, not 0/0
bingTotal <- news$bingpos + news$bingneg
news$bingsent <- ifelse(bingTotal > 0,
                        (news$bingpos - news$bingneg) / bingTotal,
                        NA_real_)

# Check the results (second column left out of the preview)
head(news[, -2])

# AFINN Sentiment Lexicon

In [None]:
# AFINN lexicon, read from a local CSV copy.
# Alternative source via tidytext:
# afinn<-get_sentiments(lexicon = c("afinn"))
afinn <- read.csv(file = "data/afinn.csv")
head(afinn, n = 6L)

In [None]:
# Inner join: keep only the corpus terms that also occur in the AFINN lexicon,
# matching the corpus column `term` against the lexicon column `word`
afinnSent <- tidyCorp %>%
  inner_join(afinn, by = c(term = "word"))
afinnSent

In [None]:
# Bar chart of AFINN values for each document (one facet per document).
# `weight = count` makes each bar the summed term frequency, so the bars agree
# with the totals reported by aggregate(count ~ value, afinnSent, sum).
# Without a weight, geom_bar() only counts rows, i.e. distinct matched terms.
ggplot(afinnSent, aes(x = value, weight = count)) +
  geom_bar() +
  facet_wrap(~document) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(title = "Sentiment Analysis of News Sample",
       x = "Sentiment",
       y = "Count")

In [None]:
# Total term count per AFINN value, summed over all documents
aggregate(count ~ value, data = afinnSent, FUN = sum)

In [None]:
# VADER lexicon

In [None]:
library(vader)

In [None]:
# Apply the VADER lexicon to every news text.
# Flags are spelled TRUE/FALSE because T and F are ordinary variables that can
# be reassigned; `<-` is used for assignment like the rest of the notebook.
vader_sent <- vader_df(news$text, incl_nt = TRUE, neu_set = TRUE, rm_qm = FALSE)

In [None]:
# Preview the first two rows of the VADER output
head(vader_sent, n = 2)

In [None]:
# Copy the VADER scores onto the news data frame
news[["vaderpos"]] <- vader_sent[["pos"]]
news[["vaderneg"]] <- vader_sent[["neg"]]
news[["vadercompound"]] <- vader_sent[["compound"]]
# Custom polarity: (positive - negative) / (positive + negative)
news[["vadersent"]] <- with(news, (vaderpos - vaderneg) / (vaderpos + vaderneg))

In [None]:
# Distribution of the custom VADER polarity across articles (bins of 0.1)
ggplot(data = news, mapping = aes(x = vadersent)) +
  geom_histogram(binwidth = 0.1) +
  ggtitle("Histogram of VADER Sentiment") +
  xlab("Sentiment") +
  ylab("Count") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

In [None]:
# Distribution of the VADER compound score across articles (bins of 0.1)
ggplot(data = news, mapping = aes(x = vadercompound)) +
  geom_histogram(binwidth = 0.1) +
  ggtitle("Histogram of VADER compound Sentiment") +
  xlab("Sentiment") +
  ylab("Count") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

In [None]:
# Distribution of the Bing polarity across articles (bins of 0.1)
ggplot(data = news, mapping = aes(x = bingsent)) +
  geom_histogram(binwidth = 0.1) +
  ggtitle("Histogram of Bing Sentiment") +
  xlab("Sentiment") +
  ylab("Count") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Emotions  
# NRC Word-Emotion Association Lexicon (aka EmoLex)

In [None]:
library(lexicon)
library(tidyr)

In [None]:
# Start from the NRC emotion table shipped with `lexicon` (only 8 emotions)
nrc <- nrc_emotions

# Reshape for joining: go to long form (every column except `term` becomes a
# name/value pair), keep rows with a positive value, then drop the value column
nrcLex <- nrc %>%
  pivot_longer(cols = -term) %>%
  filter(value > 0) %>%
  select(-value)

In [None]:
# Preview the first rows of the long-form NRC lexicon
head(nrcLex, n = 6L)

In [None]:
# Inner join with the NRC lexicon: keep the terms present in both the corpus
# and the lexicon (both tables name the key column `term`)
nrcSent <- tidyCorp %>%
  inner_join(nrcLex, by = "term")

In [None]:
# Quick Analysis
# Sum term frequencies per emotion. table(nrcSent$name) only counts matched
# rows and ignores how often each term occurs, unlike the count-based
# summaries used for the Bing and AFINN lexicons.
# The Var1/Freq column names produced by data.frame(table(...)) are kept so
# code that reads `emos` keeps working.
emos <- nrcSent %>%
  group_by(name) %>%
  summarise(Freq = sum(count)) %>%
  rename(Var1 = name) %>%
  as.data.frame()
emos

In [None]:
library(radarchart)
# Radar chart of the emotion totals in `emos`, drawn without a legend.
# FALSE is spelled out because F is an ordinary, reassignable variable.
chartJSRadar(scores = emos, labelSize = 10, showLegend = FALSE)

# Twitter sentiment analysis

Homework: Load the dataset TweetsSample.csv from the data folder and repeat the analysis above on the tweets.