In [71]:
# Title:  Sentiment Analysis: Binary Classification

# INSTALL AND LOAD PACKAGES ################################

# Install pacman if you don't have it (uncomment next line)
# install.packages("pacman")

# Install and/or load packages with pacman
pacman::p_load(  # Use p_load function from pacman
  gutenbergr,    # Import Project Gutenberg texts
  magrittr,      # Pipes
  pacman,        # Load/unload packages
  rio,           # Import/export data
  tidytext,      # Text functions
  tidyverse      # So many reasons 
)

In [72]:
# IMPORT DATA ##############################################

# Source text: "The Iliad" by Homer, from Project Gutenberg
# (http://www.gutenberg.org/ebooks/6150). This script reads a
# local copy of the book. As an alternative, the
# `gutenberg_download` function can access books by ID, which
# you can get from the Gutenberg website,
# https://www.gutenberg.org.

# Read the local file, then convert the result to a tibble
df <- as_tibble(import("../data/Iliad.txt"))

# Print the tibble to preview its first rows
df

gutenberg_id,text
<int>,<chr>
6150,The Iliad
6150,
6150,by Homer
6150,
6150,Rendered into English Blank verse by
6150,"Edward, Earl of Derby"
6150,
6150,
6150,
6150,


In [73]:
# PREPARE DATA #############################################

# Tokenize the data: keep only the `text` column, then split
# it so that each row holds a single word
df <- df %>%
  select(text) %>%
  unnest_tokens(
    output = word,   # Name of the new column of tokens
    input = text     # Existing column that holds the text
  )

# Print the tokenized data
df

word
<chr>
the
iliad
by
homer
rendered
into
english
blank
verse
by


In [74]:
# Tally how often each token/word occurs, most frequent first
count(df, word, sort = TRUE)

word,n
<chr>,<int>
the,9901
and,5543
of,3944
to,3558
his,2841
in,2408
he,2018
with,1947
from,1256
a,1246


In [75]:
# Remove "stop words" like "the," "to," "a," and so on.
# `by = "word"` names the join key explicitly, so the join
# cannot silently match on some other shared column and no
# "Joining, by = ..." message is printed.
df <- df %>% anti_join(stop_words, by = "word")

Joining, by = "word"



In [76]:
# Tally the remaining tokens/words (stop words removed),
# most frequent first
count(df, word, sort = TRUE)

word,n
<chr>,<int>
son,867
thou,757
thy,688
greeks,530
achilles,445
hector,429
ships,422
thee,421
jove,392
hand,387


In [77]:
# CATEGORIZE SENTIMENTS ####################################

# Find positive and negative words with the "bing" lexicon.
# The inner join keeps only words that appear in the lexicon.
df %>%
  inner_join(
    get_sentiments("bing"),  # Use "bing" sentiment lexicon
    by = "word"              # Explicit key; no join message
  )

Joining, by = "word"



word,sentiment
<chr>,<chr>
modern,positive
success,positive
sufficiently,positive
encourage,positive
urgent,negative
regret,negative
sincere,positive
diffidence,negative
success,positive
ordeal,negative


In [78]:
# Sort sentiment words by frequency
df %>%
  inner_join(                  # Match words with sentiments
    get_sentiments("bing"),    # Use "bing" sentiment lexicon
    by = "word"                # Explicit key; no join message
  ) %>%
  count(word, sort = TRUE)     # Most frequent words first

Joining, by = "word"



word,n
<chr>,<int>
death,236
brave,165
fell,164
mighty,163
valiant,160
bore,152
fair,152
godlike,144
noble,141
struck,126


In [79]:
# Summarize the number (and proportion) of sentiments
df %>%
  inner_join(                  # Match words with sentiments
    get_sentiments("bing"),    # Use "bing" sentiment lexicon
    by = "word"                # Explicit key; no join message
  ) %>%
  count(sentiment) %>%         # n = number of words per sentiment
  mutate(prop = n / sum(n))    # Get proportions of total

Joining, by = "word"



Unnamed: 0_level_0,sentiment,n,prop
Unnamed: 0_level_1,<chr>,<int>,<dbl>
1,negative,6385,0.6028704
2,positive,4206,0.3971296
