/
Emily_bronte.R
73 lines (66 loc) · 2.72 KB
/
Emily_bronte.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
#loading required packages
library(gutenbergr)
library(dplyr)
library(tidytext)
library(ggplot2)
library(reshape2)
library(wordcloud)
library(viridis)
library(ggthemes)
#downloading UTF-8 text file from project gutenberg using ID number
Emily_Bronte <- gutenberg_download(768)
#converting each row into tokens
Tidy_Bronte <- Emily_Bronte %>%
unnest_tokens(word,text) %>% #split into individual words
anti_join(stop_words) # discarding stop words such as the, of, if so on
#visualizing word frequencies
Tidy_Bronte %>%
count(word, sort = TRUE) %>% # counting each word frequency
filter(n > 100) %>% # filter with frequency > 100
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n),fill()) + #plot
geom_col(aes(fill=word), color = "black", size = .7,
show.legend = FALSE) + # each bar
xlab(NULL) + #xtitle
ylab("Frequency")+ #ytitle
coord_flip()+ #flip x and y
labs(title = "Word frequency in \"Wuthering Heights\"")+ #plot title
scale_fill_viridis(option = "magma", discrete = TRUE) + #custom theme and coloring
theme_pander(base_size = 18, pc = "white")
#using nrc lexicon for sentiment analysis
nrc_emotions <- get_sentiments("nrc") %>% # filtering only 4 out of all the available emotions
filter(sentiment == "joy" |
sentiment == "anger" |
sentiment == "fear" |
sentiment == "sadness")
TB_emotions <- Tidy_Bronte %>%
inner_join(nrc_emotions) %>% # matching words with sentiments
count(word, sentiment) %>% # count the words associated with sentiments
arrange(sentiment) #order by sentiments
#visualizing using nrc lexicon
TB_emotions %>%
group_by(sentiment) %>% #grouping words with emotions
top_n(10) %>% # filter the top 10
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = sentiment)) + #plot
geom_col(color = "black", size = .7, show.legend = FALSE)+ #bars
labs(
title = "Top 10 words clustered by sentiment",
x = NULL,
y = "number of occurrences"
) + #title of the plot and axis
scale_fill_viridis(option = "viridis", discrete = TRUE) +
facet_wrap(~sentiment, scales = "free_y") +
coord_flip() + # flip axes
theme_pander(base_size = 18, pc = "white") #custom colors and themes
# visualizing wordcloud package
Tidy_Bronte %>%
inner_join(get_sentiments("bing")) %>% #import bing sentiments
count(word, sentiment, sort = TRUE) %>%
group_by(sentiment) %>%
top_n(50) %>% # top 50 words
ungroup() %>%
mutate(word = reorder(word, n)) %>%
acast(word ~ sentiment, value.var = "n", fill = 0) %>% # cloud.comparison requires matrix representation
comparison.cloud(colors = c("#ab3400", "#469280")) # cloud comparison of word occurance