# Permalink
# Switch branches/tags
# Nothing to show
# Find file Copy path
# Fetching contributors…
# Cannot retrieve contributors at this time
# 119 lines (92 sloc) 3.06 KB
library(tidyverse)
library(tm)
library(syuzhet)
library(wordcloud2)
library(plotly)
library(stringr)
library(tidytext)
# Load the exported chat log (.txt) and split every line into date, sender
# and message body.
### replace 'YOUR_TXT_FILE' with your file path
#
# The patterns below expect lines shaped like
#   "<month/day/year>, <time> - <name>: <message>"
# (inferred from the regexes themselves). Lines that do not fit that shape get
# NA in the affected columns.
text_df <- tibble(text = readLines("YOUR_TXT_FILE")) %>%
  # Skip the first three lines. A row-number filter is used instead of
  # slice(4:n()): when the file has fewer than four lines, 4:n() counts
  # backwards (4, 3, ...) and keeps rows that were meant to be dropped.
  filter(row_number() > 3) %>%
  mutate(
    # Everything before the first comma, parsed as month/day/two-digit year.
    time = as.Date(str_extract(text, "^[^,]*"), "%m/%d/%y"),
    # First run of letters/apostrophes after the first dash, i.e. the first
    # word of the sender label.
    name = str_extract(str_extract(text, "-.*:"), "[A-Za-z']+"),
    # Everything after the first colon that follows the first dash. Anchoring
    # on the sender's colon (rather than the last colon in the line) keeps
    # messages that themselves contain colons, such as times or URLs, intact.
    body = str_match(text, "-[^:]*:(.*)")[, 2]
  ) %>%
  select(-text)
text_df
# Split each message body into one token per row (column `word`), keep only
# the first run of lower-case letters/apostrophes of every token, and drop
# every row that has an NA in any column.
text_df <- na.omit(
  mutate(
    unnest_tokens(text_df, word, body),
    word = str_extract(word, "[a-z']+")
  )
)
text_df
# Build the stop-word table: chat-specific words to ignore, tagged with the
# lexicon label "custom", stacked on top of tidytext's `stop_words`.
# tibble() replaces the deprecated data_frame() constructor.
custom_stop <- bind_rows(
  tibble(
    word = c(
      "gina", "mom", "dad", "bob", "media",
      "day", "omitted", "lou", "pat", "patrick",
      "yep", "ana", "aixgivfkuaq", "parker's"
    ),
    lexicon = "custom"
  ),
  stop_words
)
# Remove every token that appears in the stop-word table. The join key is
# spelled out so the match does not depend on which columns happen to share
# a name.
text_df <- text_df %>% anti_join(custom_stop, by = "word")
text_df
# top 5 words by person
# Count every (name, word) pair, rank the words of each person by frequency,
# keep the five highest-ranked words per person and draw one horizontal
# bar-chart panel per person.
top5_plot <- text_df %>%
  count(name, word, sort = TRUE) %>%
  arrange(name, -n) %>%
  group_by(name) %>%
  # Rows are ordered by descending count within each name at this point, so
  # the row index inside the group is the word's rank.
  mutate(rank = row_number()) %>%
  ungroup() %>%
  filter(rank <= 5) %>%
  # Order the factor levels by count so the bars are sorted along the axis.
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = name)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~name, ncol = 1, scales = "free_y") +
  labs(y = NULL, x = NULL, title = "Top 5 Words by Person") +
  coord_flip() +
  theme(
    axis.text = element_text(size = 12),
    strip.text.x = element_text(size = 20)
  )
top5_plot
# Hand the plot to ggsave() explicitly instead of relying on its
# last_plot() default.
ggsave("top5.png", plot = top5_plot)
######################
# SENTIMENT ANALYSIS #
######################
# calculate overall sentiment scores: afinn
# For every person, average the AFINN score of all their words that appear in
# the lexicon, then draw one horizontal bar per person ordered by that mean.
afinn <- get_sentiments("afinn")
# Newer tidytext/textdata releases call the AFINN score column `value`
# instead of `score`; normalise the name so the code below works with both.
names(afinn)[names(afinn) == "value"] <- "score"
overall_plot <- text_df %>%
  inner_join(afinn, by = "word") %>%
  group_by(name) %>%
  # `total` holds the mean score; the column name is kept because it is also
  # the axis label of the plot.
  summarise(total = mean(score)) %>%
  # Order the factor levels by mean score so the bars are sorted.
  mutate(name = reorder(name, total)) %>%
  ggplot(aes(name, total, fill = name)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  theme(axis.text = element_text(size = 14))
overall_plot
# Hand the plot to ggsave() explicitly instead of relying on its
# last_plot() default.
ggsave("overall.png", plot = overall_plot)
# create tibble for visualizations
# One row per word found in the AFINN lexicon, plus `sentiment`: the mean
# score of all such words sharing the same day (`time`) and person (`name`).
afinn <- get_sentiments("afinn")
# Newer tidytext/textdata releases call the AFINN score column `value`
# instead of `score`; normalise the name so the code below works with both.
names(afinn)[names(afinn) == "value"] <- "score"
sent <- text_df %>%
  inner_join(afinn, by = "word") %>%
  group_by(time, name) %>%
  mutate(sentiment = mean(score)) %>%
  # Drop the grouping so later operations on `sent` are not silently
  # performed per (time, name) group.
  ungroup()
# Per-person sentiment over time: one facet per person (two columns), showing
# the semi-transparent per-word points with a smoothed trend line on top and
# no legend. Rendered as an interactive plotly widget.
p <- ggplot(sent, aes(as.Date(time), sentiment)) +
  facet_wrap(~name, ncol = 2) +
  geom_point(aes(colour = as.factor(name)), alpha = .3) +
  geom_smooth(aes(colour = as.factor(name)), se = FALSE) +
  theme(
    axis.text = element_text(size = 14),
    legend.position = "none",
    strip.text.x = element_text(size = 16)
  ) +
  labs(x = "", y = "")
ggplotly(p)
# All persons on one panel: semi-transparent points and one smoothed trend
# line per person, coloured by person. Shown interactively and saved as PNG.
p2 <- sent %>%
  ggplot(aes(as.Date(time), sentiment)) +
  geom_point(aes(colour = as.factor(name)), alpha = .15) +
  geom_smooth(aes(colour = as.factor(name)), se = FALSE) +
  labs(x = "Month", y = "Mean Daily Sentiment Score") +
  theme(axis.text = element_text(size = 16))
ggplotly(p2)
# Pass p2 explicitly: without `plot =`, ggsave() writes whatever last_plot()
# returns, which is implicit state that code running in between may change.
ggsave("p2.png", plot = p2)
##############
# wordclouds #
##############
# Count how often each remaining word occurs across the whole chat and render
# the frequencies as a circular word cloud.
wordcloud2(count(text_df, word), shape = "circle")