-
Notifications
You must be signed in to change notification settings - Fork 17
/
Copy pathtextfeatures.R
205 lines (185 loc) Β· 6.31 KB
/
textfeatures.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
#' textfeatures
#'
#' Extracts features from text vector.
#'
#' @param text Input data. Should be character vector or data frame with character
#' variable of interest named "text". If a data frame then the first "id|*_id"
#' variable, if found, is assumed to be an ID variable.
#' @param sentiment Logical, indicating whether to return sentiment analysis
#' features, the variables \code{sent_afinn} and \code{sent_bing}. Defaults to
#' TRUE. Setting this to FALSE will speed things up a bit.
#' @param word_dims Integer indicating the desired number of word2vec dimension
#' estimates. When NULL, the default, this function will pick a reasonable
#' number of dimensions (ranging from 2 to 200) based on size of input. To
#' disable word2vec estimates, set this to 0 or FALSE.
#' @param normalize Logical indicating whether to normalize (mean center,
#' sd = 1) features. Defaults to TRUE.
#' @param newdata If a textfeatures_model is supplied to text, supply this with
#' new data to which you would like to apply the textfeatures_model.
#' @param verbose A single logical for printing logging messages as work
#' progresses.
#' @return A tibble data frame with extracted features as columns.
#' @examples
#'
#' ## the text of five of Trump's most retweeted tweets
#' trump_tweets <- c(
#' "#FraudNewsCNN #FNN https://t.co/WYUnHjjUjg",
#' "TODAY WE MAKE AMERICA GREAT AGAIN!",
#' paste("Why would Kim Jong-un insult me by calling me \"old,\" when I would",
#' "NEVER call him \"short and fat?\" Oh well, I try so hard to be his",
#' "friend - and maybe someday that will happen!"),
#' paste("Such a beautiful and important evening! The forgotten man and woman",
#' "will never be forgotten again. We will all come together as never before"),
#' paste("North Korean Leader Kim Jong Un just stated that the \"Nuclear",
#' "Button is on his desk at all times.\" Will someone from his depleted and",
#' "food starved regime please inform him that I too have a Nuclear Button,",
#' "but it is a much bigger & more powerful one than his, and my Button",
#' "works!")
#' )
#'
#' ## get the text features of a character vector
#' textfeatures(trump_tweets)
#'
#' ## data frame with a character vector named "text"
#' df <- data.frame(
#' id = c(1, 2, 3),
#' text = c("this is A!\t sEntence https://github.com about #rstats @github",
#' "and another sentence here",
#' "The following list:\n- one\n- two\n- three\nOkay!?!"),
#' stringsAsFactors = FALSE
#' )
#'
#' ## get text features of a data frame with "text" variable
#' textfeatures(df)
#'
#' @export
textfeatures <- function(text,
sentiment = TRUE,
word_dims = NULL,
normalize = TRUE,
newdata = NULL,
verbose = TRUE) {
UseMethod("textfeatures")
}
#' @export
textfeatures.character <- function(text,
sentiment = TRUE,
word_dims = NULL,
normalize = TRUE,
newdata = NULL,
verbose = TRUE) {
## validate inputs
stopifnot(
is.character(text),
is.logical(sentiment),
is.atomic(word_dims),
is.logical(normalize)
)
## initialize output data
if (verbose)
tfse::print_start("Counting features in text...")
o <- tweet_features(text)
## length
n_obs <- length(text)
## tokenize into words
text <- prep_wordtokens(text)
## estimate sentiment
if (sentiment) {
if (verbose)
tfse::print_start("Sentiment analysis...")
o$sent_afinn <- sentiment_afinn(text)
o$sent_bing <- sentiment_bing(text)
o$sent_syuzhet <- sentiment_syuzhet(text)
o$sent_vader <- sentiment_vader(text)
o$n_polite <- politeness(text)
}
## parts of speech
if (verbose)
tfse::print_start("Parts of speech...")
o$n_first_person <- first_person(text)
o$n_first_personp <- first_personp(text)
o$n_second_person <- second_person(text)
o$n_second_personp <- second_personp(text)
o$n_third_person <- third_person(text)
o$n_tobe <- to_be(text)
o$n_prepositions <- prepositions(text)
## get word dim estimates
if (verbose)
tfse::print_start("Word dimensions started")
w <- estimate_word_dims(text, word_dims, n_obs)
## convert 'o' into to tibble and merge with w
o <- tibble::as_tibble(o)
o <- dplyr::bind_cols(o, w)
## make exportable
m <- vapply(o, mean, na.rm = TRUE, FUN.VALUE = numeric(1))
s <- vapply(o, stats::sd, na.rm = TRUE, FUN.VALUE = numeric(1))
e <- list(avg = m, std_dev = s)
e$dict <- attr(w, "dict")
## normalize
if (normalize) {
if (verbose)
tfse::print_start("Normalizing data")
o <- scale_normal(scale_count(o))
}
## store export list as attribute
attr(o, "tf_export") <- structure(e,
class = c("textfeatures_model", "list")
)
## done!
if (verbose)
tfse::print_complete("Job's done!")
## return
o
}
#' @export
textfeatures.factor <- function(text,
sentiment = TRUE,
word_dims = NULL,
normalize = TRUE,
newdata = newdata,
verbose = TRUE) {
textfeatures(
as.character(text),
sentiment = sentiment,
word_dims = word_dims,
normalize = normalize,
newdata = newdata,
verbose = verbose
)
}
#' @export
textfeatures.data.frame <- function(text,
sentiment = TRUE,
word_dims = NULL,
normalize = TRUE,
newdata = newdata,
verbose = TRUE) {
## validate input
stopifnot("text" %in% names(text))
textfeatures(
text$text,
sentiment = sentiment,
word_dims = word_dims,
normalize = normalize,
newdata = newdata,
verbose = verbose
)
}
#' @export
textfeatures.list <- function(text,
sentiment = TRUE,
word_dims = NULL,
normalize = TRUE,
newdata = newdata,
verbose = TRUE) {
## validate input
stopifnot("text" %in% names(text))
textfeatures(
text$text,
sentiment = sentiment,
word_dims = word_dims,
normalize = normalize,
newdata = newdata,
verbose = verbose
)
}