/
pack.R
90 lines (89 loc) · 2.63 KB
/
pack.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
#' Pack a data.frame of tokens
#'
#' Packs a data.frame of tokens into a new data.frame of corpus,
#' which is compatible with the Text Interchange Formats.
#'
#' @section Text Interchange Formats (TIF):
#'
#' The Text Interchange Formats (TIF) is a set of standards
#' that allows R text analysis packages to target defined inputs and outputs
#' for corpora, tokens, and document-term matrices.
#'
#' @section Valid data.frame of tokens:
#'
#' The data.frame of tokens here is a data.frame object
#' compatible with the TIF.
#'
#' A TIF-valid data.frame of tokens is expected to have one unique key column
#' (named `doc_id`) identifying each text and several feature columns
#' describing each token.
#' The feature columns must contain at least `token` itself.
#'
#' @seealso \url{https://github.com/ropenscilabs/tif}
#'
#' @param tbl A data.frame of tokens.
#' @param pull <[`data-masked`][rlang::args_data_masking]>
#' Column to be packed into text or ngrams body. Default value is `token`.
#' @param n Integer internally passed to ngrams tokenizer function
#' created of \code{gibasa::ngram_tokenizer()}
#' @param sep Character scalar internally used as the concatenator of ngrams.
#' @param .collapse This argument is passed to \code{stringi::stri_c()}.
#' @returns A tibble.
#' @export
#' @examples
#' \dontrun{
#' df <- tokenize(
#'   data.frame(
#'     doc_id = seq_along(ginga[5:8]),
#'     text = ginga[5:8]
#'   )
#' )
#' pack(df)
#' }
pack <- function(tbl, pull = "token", n = 1L, sep = "-", .collapse = " ") {
  # Accept either a bare column name or a character scalar and normalize
  # to a string, since the `.data` pronoun's `[[` requires a string.
  # (A raw quosure, as `enquo(pull)` alone would produce, errors there.)
  pull <- rlang::as_name(enquo(pull))

  # For unigrams the tokens pass through unchanged; otherwise run them
  # through an ngram tokenizer before collapsing into one string.
  if (n < 2L) {
    transform <- identity
  } else {
    make_ngram <- ngram_tokenizer(n)
    transform <- function(x) make_ngram(x, sep = sep)
  }

  tbl %>%
    dplyr::reframe(
      text = .data[[pull]] %>%
        stringi::stri_remove_empty_na() %>%
        transform() %>%
        stringi::stri_c(collapse = .collapse),
      .by = "doc_id"
    ) %>%
    dplyr::as_tibble()
}
#' Ngrams tokenizer
#'
#' Makes an ngram tokenizer function.
#'
#' The returned closure takes a character vector of tokens and returns a
#' character vector of ngrams of size `n`, joined with `sep`. It returns
#' `character(0)` when the input is shorter than `n` or entirely `NA`.
#'
#' @param n Integer.
#' @returns ngram tokenizer function
#' @export
#' @examples
#' bigram <- ngram_tokenizer(2)
#' bigram(letters, sep = "-")
ngram_tokenizer <- function(n = 1L) {
  stopifnot(is.numeric(n), is.finite(n), n > 0)
  # Coerce once so window arithmetic below is integer arithmetic.
  n <- as.integer(n)
  function(tokens, sep = " ") {
    stopifnot(is.character(tokens))
    len <- length(tokens)
    if (all(is.na(tokens)) || len < n) {
      return(character(0))
    }
    # The early return above guarantees len >= n, so every window
    # i:(i + n - 1) is in bounds and seq_len() gets a positive count.
    # vapply() (not sapply()) keeps the return type stable.
    vapply(
      seq_len(len - n + 1L),
      function(i) stringi::stri_join(tokens[i:(i + n - 1L)], collapse = sep),
      character(1)
    )
  }
}