Skip to content

Commit

Permalink
porter helper function + tests
Browse files Browse the repository at this point in the history
  • Loading branch information
JohnCoene committed Aug 23, 2019
1 parent 483b9b4 commit 5ab8981
Show file tree
Hide file tree
Showing 4 changed files with 46 additions and 1 deletion.
2 changes: 2 additions & 0 deletions NAMESPACE
Expand Up @@ -76,6 +76,7 @@ S3method(similarity_matrix,wrapped)
S3method(split_alphanum,character)
S3method(split_alphanum,data.frame)
S3method(split_alphanum,list)
S3method(stem_porter,porter_stemmer_model)
S3method(stem_text,character)
S3method(stem_text,data.frame)
S3method(stem_text,list)
Expand Down Expand Up @@ -187,6 +188,7 @@ export(sklearn_text2bow)
export(sklearn_tfidf)
export(sklearn_word2vec)
export(split_alphanum)
export(stem_porter)
export(stem_text)
export(strip_multiple_spaces)
export(strip_non_alphanum)
Expand Down
25 changes: 24 additions & 1 deletion R/parsing.R
Expand Up @@ -5,20 +5,43 @@
#' from words in English. Its main use is as part of a term normalisation
#' process that is usually done when setting up Information Retrieval systems.
#'
#' @param stemmer A porter stemmer as returned by \code{\link{porter_stemmer}}.
#' @param text Text to parse.
#'
#' @examples
#' \dontrun{
#' # create model
#' stemmer <- porter_stemmer()
#'
#' # stem
#' stemmer$stem("survey")
#' # or convenience method
#' stem_porter(stemmer, "survey")
#' }
#'
#' @name porter_stemmer
#'
#' @export
porter_stemmer <- function() {
  # Instantiate the Python-side stemmer exposed by gensim.
  py_stemmer <- gensim$parsing$porter$PorterStemmer()

  # Prepend our own S3 class (keeping the existing classes) so that
  # stem_porter() can dispatch on the returned object.
  class(py_stemmer) <- c("porter_stemmer_model", class(py_stemmer))

  # Returned invisibly: the object is not auto-printed at the console.
  invisible(py_stemmer)
}

#' @rdname porter_stemmer
#' @export
stem_porter <- function(stemmer, text) {
  # S3 generic: dispatches on the class of `stemmer`.
  UseMethod("stem_porter")
}

#' @return \code{stem_porter} returns a character vector holding one stem
#'   per element of \code{text} (\code{character(0)} for empty input).
#'
#' @rdname porter_stemmer
#' @method stem_porter porter_stemmer_model
#' @export
stem_porter.porter_stemmer_model <- function(stemmer, text) {
  assert_that(!missing(text), msg = "Missing `text`.")

  # map_chr() keeps the result type-stable: it always yields a character
  # vector (character(0) for empty `text`, where map() + unlist() gave NULL)
  # and errors if the stemmer ever returns something other than one string.
  purrr::map_chr(text, function(word) {
    # py_to_r() is applied to every stem, exactly as before, so the result
    # is an R value whether or not `stemmer$stem()` already converted it.
    reticulate::py_to_r(stemmer$stem(word))
  })
}

#' Remove stopwords
#'
#' Remove stopwords from a character string.
Expand Down
16 changes: 16 additions & 0 deletions man/porter_stemmer.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions tests/testthat/test-corpora.R
Expand Up @@ -135,6 +135,10 @@ test_that("porter_stemmer and stem_words words", {
reticulate::py_to_r()
expect_equal(stemmed_porter, expected)

# convenience function
stemmed_conv <- stem_porter(stemmer, word)
expect_equal(stemmed_porter, stemmed_conv)

# stem
stemmed <- stem_text(word)
expect_equal(stemmed, expected)
Expand Down

0 comments on commit 5ab8981

Please sign in to comment.