pushed spreadsheet fns to master

luisDVA · Oct 6, 2019 · bf91276 · bf91276
1 parent 391f724
commit bf91276
Show file tree

Hide file tree

Showing 15 changed files with 154 additions and 31 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -17,7 +17,9 @@ Imports:
     forcats,
     stringr,
     tidyr,
-    magrittr
+    magrittr,
+    tidyxl,
+    readxl
 RoxygenNote: 6.1.1
 Suggests: 
     knitr,

diff --git a/NAMESPACE b/NAMESPACE
@@ -1,6 +1,7 @@
 # Generated by roxygen2: do not edit by hand
 
 export("%>%")
+export(annotate_mf)
 export(unbreak_vals)
 export(untangle2)
 export(unwrap_cols)

diff --git a/NEWS.md b/NEWS.md
@@ -3,4 +3,6 @@
 * Added a `NEWS.md` file to track changes to the package.
 * Added a `.slice_groups` argument to `unbreak_vals` to either keep or discard the variable with broken values and slice the extra rows afterwards. 
 
-* Reworded all the documentation with more adecuate terms (i.e. embedded subheaders).
+* Reworded all the documentation with more adequate terms (i.e. embedded subheaders).
+
+* Added spreadsheet-related functions.
diff --git a/R/annotate_mf.R b/R/annotate_mf.R
@@ -0,0 +1,63 @@
+#' Annotate meaningful formatting
+#'
+#' @param xlfilepath Path to spreadsheet file (xls or xlsx).
+#' @param orig Unquoted name of the variable to annotate formatting in. The name is matched against the cell text to locate its column in the sheet.
+#' @param new Unquoted name of the new variable, which holds the values of `orig` prefixed with their cell formatting pasted as a string, e.g. `(bolded, italic) value`.
+#'
+#' @return A tibble with a new column with meaningful formatting embedded as
+#'   text. The `orig` and `new` columns are placed first, followed by all other columns.
+#' @details At this point, only four popular approaches for meaningful formatting
+#'   (bold, italic, underline, cell highlighting) are hardcoded in the function. Values with none of these formats are copied to `new` unchanged.
+#' @examples
+#' example_spreadsheet <- system.file("extdata/dog_test.xlsx", package = "unheadr")
+#' annotate_mf(example_spreadsheet, orig = Task, new = Task_annotated)
+#'
+#' @export
+annotate_mf <- function(xlfilepath, orig, new) {
+  orig <- dplyr::enquo(orig) # tidyeval: capture the unquoted column name as a quosure
+  new <- dplyr::enquo(new) # tidyeval: capture the unquoted name of the output column
+
+  spsheet <- readxl::read_excel(xlfilepath) # the data itself, read with readxl defaults
+  m_formatting <- tidyxl::xlsx_cells(xlfilepath) # cell-level records; row, col, character and local_format_id are used below. NOTE(review): docs advertise xls too - confirm tidyxl accepts it
+  format_defs <- tidyxl::xlsx_formats(xlfilepath) # format definitions of the workbook
+
+  # meaningful formatting: one lookup vector per format type, indexed further down by local_format_id
+  bolded <- format_defs$local$font$bold
+  italic <- format_defs$local$font$italic
+  underlined <- format_defs$local$font$underline
+  highlighted <- format_defs$local$fill$patternFill$patternType
+  format_opts <- tibble::lst(bolded, italic, highlighted, underlined) # list named after the four variables
+  formatting_indicators <- dplyr::bind_cols(lapply(format_opts, function(x) x[m_formatting$local_format_id])) # one row per cell, one column per format type
+  format_joined <- dplyr::bind_cols(m_formatting, formatting_indicators)
+  col_orig <- format_joined$col[match(paste0(rlang::as_name(orig)), format_joined$character)] # col of the first cell whose text equals the name of orig
+
+  # target variable: formatting of the cells in that column from row 2 down (row 1 presumably holds the header - TODO confirm)
+  orig_format <- dplyr::filter(format_joined, row >= 2 & col == col_orig)
+  orig_format <- dplyr::select(orig_format, bolded:underlined)
+  formatted <- dplyr::bind_cols(spsheet, orig_format) # positional bind: assumes one formatting row per data row - TODO confirm
+  formatted <- dplyr::mutate_at(
+    formatted, dplyr::vars(bolded:underlined),
+    ~ replace(., is.na(.), FALSE) # missing indicator means the format is absent
+  )
+  formatted$highlighted <- gsub(pattern = "[^FALSE].*", replacement = "TRUE", formatted$highlighted) # from the first character that is not one of F, A, L, S, E to the end of the string -> "TRUE"; the text "FALSE" is left untouched
+  formatted <- dplyr::mutate_at(formatted, dplyr::vars(bolded:underlined), as.logical) # NOTE(review): underlined gets no gsub treatment like highlighted; if it holds style text, as.logical() would give NA - confirm
+  # swap TRUE entries with the name of the column they sit in. NOTE(review): the comparison scans the whole data frame, including the columns read from the spreadsheet - confirm this is intended
+  indx <- which(formatted == TRUE, arr.ind = TRUE)
+  formatted[indx] <- names(formatted)[indx[, 2]]
+  formatted <- dplyr::mutate_at(formatted, dplyr::vars(bolded:underlined), ~ replace(., . == "FALSE", "")) # blank out the entries still equal to "FALSE"
+  # build annotation strings: paste the four indicator columns, squish the whitespace, then separate the remaining labels with ", "
+  formatted <- dplyr::mutate(formatted, newvar = paste(bolded, italic, highlighted, underlined))
+  formatted$newvar <- stringr::str_squish(formatted$newvar)
+  formatted$newvar <- gsub(" ", ", ", formatted$newvar)
+  formatted <- dplyr::select(formatted, -c(bolded:underlined)) # indicator columns are no longer needed
+  formatted <- dplyr::mutate(
+    formatted,
+    !!new := ifelse(test = newvar != "", # prefix the value only when an annotation string exists
+      yes = paste0("(", newvar, ") ", !!orig),
+      no = !!orig
+    )
+  )
+  formatted$newvar <- NULL # drop the helper column
+  formatted <- dplyr::select(formatted, !!orig, !!new, dplyr::everything()) # orig and new first, then everything else
+  formatted
+}
diff --git a/R/unbreak_fn.R b/R/unbreak_fn.R
@@ -1,4 +1,4 @@
-#' Unbreak values using regex to match the broken half of the value.
+#' Unbreak values using regex to match the broken half of the value
 #'
 #' @param df A data frame with one or more values within a variable broken up
 #'   across two rows.

diff --git a/R/untangle_fn.R b/R/untangle_fn.R
@@ -1,4 +1,4 @@
-#' Rectangling embedded subheaders.
+#' Rectangling embedded subheaders
 #'
 #' @param df A data frame with embedded subheaders.
 #' @param regex Regular expression to match the subheaders.

diff --git a/R/unwrap_fn.R b/R/unwrap_fn.R
@@ -1,4 +1,4 @@
-#' Unwrap values and clean up NAs used as padding.
+#' Unwrap values and clean up NAs used as padding
 #'
 #' @param df A data frame with wrapped values and an inconsistent number of NA
 #'   values used to as within-group padding.

diff --git a/README.Rmd b/README.Rmd
@@ -14,7 +14,8 @@ knitr::opts_chunk$set(
 ```
 
 # unheadr <img src="man/figures/logosmall.png" align="right" />
-The goal of unheadr is to help wrangle data when it has embedded subheaders, or when values are wrapped across several rows.
+The goal of `unheadr` is to help wrangle data when it has embedded subheaders, or when values are wrapped across several rows.
+
 
 ## Installation
 
@@ -31,19 +32,18 @@ Verde Arregoitia, L. D., Cooper, N., D'Elía, G. (2018). Good practices for shar
 
 ## Usage
 
-**unheadr() function**  
+**untangle2() function**  
 
-The star of the package. Puts embedded subheaders into their own variable. 
+The star of the package. Puts embedded subheaders into their own variable, using regular expressions to identify them. 
 
 ``` r
 data(primates2017)
 # head(primates2017,n=20)
 ```
 
-The first half of the dataset looks like the table below. Note that there are rows that correspond to values in grouping variables, which should be in their own column. Instead, they are embedded within the data rectangle. This is a pretty common practice. In formatted tables and spreadsheets, this information is often centered and merged and shown with highlighting or and font formatting. This looks nice and is easy to read, but hard to work with (for example: counting elements or calculating group-wise summaries). 
-
+The first half of the dataset looks like the table below. Note that there are rows that correspond to values in grouping variables, which should be in their own column. Instead, they are embedded within the data rectangle. This is a pretty common practice. This looks OK and is easy to read, but hard to work with (for example: calculating group-wise summaries). 
 
-In this example, values for an implicit 'geographic region' variable and an implicit 'taxonomic family' variabble are embedded in the column that contains our observational units (the scientific names of various primates).  
+In this example, values for an implicit 'geographic region' variable and an implicit 'taxonomic family' variable are embedded in the column that contains the observational units (the scientific names of various primates).  
 
 |scientific_name              |common_name                  |red_list_status | mass_kg|
 |:----------------------------|:----------------------------|:---------------|-------:|
@@ -111,7 +111,7 @@ primates2017 %>%
 |Allocebus trichotis          |Hairy-eared Dwarf Lemur      |VU              |    0.09|CHEIROGALEIDAE  |Madagascar |
 |Microcebus tavaratra         |Tavaratra Mouse Lemur        |VU              |    0.06|CHEIROGALEIDAE  |Madagascar |
 
-Now we can easily perform grouping operations and summarize the data (e.g.: calculating average body mass by Family).
+Now we can easily perform grouping operations and summarize the data (e.g. calculating average body mass by Family).
 
 
 At this point, refer to the links in the vignette and the function help for more information and examples on the use of the other helper functions. 
@@ -163,3 +163,13 @@ Paste the wrapped elements, separating with commas.
 unwrap_cols(nyk, groupingVar = player, separator = ", ")
 ```
 
+**annotate_mf() function**
+
+Sometimes embedded subheaders can't be matched by content or context, but they share the same formatting in a spreadsheet file. 
+
+The `annotate_mf()` function flattens four common kinds of meaningful cell formatting (bold, italic, underline, and cell highlighting) and prepends them as a text annotation to the values of the target variable, stored in a new column.
+
+``` r
+example_spreadsheet <- system.file("extdata/dog_test.xlsx", package = "unheadr")
+annotate_mf(example_spreadsheet,orig = Task, new=Task_annotated)
+```
diff --git a/README.md b/README.md
@@ -3,7 +3,7 @@
 
 # unheadr <img src="man/figures/logosmall.png" align="right" />
 
-The goal of unheadr is to help wrangle data when it has embedded
+The goal of `unheadr` is to help wrangle data when it has embedded
 subheaders, or when values are wrapped across several rows.
 
 ## Installation
@@ -27,10 +27,10 @@ DOI 10.4404/hystrix-00133-2018](https://doi.org/10.4404/hystrix-00133-2018)
 
 ## Usage
 
-**unheadr() function**
+**untangle2() function**
 
 The star of the package. Puts embedded subheaders into their own
-variable.
+variable, using regular expressions to identify them.
 
 ``` r
 data(primates2017)
@@ -40,15 +40,13 @@ data(primates2017)
 The first half of the dataset looks like the table below. Note that
 there are rows that correspond to values in grouping variables, which
 should be in their own column. Instead, they are embedded within the
-data rectangle. This is a pretty common practice. In formatted tables
-and spreadsheets, this information is often centered and merged and
-shown with highlighting or and font formatting. This looks nice and is
-easy to read, but hard to work with (for example: counting elements or
-calculating group-wise summaries).
+data rectangle. This is a pretty common practice. This looks OK and is
+easy to read, but hard to work with (for example: calculating group-wise
+summaries).
 
 In this example, values for an implicit ‘geographic region’ variable and
-an implicit ‘taxonomic family’ variabble are embedded in the column that
-contains our observational units (the scientific names of various
+an implicit ‘taxonomic family’ variable are embedded in the column that
+contains the observational units (the scientific names of various
 primates).
 
 | scientific\_name             | common\_name                 | red\_list\_status | mass\_kg |
@@ -125,7 +123,7 @@ primates2017 %>%
 | Microcebus tavaratra         | Tavaratra Mouse Lemur        | VU                |     0.06 | CHEIROGALEIDAE  | Madagascar |
 
 Now we can easily perform grouping operations and summarize the data
-(e.g.: calculating average body mass by Family).
+(e.g. calculating average body mass by Family).
 
 At this point, refer to the links in the vignette and the function help
 for more information and examples on the use of the other helper
@@ -183,3 +181,18 @@ Paste the wrapped elements, separating with commas.
 ``` r
 unwrap_cols(nyk, groupingVar = player, separator = ", ")
 ```
+
+**annotate\_mf() function**
+
+Sometimes embedded subheaders can’t be matched by content or context,
+but they share the same formatting in a spreadsheet file.
+
+The `annotate_mf()` function flattens four common kinds of meaningful
+cell formatting (bold, italic, underline, and cell highlighting) and
+prepends them as a text annotation to the values of the target
+variable, stored in a new column.
+
+``` r
+example_spreadsheet <- system.file("extdata/dog_test.xlsx", package = "unheadr")
+annotate_mf(example_spreadsheet,orig = Task, new=Task_annotated)
+```
diff --git a/inst/extdata/dog_test.xlsx b/inst/extdata/dog_test.xlsx
diff --git a/man/annotate_mf.Rd b/man/annotate_mf.Rd
diff --git a/man/unbreak_vals.Rd b/man/unbreak_vals.Rd
diff --git a/man/untangle2.Rd b/man/untangle2.Rd
diff --git a/man/unwrap_cols.Rd b/man/unwrap_cols.Rd
diff --git a/vignettes/unheadr-vignette.Rmd b/vignettes/unheadr-vignette.Rmd
@@ -26,11 +26,12 @@ The functions in this package can help us rework data shared by other people fro
 
 ## Further reading
 
-This vignette is a work in progress, for examples and uses for each function: see the function documentation and the following three blog posts:
+For examples and uses of each function, see the function documentation and the following three blog posts:
 
 -[untangle2 function](https://luisdva.github.io/rstats/tidyeval/)
 -[unbreak_vals function](https://luisdva.github.io/rstats/Tidyeval-pdf-hell/)
 -[unwrap_cols function](https://luisdva.github.io/rstats/unbreaking-vals/)
 
-A preprint or publication about data sharing principles and how these functions relate to them is also in preparaion.
+Some of the possible uses of `unheadr` are now described in this publication:
 
+Verde Arregoitia, L. D., Cooper, N., D'Elía, G. (2018). Good practices for sharing analysis-ready data in mammalogy and biodiversity research. _Hystrix, the Italian Journal of Mammalogy_, 29(2), 155-161. [Open Access, DOI 10.4404/hystrix-00133-2018](https://doi.org/10.4404/hystrix-00133-2018)