Merge pull request #4 from matt-dray/md-tables

Add a standalone table converter
matt-dray · Jun 21, 2023 · dabfe16 · dabfe16
2 parents 2cb60ea + a00c459
commit dabfe16
Show file tree

Hide file tree

Showing 12 changed files with 307 additions and 28 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -3,3 +3,4 @@
 ^data-raw$
 ^LICENSE\.md$
 ^README\.Rmd$
+^\.github$
diff --git a/.github/.gitignore b/.github/.gitignore
@@ -0,0 +1 @@
+*.html
diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml
@@ -0,0 +1,49 @@
+# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
+# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
+on:
+  push:
+    branches: [main, master]
+  pull_request:
+    branches: [main, master]
+
+name: R-CMD-check
+
+jobs:
+  R-CMD-check:
+    runs-on: ${{ matrix.config.os }}
+
+    name: ${{ matrix.config.os }} (${{ matrix.config.r }})
+
+    strategy:
+      fail-fast: false
+      matrix:
+        config:
+          - {os: macos-latest,   r: 'release'}
+          - {os: windows-latest, r: 'release'}
+          - {os: ubuntu-latest,   r: 'devel', http-user-agent: 'release'}
+          - {os: ubuntu-latest,   r: 'release'}
+          - {os: ubuntu-latest,   r: 'oldrel-1'}
+
+    env:
+      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
+      R_KEEP_PKG_SOURCE: yes
+
+    steps:
+      - uses: actions/checkout@v3
+
+      - uses: r-lib/actions/setup-pandoc@v2
+
+      - uses: r-lib/actions/setup-r@v2
+        with:
+          r-version: ${{ matrix.config.r }}
+          http-user-agent: ${{ matrix.config.http-user-agent }}
+          use-public-rspm: true
+
+      - uses: r-lib/actions/setup-r-dependencies@v2
+        with:
+          extra-packages: any::rcmdcheck
+          needs: check
+
+      - uses: r-lib/actions/check-r-package@v2
+        with:
+          upload-snapshots: true
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -12,5 +12,6 @@ Roxygen: list(markdown = TRUE)
 RoxygenNote: 7.2.3
 Imports: 
     cli,
+    clipr,
     fs,
     xml2
diff --git a/NAMESPACE b/NAMESPACE
@@ -1,5 +1,6 @@
 # Generated by roxygen2: do not edit by hand
 
+export(table_to_govspeak)
 export(wu_body)
 export(wu_p)
 export(wu_read)
diff --git a/R/extract.R b/R/extract.R
@@ -7,64 +7,69 @@
 #'
 #' @examples
 #' path <- system.file("examples/simple.docx", package = "wordup")
-#' body_list <- wu_read(path)
-#' p_list <- wu_body(body_list, "p")
+#' doc_list <- wu_read(path)
+#' p_list <- wu_body(doc_list, "p")
 #' str(p_list, give.attr = FALSE, max.level = 1)
 #'
 #' @export
 wu_body <- function(doc_list, element = c("p", "tbl")) {
 
+  element <- match.arg(element)
+
   # TODO: input checks
   # TODO: should this just become part of the wu_p function?
 
-  element <- match.arg(element)
   doc_body <- doc_list[["document"]][["body"]]
   doc_body[grep(paste0("^", element, "$"), names(doc_body))]
 
 }
 
 #' Extract All 'p' Body Text and Style to a Dataframe
 #'
-#' @param body_list List. Output from [wu_body].
+#' @param p_list List. Output from [wu_body] with argument `element = "p"`.
 #'
 #' @return A data.frame with a row per 'p' element and columns with text and
 #'     possibly style information.
 #'
 #' @examples
 #' path <- system.file("examples/simple.docx", package = "wordup")
-#' body_list <- wu_read(path)
-#' p_list <- wu_body(body_list, "p")
-#' head(wu_p(p_list))
+#' doc_list <- wu_read(path)
+#' p_list <- wu_body(doc_list, "p")
+#' wu_p(p_list)
 #'
 #' @export
-wu_p <- function(body_list) {
+wu_p <- function(p_list) {
 
   # TODO: input checks
   # TODO: should the wu_body element extraction code be integrated here?
   # TODO: iterate over all the p elements
 
-  p_content <- vector("list", length = length(body_list))
-  for (i in seq(body_list)) {
+  p_content <- vector("list", length = length(p_list))
+
+  for (i in seq(p_list)) {
 
-    p_text  <- body_list[[i]][["r"]][["t"]][[1]]
-    p_style <- attr(body_list[[i]][["pPr"]][["pStyle"]], "val")
+    p_text  <- p_list[[i]][["r"]][["t"]][[1]]
+    has_p_text <- !is.null(p_text)
+
+    p_style <- attr(p_list[[i]][["pPr"]][["pStyle"]], "val")
     has_p_style <- !is.null(p_style)
+
     # TODO: hyperlinks and other styling
 
-    p_info <- data.frame(text = p_text, style = NA_character_)
+    p_info <- data.frame(text = NA_character_, style = NA_character_)
+
+    if (has_p_text) {
+      p_info[["text"]] <- p_text
+    }
+
     if (has_p_style) {
       p_info[["style"]] <- p_style
     }
+
     p_content[[i]] <- p_info
 
   }
 
   do.call("rbind", p_content)
 
 }
-
-
-
-
-
-
diff --git a/R/tables.R b/R/tables.R
@@ -0,0 +1,151 @@
+#' Convert a Copy-Pasted Word Table to Govspeak
+#'
+#' Provide a copied table from a Word document and be returned a Govspeak
+#' Markdown version of it. Some post-editing may be necessary for more complex
+#' tables.
+#'
+#' @param word_table Character. A table copy-pasted from a Microsoft Word
+#'     document. If `NULL` (default) the table will be read from the clipboard
+#'     so that you don't have to paste it.
+#' @param guess_types Logical. Should data types be guessed for each column
+#'     based on their content? Defaults to `TRUE`. If `FALSE`, all columns will
+#'     be returned as character type.
+#' @param ignore_regex Character. A regular expression of strings to ignore
+#'     when trying to guess column types. See details.
+#' @param has_row_titles Logical. Should the first column be treated as though
+#'     it contains titles for each row? Defaults to `FALSE`. If `TRUE`, the
+#'     first column will be marked-up as bold.
+#' @param totals_rows Integer. A vector of indices to identify rows that
+#'     contain totals. These will marked up as bold.
+#' @param to_clipboard Logical. Should the output be copied to your clipboard?
+#'     Defaults to `TRUE`.
+#'
+#' @details
+#' If `guess_types` is `TRUE`, then [utils::type.convert()] is used to coerce
+#' each column to the appropriate data type. For example, a column containing
+#' numbers will be coerced to `numeric`. This will fail if the numbers in a
+#' given column are formatted to contain non-numeric characters, like '1,234'
+#' (comma) or '10%' (percentage symbol). Use `ignore_regex` so that the process
+#' of guessing the data types will ignore these characters.
+#'
+#' @return Character. A string that contains Govspeak Markdown that represents
+#'     the copy-pasted table.
+#'
+#' @examples
+#' word_table <- c(
+#'   "Column 1	Column 2	Column 3	Column 4	Column 5
+#'   X	100	1,000	1%	15
+#'   Y	200	2,000	2%	12
+#'   Z	300	3,000	3%	[c]"
+#' )
+#'
+#' table_to_govspeak(word_table, to_clipboard = FALSE)
+#'
+#' @export
+table_to_govspeak <- function(
+    word_table = NULL,
+    guess_types = TRUE,
+    ignore_regex = ",|%|\\[.\\]",
+    has_row_titles = FALSE,
+    totals_rows = NULL,
+    to_clipboard = TRUE
+) {
+
+  guess_types_is_lgl <- inherits(guess_types, "logical")
+  has_row_titles_is_lgl <- inherits(has_row_titles, "logical")
+  clipboard_is_lgl <- inherits(to_clipboard, "logical")
+
+  if (!guess_types_is_lgl | !guess_types_is_lgl | !clipboard_is_lgl) {
+    cli::cli_abort(
+      c(
+        "Logical input was expected to arguments 'guess_types', 'has_row_titles' and 'clipboard'.",
+        "i" = "Provide either TRUE or FALSE to these arguments.",
+        "x" = "You provided {guess_types} to 'guess_types', {has_row_titles} to 'has_row_titles' and {clipboard} to 'clipboard'."
+      )
+    )
+  }
+
+  if (!is.null(totals_rows)) {
+
+    totals_rows_is_num <- inherits(totals_rows, "integer")
+
+    if (!totals_rows_is_num) {
+      cli::cli_abort(
+        c(
+          "Argument 'totals_rows' must be of class integer",
+          "i" = "Provide a vector of integers (whole numbers) to this argument, or leave empty.",
+          "x" = "You provided an object of class {class(totals_rows)} to 'totals_rows'."
+        )
+      )
+    }
+
+  }
+
+  # TODO: how check word_table input? basic check for \t and \n?
+
+  if (is.null(word_table)) {
+    rows <- clipr::read_clip()
+  }
+
+  if (!is.null(word_table)) {
+    rows <- strsplit(word_table, "\n")[[1]]
+  }
+
+  cells <- lapply(rows, \(x) trimws(strsplit(x, "\t")[[1]]))
+  dat <- do.call("rbind", cells[-1]) |> as.data.frame()
+  names(dat) <- cells[[1]]
+
+  if (!guess_types) {
+    dat <- rbind(rep("-------", length(dat)), dat)
+  }
+
+  if (guess_types) {
+
+    are_cols_num <- lapply(dat, \(x) gsub(ignore_regex, "", x)) |>
+      utils::type.convert(as.is = TRUE) |>
+      lapply(is.numeric)
+
+    for (i in seq_along(are_cols_num)) {
+
+      if (are_cols_num[[i]]) {
+        are_cols_num[[i]] <- "------:"
+      } else {
+        are_cols_num[[i]] <- "-------"
+      }
+
+    }
+
+    dat <- rbind(are_cols_num, dat)
+
+  }
+
+  if (!is.null(totals_rows)) {
+    for (row in totals_rows) {
+      dat[row + 1, ] <- paste0("**", dat[row + 1, ], "**")
+    }
+  }
+
+  if (has_row_titles) {
+    dat[2:nrow(dat), 1] <- paste("#", dat[2:nrow(dat), 1] )
+  }
+
+  # Rearrange into vector for printing and copying
+  vec <- character(length = nrow(dat) + 1)
+  for (row in 1:nrow(dat)) {
+    row_pasted <- paste0("| ", paste0(dat[row, ], collapse = " | "), " |\n")
+    vec[row + 1] <- row_pasted
+  }
+  vec[1] <- paste0("| ", paste0(names(dat), collapse = " | "), " |\n")
+
+  # Print to console
+  cat(vec, sep = "")
+
+  # Optionally copy to clipboard
+  if (to_clipboard) {
+    clipr::write_clip(vec, return_new = TRUE, breaks = "")
+    message("The output table has been written to the clipboard.")
+  }
+
+  return(invisible(vec))
+
+}
diff --git a/README.Rmd b/README.Rmd
@@ -17,6 +17,8 @@ knitr::opts_chunk$set(
 
 <!-- badges: start -->
 [![Project Status: Concept – Minimal or no implementation has been done yet, or the repository is only intended to be a limited example, demo, or proof-of-concept.](https://www.repostatus.org/badges/latest/concept.svg)](https://www.repostatus.org/#concept)
+[![R-CMD-check](https://github.com/matt-dray/wordup/workflows/R-CMD-check/badge.svg)](https://github.com/matt-dray/wordup/actions)
+[![Blog post](https://img.shields.io/badge/rostrum.blog-post-008900?labelColor=000000&logo=data%3Aimage%2Fgif%3Bbase64%2CR0lGODlhEAAQAPEAAAAAABWCBAAAAAAAACH5BAlkAAIAIf8LTkVUU0NBUEUyLjADAQAAACwAAAAAEAAQAAAC55QkISIiEoQQQgghRBBCiCAIgiAIgiAIQiAIgSAIgiAIQiAIgRAEQiAQBAQCgUAQEAQEgYAgIAgIBAKBQBAQCAKBQEAgCAgEAoFAIAgEBAKBIBAQCAQCgUAgEAgCgUBAICAgICAgIBAgEBAgEBAgEBAgECAgICAgECAQIBAQIBAgECAgICAgICAgECAQECAQICAgICAgICAgEBAgEBAgEBAgICAgICAgECAQIBAQIBAgECAgICAgIBAgECAQECAQIBAgICAgIBAgIBAgEBAgECAgECAgICAgICAgECAgECAgQIAAAQIKAAAh%2BQQJZAACACwAAAAAEAAQAAAC55QkIiESIoQQQgghhAhCBCEIgiAIgiAIQiAIgSAIgiAIQiAIgRAEQiAQBAQCgUAQEAQEgYAgIAgIBAKBQBAQCAKBQEAgCAgEAoFAIAgEBAKBIBAQCAQCgUAgEAgCgUBAICAgICAgIBAgEBAgEBAgEBAgECAgICAgECAQIBAQIBAgECAgICAgICAgECAQECAQICAgICAgICAgEBAgEBAgEBAgICAgICAgECAQIBAQIBAgECAgICAgIBAgECAQECAQIBAgICAgIBAgIBAgEBAgECAgECAgICAgICAgECAgECAgQIAAAQIKAAA7)](https://www.rostrum.blog/2023/06/21/wordup-tables/)
 <!-- badges: end -->
 
 ## Purpose

diff --git a/README.md b/README.md
@@ -9,6 +9,9 @@
 yet, or the repository is only intended to be a limited example, demo,
 or
 proof-of-concept.](https://www.repostatus.org/badges/latest/concept.svg)](https://www.repostatus.org/#concept)
+[![R-CMD-check](https://github.com/matt-dray/wordup/workflows/R-CMD-check/badge.svg)](https://github.com/matt-dray/wordup/actions)
+[![Blog
+post](https://img.shields.io/badge/rostrum.blog-post-008900?labelColor=000000&logo=data%3Aimage%2Fgif%3Bbase64%2CR0lGODlhEAAQAPEAAAAAABWCBAAAAAAAACH5BAlkAAIAIf8LTkVUU0NBUEUyLjADAQAAACwAAAAAEAAQAAAC55QkISIiEoQQQgghRBBCiCAIgiAIgiAIQiAIgSAIgiAIQiAIgRAEQiAQBAQCgUAQEAQEgYAgIAgIBAKBQBAQCAKBQEAgCAgEAoFAIAgEBAKBIBAQCAQCgUAgEAgCgUBAICAgICAgIBAgEBAgEBAgEBAgECAgICAgECAQIBAQIBAgECAgICAgICAgECAQECAQICAgICAgICAgEBAgEBAgEBAgICAgICAgECAQIBAQIBAgECAgICAgIBAgECAQECAQIBAgICAgIBAgIBAgEBAgECAgECAgICAgICAgECAgECAgQIAAAQIKAAAh%2BQQJZAACACwAAAAAEAAQAAAC55QkIiESIoQQQgghhAhCBCEIgiAIgiAIQiAIgSAIgiAIQiAIgRAEQiAQBAQCgUAQEAQEgYAgIAgIBAKBQBAQCAKBQEAgCAgEAoFAIAgEBAKBIBAQCAQCgUAgEAgCgUBAICAgICAgIBAgEBAgEBAgEBAgECAgICAgECAQIBAQIBAgECAgICAgICAgECAQECAQICAgICAgICAgEBAgEBAgEBAgICAgICAgECAQIBAQIBAgECAgICAgIBAgECAQECAQIBAgICAgIBAgIBAgEBAgECAgECAgICAgICAgECAgECAgQIAAAQIKAAA7)](https://www.rostrum.blog/2023/06/21/wordup-tables/)
 <!-- badges: end -->
 
 ## Purpose
@@ -64,8 +67,8 @@ str(body_list, give.attr = FALSE, max.level = 3)
 #   .. ..$ tbl   :List of 5
 #   .. ..$ p     :List of 3
 #   .. ..$ p     :List of 4
-#   .. ..$ p     :List of 4
-#   .. ..$ p     : list()
+#   .. ..$ p     :List of 1
+#   .. ..$ p     :List of 1
 #   .. ..$ sectPr:List of 4
 ```