Skip to content

Commit

Permalink
Merge pull request #4 from matt-dray/md-tables
Browse files Browse the repository at this point in the history
Add a standalone table converter
  • Loading branch information
matt-dray committed Jun 21, 2023
2 parents 2cb60ea + a00c459 commit dabfe16
Show file tree
Hide file tree
Showing 12 changed files with 307 additions and 28 deletions.
1 change: 1 addition & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@
^data-raw$
^LICENSE\.md$
^README\.Rmd$
^\.github$
1 change: 1 addition & 0 deletions .github/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
*.html
49 changes: 49 additions & 0 deletions .github/workflows/R-CMD-check.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
on:
push:
branches: [main, master]
pull_request:
branches: [main, master]

name: R-CMD-check

jobs:
R-CMD-check:
runs-on: ${{ matrix.config.os }}

name: ${{ matrix.config.os }} (${{ matrix.config.r }})

strategy:
fail-fast: false
matrix:
config:
- {os: macos-latest, r: 'release'}
- {os: windows-latest, r: 'release'}
- {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'}
- {os: ubuntu-latest, r: 'release'}
- {os: ubuntu-latest, r: 'oldrel-1'}

env:
GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
R_KEEP_PKG_SOURCE: yes

steps:
- uses: actions/checkout@v3

- uses: r-lib/actions/setup-pandoc@v2

- uses: r-lib/actions/setup-r@v2
with:
r-version: ${{ matrix.config.r }}
http-user-agent: ${{ matrix.config.http-user-agent }}
use-public-rspm: true

- uses: r-lib/actions/setup-r-dependencies@v2
with:
extra-packages: any::rcmdcheck
needs: check

- uses: r-lib/actions/check-r-package@v2
with:
upload-snapshots: true
1 change: 1 addition & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,6 @@ Roxygen: list(markdown = TRUE)
RoxygenNote: 7.2.3
Imports:
cli,
clipr,
fs,
xml2
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Generated by roxygen2: do not edit by hand

export(table_to_govspeak)
export(wu_body)
export(wu_p)
export(wu_read)
43 changes: 24 additions & 19 deletions R/extract.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,64 +7,69 @@
#'
#' @examples
#' path <- system.file("examples/simple.docx", package = "wordup")
#' body_list <- wu_read(path)
#' p_list <- wu_body(body_list, "p")
#' doc_list <- wu_read(path)
#' p_list <- wu_body(doc_list, "p")
#' str(p_list, give.attr = FALSE, max.level = 1)
#'
#' @export
wu_body <- function(doc_list, element = c("p", "tbl")) {

element <- match.arg(element)

# TODO: input checks
# TODO: should this just become part of the wu_p function?

element <- match.arg(element)
doc_body <- doc_list[["document"]][["body"]]
doc_body[grep(paste0("^", element, "$"), names(doc_body))]

}

#' Extract All 'p' Body Text and Style to a Dataframe
#'
#' @param body_list List. Output from [wu_body].
#' @param p_list List. Output from [wu_body] with argument `element = "p"`.
#'
#' @return A data.frame with a row per 'p' element and columns with text and
#' possibly style information.
#'
#' @examples
#' path <- system.file("examples/simple.docx", package = "wordup")
#' body_list <- wu_read(path)
#' p_list <- wu_body(body_list, "p")
#' head(wu_p(p_list))
#' doc_list <- wu_read(path)
#' p_list <- wu_body(doc_list, "p")
#' wu_p(p_list)
#'
#' @export
wu_p <- function(body_list) {
wu_p <- function(p_list) {

# TODO: input checks
# TODO: should the wu_body element extraction code be integrated here?
# TODO: iterate over all the p elements

p_content <- vector("list", length = length(body_list))
for (i in seq(body_list)) {
p_content <- vector("list", length = length(p_list))

for (i in seq(p_list)) {

p_text <- body_list[[i]][["r"]][["t"]][[1]]
p_style <- attr(body_list[[i]][["pPr"]][["pStyle"]], "val")
p_text <- p_list[[i]][["r"]][["t"]][[1]]
has_p_text <- !is.null(p_text)

p_style <- attr(p_list[[i]][["pPr"]][["pStyle"]], "val")
has_p_style <- !is.null(p_style)

# TODO: hyperlinks and other styling

p_info <- data.frame(text = p_text, style = NA_character_)
p_info <- data.frame(text = NA_character_, style = NA_character_)

if (has_p_text) {
p_info[["text"]] <- p_text
}

if (has_p_style) {
p_info[["style"]] <- p_style
}

p_content[[i]] <- p_info

}

do.call("rbind", p_content)

}






151 changes: 151 additions & 0 deletions R/tables.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
#' Convert a Copy-Pasted Word Table to Govspeak
#'
#' Provide a copied table from a Word document and be returned a Govspeak
#' Markdown version of it. Some post-editing may be necessary for more complex
#' tables.
#'
#' @param word_table Character. A table copy-pasted from a Microsoft Word
#' document. If `NULL` (default) the table will be read from the clipboard
#' so that you don't have to paste it.
#' @param guess_types Logical. Should data types be guessed for each column
#' based on their content? Defaults to `TRUE`. If `FALSE`, all columns will
#' be returned as character type.
#' @param ignore_regex Character. A regular expression of strings to ignore
#' when trying to guess column types. See details.
#' @param has_row_titles Logical. Should the first column be treated as though
#' it contains titles for each row? Defaults to `FALSE`. If `TRUE`, the
#' first column will be marked-up as bold.
#' @param totals_rows Integer. A vector of indices to identify rows that
#' contain totals. These will marked up as bold.
#' @param to_clipboard Logical. Should the output be copied to your clipboard?
#' Defaults to `TRUE`.
#'
#' @details
#' If `guess_types` is `TRUE`, then [utils::type.convert()] is used to coerce
#' each column to the appropriate data type. For example, a column containing
#' numbers will be coerced to `numeric`. This will fail if the numbers in a
#' given column are formatted to contain non-numeric characters, like '1,234'
#' (comma) or '10%' (percentage symbol). Use `ignore_regex` so that the process
#' of guessing the data types will ignore these characters.
#'
#' @return Character. A string that contains Govspeak Markdown that represents
#' the copy-pasted table.
#'
#' @examples
#' word_table <- c(
#' "Column 1 Column 2 Column 3 Column 4 Column 5
#' X 100 1,000 1% 15
#' Y 200 2,000 2% 12
#' Z 300 3,000 3% [c]"
#' )
#'
#' table_to_govspeak(word_table, to_clipboard = FALSE)
#'
#' @export
table_to_govspeak <- function(
word_table = NULL,
guess_types = TRUE,
ignore_regex = ",|%|\\[.\\]",
has_row_titles = FALSE,
totals_rows = NULL,
to_clipboard = TRUE
) {

guess_types_is_lgl <- inherits(guess_types, "logical")
has_row_titles_is_lgl <- inherits(has_row_titles, "logical")
clipboard_is_lgl <- inherits(to_clipboard, "logical")

if (!guess_types_is_lgl | !guess_types_is_lgl | !clipboard_is_lgl) {
cli::cli_abort(
c(
"Logical input was expected to arguments 'guess_types', 'has_row_titles' and 'clipboard'.",
"i" = "Provide either TRUE or FALSE to these arguments.",
"x" = "You provided {guess_types} to 'guess_types', {has_row_titles} to 'has_row_titles' and {clipboard} to 'clipboard'."
)
)
}

if (!is.null(totals_rows)) {

totals_rows_is_num <- inherits(totals_rows, "integer")

if (!totals_rows_is_num) {
cli::cli_abort(
c(
"Argument 'totals_rows' must be of class integer",
"i" = "Provide a vector of integers (whole numbers) to this argument, or leave empty.",
"x" = "You provided an object of class {class(totals_rows)} to 'totals_rows'."
)
)
}

}

# TODO: how check word_table input? basic check for \t and \n?

if (is.null(word_table)) {
rows <- clipr::read_clip()
}

if (!is.null(word_table)) {
rows <- strsplit(word_table, "\n")[[1]]
}

cells <- lapply(rows, \(x) trimws(strsplit(x, "\t")[[1]]))
dat <- do.call("rbind", cells[-1]) |> as.data.frame()
names(dat) <- cells[[1]]

if (!guess_types) {
dat <- rbind(rep("-------", length(dat)), dat)
}

if (guess_types) {

are_cols_num <- lapply(dat, \(x) gsub(ignore_regex, "", x)) |>
utils::type.convert(as.is = TRUE) |>
lapply(is.numeric)

for (i in seq_along(are_cols_num)) {

if (are_cols_num[[i]]) {
are_cols_num[[i]] <- "------:"
} else {
are_cols_num[[i]] <- "-------"
}

}

dat <- rbind(are_cols_num, dat)

}

if (!is.null(totals_rows)) {
for (row in totals_rows) {
dat[row + 1, ] <- paste0("**", dat[row + 1, ], "**")
}
}

if (has_row_titles) {
dat[2:nrow(dat), 1] <- paste("#", dat[2:nrow(dat), 1] )
}

# Rearrange into vector for printing and copying
vec <- character(length = nrow(dat) + 1)
for (row in 1:nrow(dat)) {
row_pasted <- paste0("| ", paste0(dat[row, ], collapse = " | "), " |\n")
vec[row + 1] <- row_pasted
}
vec[1] <- paste0("| ", paste0(names(dat), collapse = " | "), " |\n")

# Print to console
cat(vec, sep = "")

# Optionally copy to clipboard
if (to_clipboard) {
clipr::write_clip(vec, return_new = TRUE, breaks = "")
message("The output table has been written to the clipboard.")
}

return(invisible(vec))

}
2 changes: 2 additions & 0 deletions README.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ knitr::opts_chunk$set(

<!-- badges: start -->
[![Project Status: Concept – Minimal or no implementation has been done yet, or the repository is only intended to be a limited example, demo, or proof-of-concept.](https://www.repostatus.org/badges/latest/concept.svg)](https://www.repostatus.org/#concept)
[![R-CMD-check](https://github.com/matt-dray/wordup/workflows/R-CMD-check/badge.svg)](https://github.com/matt-dray/wordup/actions)
[![Blog post](https://img.shields.io/badge/rostrum.blog-post-008900?labelColor=000000&logo=data%3Aimage%2Fgif%3Bbase64%2CR0lGODlhEAAQAPEAAAAAABWCBAAAAAAAACH5BAlkAAIAIf8LTkVUU0NBUEUyLjADAQAAACwAAAAAEAAQAAAC55QkISIiEoQQQgghRBBCiCAIgiAIgiAIQiAIgSAIgiAIQiAIgRAEQiAQBAQCgUAQEAQEgYAgIAgIBAKBQBAQCAKBQEAgCAgEAoFAIAgEBAKBIBAQCAQCgUAgEAgCgUBAICAgICAgIBAgEBAgEBAgEBAgECAgICAgECAQIBAQIBAgECAgICAgICAgECAQECAQICAgICAgICAgEBAgEBAgEBAgICAgICAgECAQIBAQIBAgECAgICAgIBAgECAQECAQIBAgICAgIBAgIBAgEBAgECAgECAgICAgICAgECAgECAgQIAAAQIKAAAh%2BQQJZAACACwAAAAAEAAQAAAC55QkIiESIoQQQgghhAhCBCEIgiAIgiAIQiAIgSAIgiAIQiAIgRAEQiAQBAQCgUAQEAQEgYAgIAgIBAKBQBAQCAKBQEAgCAgEAoFAIAgEBAKBIBAQCAQCgUAgEAgCgUBAICAgICAgIBAgEBAgEBAgEBAgECAgICAgECAQIBAQIBAgECAgICAgICAgECAQECAQICAgICAgICAgEBAgEBAgEBAgICAgICAgECAQIBAQIBAgECAgICAgIBAgECAQECAQIBAgICAgIBAgIBAgEBAgECAgECAgICAgICAgECAgECAgQIAAAQIKAAA7)](https://www.rostrum.blog/2023/06/21/wordup-tables/)
<!-- badges: end -->

## Purpose
Expand Down
7 changes: 5 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@
yet, or the repository is only intended to be a limited example, demo,
or
proof-of-concept.](https://www.repostatus.org/badges/latest/concept.svg)](https://www.repostatus.org/#concept)
[![R-CMD-check](https://github.com/matt-dray/wordup/workflows/R-CMD-check/badge.svg)](https://github.com/matt-dray/wordup/actions)
[![Blog
post](https://img.shields.io/badge/rostrum.blog-post-008900?labelColor=000000&logo=data%3Aimage%2Fgif%3Bbase64%2CR0lGODlhEAAQAPEAAAAAABWCBAAAAAAAACH5BAlkAAIAIf8LTkVUU0NBUEUyLjADAQAAACwAAAAAEAAQAAAC55QkISIiEoQQQgghRBBCiCAIgiAIgiAIQiAIgSAIgiAIQiAIgRAEQiAQBAQCgUAQEAQEgYAgIAgIBAKBQBAQCAKBQEAgCAgEAoFAIAgEBAKBIBAQCAQCgUAgEAgCgUBAICAgICAgIBAgEBAgEBAgEBAgECAgICAgECAQIBAQIBAgECAgICAgICAgECAQECAQICAgICAgICAgEBAgEBAgEBAgICAgICAgECAQIBAQIBAgECAgICAgIBAgECAQECAQIBAgICAgIBAgIBAgEBAgECAgECAgICAgICAgECAgECAgQIAAAQIKAAAh%2BQQJZAACACwAAAAAEAAQAAAC55QkIiESIoQQQgghhAhCBCEIgiAIgiAIQiAIgSAIgiAIQiAIgRAEQiAQBAQCgUAQEAQEgYAgIAgIBAKBQBAQCAKBQEAgCAgEAoFAIAgEBAKBIBAQCAQCgUAgEAgCgUBAICAgICAgIBAgEBAgEBAgEBAgECAgICAgECAQIBAQIBAgECAgICAgICAgECAQECAQICAgICAgICAgEBAgEBAgEBAgICAgICAgECAQIBAQIBAgECAgICAgIBAgECAQECAQIBAgICAgIBAgIBAgEBAgECAgECAgICAgICAgECAgECAgQIAAAQIKAAA7)](https://www.rostrum.blog/2023/06/21/wordup-tables/)
<!-- badges: end -->

## Purpose
Expand Down Expand Up @@ -64,8 +67,8 @@ str(body_list, give.attr = FALSE, max.level = 3)
# .. ..$ tbl :List of 5
# .. ..$ p :List of 3
# .. ..$ p :List of 4
# .. ..$ p :List of 4
# .. ..$ p : list()
# .. ..$ p :List of 1
# .. ..$ p :List of 1
# .. ..$ sectPr:List of 4
```

Expand Down
Loading

0 comments on commit dabfe16

Please sign in to comment.