https://github.com/maelstrom-research/Rmonize/issues/56

#54 #52
maelstrom-research · Mar 19, 2024 · 99565ec · 99565ec
1 parent 07c79c0
commit 99565ec
Show file tree

Hide file tree

Showing 4 changed files with 249 additions and 41 deletions.
diff --git a/R/02-harmo_process_harmonization.R b/R/02-harmo_process_harmonization.R
@@ -33,14 +33,17 @@
 #' To initiate processing, the first entry must be the creation of a harmonized 
 #' primary identifier variable (e.g., participant unique ID).
 #'
-#' @param dossier List of data frame(s) containing input dataset(s).
+#' @param object Data frame(s) or list of data frame(s) containing input 
+#' dataset(s).
 #' @param dataschema A DataSchema object.
 #' @param data_proc_elem A Data Processing Elements object.
 #' @param harmonized_col_dataset A character string identifying the column 
 #' to use for dataset names. NULL by default.
 #' @param harmonized_col_id A character string identifying the name of the 
 #' column present in every dataset to use as a dataset identifier. 
 #' NULL by default.
+#' @param .debug Allow user to test the inputs before processing harmonization.
+#' @param dossier `r lifecycle::badge("deprecated")`
 #'
 #' @returns
 #' A list of data frame(s), containing harmonized dataset(s). The DataSchema 
@@ -81,47 +84,229 @@
 #'
 #' @export
 harmo_process <- function(
-    dossier,
+    object = NULL,
     dataschema = attributes(dossier)$`Rmonize::DataSchema`,
     data_proc_elem = attributes(dossier)$`Rmonize::Data Processing Elements`,
     harmonized_col_dataset = attributes(dossier)$`Rmonize::harmonized_col_dataset`,
-    harmonized_col_id = attributes(dossier)$`Rmonize::harmonized_col_id`
+    harmonized_col_id = attributes(dossier)$`Rmonize::harmonized_col_id`,
+    .debug = FALSE,
+    dossier = object
     ){
 
   # future dev
   # si le vT n'existe pas dans le Data Processing Elements, aller le chercher 
   # dans le DataSchema
   # controle de version ?
 
+  if(.debug == FALSE){
+
+    # check arguments
+    if(is.null(object) | is.null(dataschema) | is.null(data_proc_elem))
+      stop(call. = FALSE,
+"
+
+`object`, `dataschema` and `data_proc_elem` are mandatory and must be provided.
+If you want to allow the code to run, you can specify 
+harmo_process(object, dataset, dataprocelem, .debug = TRUE) to test your
+input elements before processing harmonization.")
+
+  }
+
+  add_index_dossier <- function(dossier){
+
+    dossier <- as_dossier(dossier)
+    dataset_names <- names(dossier)
+
+    col_ids <- 
+      dossier %>% lapply(function(x) col_id(x)) %>% unique
+
+    if(length(col_ids) > 1 | is.null(col_ids[[1]])){
+      for(i in names(dossier)){
+        # stop()}
+        dossier[[i]] <- 
+          dossier[[i]] %>%
+          add_index(name_index = "Rmonize::index",.force = TRUE) %>%
+          mutate(`Rmonize::index` = paste0(i,".",.data$`Rmonize::index`)) %>%
+          as_dataset(col_id = "Rmonize::index")
+      }}
+    return(dossier)
+  }
+
+  data_proc_elem_get <- function(dossier){
+
+    dataschema <- 
+      dossier %>% 
+      lapply(function(x)mutate(x, across(everything(),as.character))) %>%
+      bind_rows() %>%
+      tibble() %>%
+      valueType_self_adjust() %>%
+      data_dict_extract()
+
+    data_proc_elem <-
+      tibble(
+        index = as.integer(),
+        dataschema_variable = as.character(),
+        input_dataset = as.character(),
+        valueType = as.character(),
+        input_variables = as.character(),
+        `Mlstr_harmo::rule_category` = as.character(),
+        `Mlstr_harmo::algorithm` = as.character())
+
+    for(i in names(dossier)){
+      # stop()}
+      data_proc_elem_i <- 
+        dataschema$Variables %>%
+        rename("dataschema_variable" = "name") %>%
+        mutate(
+          input_dataset = i,
+          input_variables = case_when(
+            .data$`dataschema_variable` %in% names(dossier[[i]]) ~ .data$`dataschema_variable`,
+            TRUE ~ "__BLANK__",
+          ),
+          `Mlstr_harmo::rule_category` = case_when(
+            col_id(dossier[[i]]) == .data$dataschema_variable ~ "id_creation",
+            .data$`dataschema_variable` %in% names(dossier[[i]]) ~ "direct_mapping",
+            TRUE  ~ "impossible"
+          ),
+          `Mlstr_harmo::algorithm` = .data$`Mlstr_harmo::rule_category`) %>%
+        add_index(.force = TRUE)  %>%
+        select(-'label')
+
+      data_proc_elem <- bind_rows(data_proc_elem,data_proc_elem_i)
+
+    }
+
+    data_proc_elem <- as_data_proc_elem(data_proc_elem)
+    return(data_proc_elem)
+  }
+
   extract_var <- function(x){
     x <- x %>%
       str_replace_all('"',"`") %>%
       str_replace_all("'","`") %>%
-      str_remove_all("`") 
+      str_remove_all("`") %>%
+      str_squish()
     x = x[!is.na(x)]
 
     return(x)}
 
   # check arguments
-  if(is.null(dataschema) & is.null(data_proc_elem) & !is.null(harmonized_col_id)){
+
+  if(is_data_proc_elem(object)) 
+    stop(call. = FALSE,'
+This object is a Data Processing Elements. 
+Please write harmo_process(data_proc_elem = my_object) instead.')
+
+  if(is_dataschema(object)) 
+    stop(call. = FALSE,'
+This object is a DataSchema. 
+Please write harmo_process(dataschema = my_object) instead.')
+
+  if(is.null(object) & is.null(data_proc_elem) & is.null(dataschema))
+    stop(call. = FALSE,"At least one element is missing.")
+
+  # create dossier from data proc elem.
+
+  if(is.null(object) & !is.null(data_proc_elem)){
 
-    dossier <- 
-      as_harmonized_dossier(
-        dossier,
-        harmonized_col_id = harmonized_col_id,
-        harmonized_col_dataset = harmonized_col_dataset) 
+    data_proc_elem <- as_data_proc_elem(data_proc_elem)
+    name_datasets <- unique(data_proc_elem$input_dataset)
+    object <- lapply(as.list(name_datasets),function(x) tibble())
+    names(object) <- name_datasets
+
+    for(i in name_datasets){
+      # stop()}
+
+      input_vars <- 
+        unique(extract_var(
+          data_proc_elem %>% 
+            dplyr::filter(.data$`input_dataset` == i & 
+                            .data$`input_variables` != '__BLANK__') %>%
+            select("input_variables") %>%
+            separate_longer_delim(cols = "input_variables",";") %>%
+            pull("input_variables")))
+
+      object[[i]] <- 
+        as_tibble(data.frame(matrix(nrow=0,ncol=length(input_vars)))) %>%
+        mutate(across(everything(),as.character))
+
+      names(object[[i]]) <- input_vars
+    }
+
+    # harmonized_col_id <- extract_var(
+    #   data_proc_elem %>%
+    #     dplyr::filter(.data$`Mlstr_harmo::rule_category` == "id_creation") %>%
+    #     pull("dataschema_variable") %>% unique)
 
-    return(harmo_process(dossier))}
+    return(
+      harmo_process(
+        object = object,
+        dataschema = dataschema,
+        data_proc_elem = data_proc_elem,
+        harmonized_col_dataset = harmonized_col_dataset,
+        harmonized_col_id = harmonized_col_id,
+        .debug = .debug))
+
+  }
 
-  dossier <- as_dossier(dossier)
+  if(is.null(object) & is.null(data_proc_elem) & !is.null(dataschema)){
+
+    input_vars <- dataschema$Variables$name
+    object <- as_tibble(data.frame(matrix(nrow=0,ncol=length(input_vars))))
+    names(object) <- input_vars
+
+    object <- valueType_adjust(from = dataschema,to = object)
+
+    return(
+      harmo_process(
+        object,dataschema,
+        data_proc_elem,
+        harmonized_col_dataset,
+        harmonized_col_id,
+        .debug = .debug))
+  }
+
+  if(is_dataset(dossier)) dossier <- list(dossier)
 
-  # check arguments
-  if(is.null(data_proc_elem)) 
-    stop(call. = FALSE, 
-         'The Data Processing Element argument is missing')
+  fargs <- as.list(match.call(expand.dots = TRUE))
+  if(is.null(names(dossier)))
+    if(!is.null(fargs$object)){
+      names(dossier) <- make_name_list(as.character(fargs['object']), dossier)
+    }else{
+      names(dossier) <- make_name_list(as.character(fargs['dossier']), dossier)
+    }
 
-  data_proc_elem <- as_data_proc_elem(data_proc_elem)
+  # check arguments. make sure col_is harmonizable if exists
+  dossier <- dossier_create(dossier)
 
+  if(!is.null(harmonized_col_id)){
+    dossier <- 
+      dossier %>% 
+      lapply(function(x) 
+        x %>% 
+          # mutate(across(
+          # all_of(harmonized_col_id),
+          # ~ as_valueType(.,'text'))) %>%
+          as_dataset(harmonized_col_id)
+    )}
+
+  # check arguments
+  # if(is.null(dataschema) & is.null(data_proc_elem) & !is.null(harmonized_col_id)){
+  #   dossier <- 
+  #     as_harmonized_dossier(
+  #       dossier,
+  #       harmonized_col_id = harmonized_col_id,
+  #       harmonized_col_dataset = harmonized_col_dataset)  
+  # return(harmo_process(dossier))}
+
+  # check arguments
+  if(is.null(data_proc_elem)){
+    dossier <- add_index_dossier(dossier)
+    data_proc_elem <- data_proc_elem_get(dossier)
+  }else{
+    data_proc_elem <- as_data_proc_elem(data_proc_elem)
+  }
+
   if(is.null(dataschema)){
     dataschema <- dataschema_extract(data_proc_elem)
   }else{
@@ -178,7 +363,6 @@ harmo_process <- function(
     as.list
   names(dpe) <- sort(unique(bind_rows(dpe)$input_dataset))
 
-
   vars <- extract_var(unique(data_proc_elem$dataschema_variable))
   dataschema <- 
     data_dict_filter(
@@ -281,6 +465,7 @@ Please correct elements and reprocess.')
 
     harmonized_dossier[[i]] <-
       harmonized_dossier[[i]] %>% bind_rows(harmonization_id)
+
     harmonized_dossier[[i]] <- harmonized_dossier_init[[i]] <-
       as_dataset(
         harmonized_dossier[[i]],
@@ -1085,7 +1270,7 @@ harmo_process_recode <- function(process_rule_slice){
     mutate(
       input_variables  = str_remove_all(.data$`input_variables`,'`'),
       replacement      =
-        .data$`script`,
+        .data$`script` %>% str_squish(),
       replacement      =
         str_replace_all(.data$`replacement`,"\'","{madshapR::apostrophy}"),
       replacement      =
@@ -1229,6 +1414,8 @@ harmo_process_undetermined <- function(process_rule_slice){
 #' Data Processing Elements used in processing as attributes.
 #'
 #' @param harmonized_dossier A list containing the harmonized dataset(s).
+#' @param show_warnings Whether the function should print warnings or not. 
+#' TRUE by default.
 #'
 #' @returns
 #' Nothing to be returned. The function prints messages in the console, 
@@ -1249,7 +1436,7 @@ harmo_process_undetermined <- function(process_rule_slice){
 #' @importFrom utils capture.output
 #'
 #' @export
-show_harmo_error <- function(harmonized_dossier){
+show_harmo_error <- function(harmonized_dossier, show_warnings = TRUE){
 
   # list of primary error in the Data Processing Elements.
   # print the list of error + the index
@@ -1316,12 +1503,21 @@ show_harmo_error <- function(harmonized_dossier){
                    .data$`input_dataset`, " \n"),
       rule = paste0(.data$`Mlstr_harmo::algorithm`))
 
+  if(show_warnings == FALSE){
+    report_log <-
+      report_log %>%
+      dplyr::filter(.data$warning != TRUE)
+  }
+
   if(nrow(report_log) > 0){
     message(bold(
-"\n
-- ERROR/WARNING STATUS DETAILS: -------------------------------------------\n"),
-"\nHere is the list of the errors  and warnings encountered in the process 
-of harmonization:\n")
+paste0("\n
+- ERROR",ifelse(show_warnings == TRUE,"/WARNING",""),
+" STATUS DETAILS: -------------------------------------------\n")),
+paste0("\nHere is the list of the errors",ifelse(show_warnings == TRUE," and warnings",""),
+" encountered in the process 
+of harmonization:\n"))
+
     for(i in seq_len(nrow(report_log))){
       message(
 "---------------------------------------------------------------------------\n")
@@ -1923,7 +2119,7 @@ as_dataschema_mlstr <- function(object){
 #'   
 #' }
 #'
-#' @import dplyr
+#' @import dplyr fabR
 #' @importFrom rlang .data
 #'
 #' @export
@@ -1935,18 +2131,17 @@ as_harmonized_dossier <- function(
     harmonized_col_dataset = attributes(object)$`Rmonize::harmonized_col_dataset`,
     harmonized_data_dict_apply = FALSE){
 
-  # check object
-  as_dossier(object)
+  silently_run(dossier_create(object))
 
   if(!is.logical(harmonized_data_dict_apply))
     stop(call. = FALSE,
-         '`harmonized_data_dict_apply` must be TRUE or F*ALSE (TRUE by default)')
+         '`harmonized_data_dict_apply` must be TRUE or FALSE (TRUE by default)')
 
   # check the id column (mandatory to exist)
   if(is.null(harmonized_col_id)) 
     stop(call. = FALSE,
          '`harmonized_col_id` must be provided')
-  # check if exists
+  # check if col_id exists
   bind_rows(
     object %>% lapply(function(x) x %>% 
                         mutate(across(everything(),as.character)))) %>% 
@@ -2032,17 +2227,19 @@ name list of variables.")
     message(bold("\n
 - CREATION OF HARMONIZED DATA DICTIONARY : --------------------------------\n"))
 
-    harmo_data_dict <- dataschema
-    harmo_data_dict[['Variables']][['Mlstr_harmo::rule_category']] <- NULL
-    harmo_data_dict[['Variables']][['Mlstr_harmo::algorithm']] <- NULL
-    harmo_data_dict[['Variables']][['Rmonize::r_script']] <- NULL
-    harmo_data_dict[['Variables']][['Mlstr_harmo::comment']] <- NULL
-    harmo_data_dict[['Variables']][['Mlstr_harmo::status']] <- NULL
-    harmo_data_dict[['Variables']][['Mlstr_harmo::status_detail']] <- NULL
+
 
     for(i in names(object)){
       # stop()}
 
+      harmo_data_dict <- dataschema
+      harmo_data_dict[['Variables']][['Mlstr_harmo::rule_category']] <- NULL
+      harmo_data_dict[['Variables']][['Mlstr_harmo::algorithm']] <- NULL
+      harmo_data_dict[['Variables']][['Rmonize::r_script']] <- NULL
+      harmo_data_dict[['Variables']][['Mlstr_harmo::comment']] <- NULL
+      harmo_data_dict[['Variables']][['Mlstr_harmo::status']] <- NULL
+      harmo_data_dict[['Variables']][['Mlstr_harmo::status_detail']] <- NULL
+
       input_data_proc_elem <-
         data_proc_elem %>%
         rename("name" = "dataschema_variable") %>%