Skip to content

Commit

Permalink
https://github.com/maelstrom-research/Rmonize/issues/56
Browse files Browse the repository at this point in the history
  • Loading branch information
GuiFabre committed Mar 19, 2024
1 parent 07c79c0 commit 99565ec
Show file tree
Hide file tree
Showing 4 changed files with 249 additions and 41 deletions.
269 changes: 233 additions & 36 deletions R/02-harmo_process_harmonization.R
Original file line number Diff line number Diff line change
Expand Up @@ -33,14 +33,17 @@
#' To initiate processing, the first entry must be the creation of a harmonized
#' primary identifier variable (e.g., participant unique ID).
#'
#' @param dossier List of data frame(s) containing input dataset(s).
#' @param object Data frame(s) or list of data frame(s) containing input
#' dataset(s).
#' @param dataschema A DataSchema object.
#' @param data_proc_elem A Data Processing Elements object.
#' @param harmonized_col_dataset A character string identifying the column
#' to use for dataset names. NULL by default.
#' @param harmonized_col_id A character string identifying the name of the
#' column present in every dataset to use as a dataset identifier.
#' NULL by default.
#' @param .debug Allow user to test the inputs before processing harmonization.
#' @param dossier `r lifecycle::badge("deprecated")`
#'
#' @returns
#' A list of data frame(s), containing harmonized dataset(s). The DataSchema
Expand Down Expand Up @@ -81,47 +84,229 @@
#'
#' @export
harmo_process <- function(
dossier,
object = NULL,
dataschema = attributes(dossier)$`Rmonize::DataSchema`,
data_proc_elem = attributes(dossier)$`Rmonize::Data Processing Elements`,
harmonized_col_dataset = attributes(dossier)$`Rmonize::harmonized_col_dataset`,
harmonized_col_id = attributes(dossier)$`Rmonize::harmonized_col_id`
harmonized_col_id = attributes(dossier)$`Rmonize::harmonized_col_id`,
.debug = FALSE,
dossier = object
){

# future dev
# si le vT n'existe pas dans le Data Processing Elements, aller le chercher
# dans le DataSchema
# controle de version ?

if(.debug == FALSE){

# check arguments
if(is.null(object) | is.null(dataschema) | is.null(data_proc_elem))
stop(call. = FALSE,
"
`object`, `dataschema` and `data_proc_elem` are mandatory and must be provided.
If you want to allow the code to run, you can specify
harmo_process(object, dataset, dataprocelem, .debug = TRUE) to test your
input elements before processing harmonization.")

}

add_index_dossier <- function(dossier){

dossier <- as_dossier(dossier)
dataset_names <- names(dossier)

col_ids <-
dossier %>% lapply(function(x) col_id(x)) %>% unique

if(length(col_ids) > 1 | is.null(col_ids[[1]])){
for(i in names(dossier)){
# stop()}
dossier[[i]] <-
dossier[[i]] %>%
add_index(name_index = "Rmonize::index",.force = TRUE) %>%
mutate(`Rmonize::index` = paste0(i,".",.data$`Rmonize::index`)) %>%
as_dataset(col_id = "Rmonize::index")
}}
return(dossier)
}

data_proc_elem_get <- function(dossier){

dataschema <-
dossier %>%
lapply(function(x)mutate(x, across(everything(),as.character))) %>%
bind_rows() %>%
tibble() %>%
valueType_self_adjust() %>%
data_dict_extract()

data_proc_elem <-
tibble(
index = as.integer(),
dataschema_variable = as.character(),
input_dataset = as.character(),
valueType = as.character(),
input_variables = as.character(),
`Mlstr_harmo::rule_category` = as.character(),
`Mlstr_harmo::algorithm` = as.character())

for(i in names(dossier)){
# stop()}
data_proc_elem_i <-
dataschema$Variables %>%
rename("dataschema_variable" = "name") %>%
mutate(
input_dataset = i,
input_variables = case_when(
.data$`dataschema_variable` %in% names(dossier[[i]]) ~ .data$`dataschema_variable`,
TRUE ~ "__BLANK__",
),
`Mlstr_harmo::rule_category` = case_when(
col_id(dossier[[i]]) == .data$dataschema_variable ~ "id_creation",
.data$`dataschema_variable` %in% names(dossier[[i]]) ~ "direct_mapping",
TRUE ~ "impossible"
),
`Mlstr_harmo::algorithm` = .data$`Mlstr_harmo::rule_category`) %>%
add_index(.force = TRUE) %>%
select(-'label')

data_proc_elem <- bind_rows(data_proc_elem,data_proc_elem_i)

}

data_proc_elem <- as_data_proc_elem(data_proc_elem)
return(data_proc_elem)
}

extract_var <- function(x){
x <- x %>%
str_replace_all('"',"`") %>%
str_replace_all("'","`") %>%
str_remove_all("`")
str_remove_all("`") %>%
str_squish()
x = x[!is.na(x)]

return(x)}

# check arguments
if(is.null(dataschema) & is.null(data_proc_elem) & !is.null(harmonized_col_id)){

if(is_data_proc_elem(object))
stop(call. = FALSE,'
This object is a Data Processing Elements.
Please write harmo_process(data_proc_elem = my_object) instead.')

if(is_dataschema(object))
stop(call. = FALSE,'
This object is a DataSchema.
Please write harmo_process(dataschema = my_object) instead.')

if(is.null(object) & is.null(data_proc_elem) & is.null(dataschema))
stop(call. = FALSE,"At least one element is missing.")

# create dossier from data proc elem.

if(is.null(object) & !is.null(data_proc_elem)){

dossier <-
as_harmonized_dossier(
dossier,
harmonized_col_id = harmonized_col_id,
harmonized_col_dataset = harmonized_col_dataset)
data_proc_elem <- as_data_proc_elem(data_proc_elem)
name_datasets <- unique(data_proc_elem$input_dataset)
object <- lapply(as.list(name_datasets),function(x) tibble())
names(object) <- name_datasets

for(i in name_datasets){
# stop()}

input_vars <-
unique(extract_var(
data_proc_elem %>%
dplyr::filter(.data$`input_dataset` == i &
.data$`input_variables` != '__BLANK__') %>%
select("input_variables") %>%
separate_longer_delim(cols = "input_variables",";") %>%
pull("input_variables")))

object[[i]] <-
as_tibble(data.frame(matrix(nrow=0,ncol=length(input_vars)))) %>%
mutate(across(everything(),as.character))

names(object[[i]]) <- input_vars
}

# harmonized_col_id <- extract_var(
# data_proc_elem %>%
# dplyr::filter(.data$`Mlstr_harmo::rule_category` == "id_creation") %>%
# pull("dataschema_variable") %>% unique)

return(harmo_process(dossier))}
return(
harmo_process(
object = object,
dataschema = dataschema,
data_proc_elem = data_proc_elem,
harmonized_col_dataset = harmonized_col_dataset,
harmonized_col_id = harmonized_col_id,
.debug = .debug))

}

dossier <- as_dossier(dossier)
if(is.null(object) & is.null(data_proc_elem) & !is.null(dataschema)){

input_vars <- dataschema$Variables$name
object <- as_tibble(data.frame(matrix(nrow=0,ncol=length(input_vars))))
names(object) <- input_vars

object <- valueType_adjust(from = dataschema,to = object)

return(
harmo_process(
object,dataschema,
data_proc_elem,
harmonized_col_dataset,
harmonized_col_id,
.debug = .debug))
}

if(is_dataset(dossier)) dossier <- list(dossier)

# check arguments
if(is.null(data_proc_elem))
stop(call. = FALSE,
'The Data Processing Element argument is missing')
fargs <- as.list(match.call(expand.dots = TRUE))
if(is.null(names(dossier)))
if(!is.null(fargs$object)){
names(dossier) <- make_name_list(as.character(fargs['object']), dossier)
}else{
names(dossier) <- make_name_list(as.character(fargs['dossier']), dossier)
}

data_proc_elem <- as_data_proc_elem(data_proc_elem)
# check arguments. make sure col_is harmonizable if exists
dossier <- dossier_create(dossier)

if(!is.null(harmonized_col_id)){
dossier <-
dossier %>%
lapply(function(x)
x %>%
# mutate(across(
# all_of(harmonized_col_id),
# ~ as_valueType(.,'text'))) %>%
as_dataset(harmonized_col_id)
)}

# check arguments
# if(is.null(dataschema) & is.null(data_proc_elem) & !is.null(harmonized_col_id)){
# dossier <-
# as_harmonized_dossier(
# dossier,
# harmonized_col_id = harmonized_col_id,
# harmonized_col_dataset = harmonized_col_dataset)
# return(harmo_process(dossier))}

# check arguments
if(is.null(data_proc_elem)){
dossier <- add_index_dossier(dossier)
data_proc_elem <- data_proc_elem_get(dossier)
}else{
data_proc_elem <- as_data_proc_elem(data_proc_elem)
}

if(is.null(dataschema)){
dataschema <- dataschema_extract(data_proc_elem)
}else{
Expand Down Expand Up @@ -178,7 +363,6 @@ harmo_process <- function(
as.list
names(dpe) <- sort(unique(bind_rows(dpe)$input_dataset))


vars <- extract_var(unique(data_proc_elem$dataschema_variable))
dataschema <-
data_dict_filter(
Expand Down Expand Up @@ -281,6 +465,7 @@ Please correct elements and reprocess.')

harmonized_dossier[[i]] <-
harmonized_dossier[[i]] %>% bind_rows(harmonization_id)

harmonized_dossier[[i]] <- harmonized_dossier_init[[i]] <-
as_dataset(
harmonized_dossier[[i]],
Expand Down Expand Up @@ -1085,7 +1270,7 @@ harmo_process_recode <- function(process_rule_slice){
mutate(
input_variables = str_remove_all(.data$`input_variables`,'`'),
replacement =
.data$`script`,
.data$`script` %>% str_squish(),
replacement =
str_replace_all(.data$`replacement`,"\'","{madshapR::apostrophy}"),
replacement =
Expand Down Expand Up @@ -1229,6 +1414,8 @@ harmo_process_undetermined <- function(process_rule_slice){
#' Data Processing Elements used in processing as attributes.
#'
#' @param harmonized_dossier A list containing the harmonized dataset(s).
#' @param show_warnings Whether the function should print warnings or not.
#' TRUE by default.
#'
#' @returns
#' Nothing to be returned. The function prints messages in the console,
Expand All @@ -1249,7 +1436,7 @@ harmo_process_undetermined <- function(process_rule_slice){
#' @importFrom utils capture.output
#'
#' @export
show_harmo_error <- function(harmonized_dossier){
show_harmo_error <- function(harmonized_dossier, show_warnings = TRUE){

# list of primary error in the Data Processing Elements.
# print the list of error + the index
Expand Down Expand Up @@ -1316,12 +1503,21 @@ show_harmo_error <- function(harmonized_dossier){
.data$`input_dataset`, " \n"),
rule = paste0(.data$`Mlstr_harmo::algorithm`))

if(show_warnings == FALSE){
report_log <-
report_log %>%
dplyr::filter(.data$warning != TRUE)
}

if(nrow(report_log) > 0){
message(bold(
"\n
- ERROR/WARNING STATUS DETAILS: -------------------------------------------\n"),
"\nHere is the list of the errors and warnings encountered in the process
of harmonization:\n")
paste0("\n
- ERROR",ifelse(show_warnings == TRUE,"/WARNING",""),
" STATUS DETAILS: -------------------------------------------\n")),
paste0("\nHere is the list of the errors",ifelse(show_warnings == TRUE," and warnings",""),
" encountered in the process
of harmonization:\n"))

for(i in seq_len(nrow(report_log))){
message(
"---------------------------------------------------------------------------\n")
Expand Down Expand Up @@ -1923,7 +2119,7 @@ as_dataschema_mlstr <- function(object){
#'
#' }
#'
#' @import dplyr
#' @import dplyr fabR
#' @importFrom rlang .data
#'
#' @export
Expand All @@ -1935,18 +2131,17 @@ as_harmonized_dossier <- function(
harmonized_col_dataset = attributes(object)$`Rmonize::harmonized_col_dataset`,
harmonized_data_dict_apply = FALSE){

# check object
as_dossier(object)
silently_run(dossier_create(object))

if(!is.logical(harmonized_data_dict_apply))
stop(call. = FALSE,
'`harmonized_data_dict_apply` must be TRUE or F*ALSE (TRUE by default)')
'`harmonized_data_dict_apply` must be TRUE or FALSE (TRUE by default)')

# check the id column (mandatory to exist)
if(is.null(harmonized_col_id))
stop(call. = FALSE,
'`harmonized_col_id` must be provided')
# check if exists
# check if col_id exists
bind_rows(
object %>% lapply(function(x) x %>%
mutate(across(everything(),as.character)))) %>%
Expand Down Expand Up @@ -2032,17 +2227,19 @@ name list of variables.")
message(bold("\n
- CREATION OF HARMONIZED DATA DICTIONARY : --------------------------------\n"))

harmo_data_dict <- dataschema
harmo_data_dict[['Variables']][['Mlstr_harmo::rule_category']] <- NULL
harmo_data_dict[['Variables']][['Mlstr_harmo::algorithm']] <- NULL
harmo_data_dict[['Variables']][['Rmonize::r_script']] <- NULL
harmo_data_dict[['Variables']][['Mlstr_harmo::comment']] <- NULL
harmo_data_dict[['Variables']][['Mlstr_harmo::status']] <- NULL
harmo_data_dict[['Variables']][['Mlstr_harmo::status_detail']] <- NULL


for(i in names(object)){
# stop()}

harmo_data_dict <- dataschema
harmo_data_dict[['Variables']][['Mlstr_harmo::rule_category']] <- NULL
harmo_data_dict[['Variables']][['Mlstr_harmo::algorithm']] <- NULL
harmo_data_dict[['Variables']][['Rmonize::r_script']] <- NULL
harmo_data_dict[['Variables']][['Mlstr_harmo::comment']] <- NULL
harmo_data_dict[['Variables']][['Mlstr_harmo::status']] <- NULL
harmo_data_dict[['Variables']][['Mlstr_harmo::status_detail']] <- NULL

input_data_proc_elem <-
data_proc_elem %>%
rename("name" = "dataschema_variable") %>%
Expand Down
Loading

0 comments on commit 99565ec

Please sign in to comment.