-
Notifications
You must be signed in to change notification settings - Fork 0
/
03-harmonized_data_evaluate.R
406 lines (374 loc) · 14.7 KB
/
03-harmonized_data_evaluate.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
#' @title
#' Generate an assessment report for a harmonized dossier
#'
#' @description
#' Assesses the content and structure of a harmonized dossier and generates
#' reports of the results. This function can be used to evaluate data structure,
#' presence of specific fields, coherence across elements, and data dictionary
#' formats.
#'
#' @details
#' A harmonized dossier is a named list containing one or more data frames,
#' which are harmonized datasets. A harmonized dossier is generally the
#' product of applying processing to a dossier object. The name of each
#' harmonized dataset (data frame) is taken from the reference input dataset.
#' A harmonized dossier also contains the DataSchema and
#' Data Processing Elements used in processing as attributes.
#'
#' A DataSchema is the list of core variables to generate across datasets and
#' related metadata. A DataSchema object is a list of data frames with elements
#' named 'Variables' (required) and 'Categories' (if any). The 'Variables'
#' element must contain at least the `name` column, and the 'Categories'
#' element must contain at least the `variable` and `name` columns to be usable
#' in any function. In 'Variables' the `name` column must also have unique
#' entries, and in 'Categories' the combination of `variable` and `name` columns
#' must also be unique.
#'
#' A taxonomy is a classification schema that can be defined for variable
#' attributes. A taxonomy is usually extracted from an
#' [Opal environment](https://www.obiba.org/pages/products/opal/), and a
#' taxonomy object is a data frame that must contain at least the columns
#' `taxonomy`, `vocabulary`, and `terms`. Additional details about Opal
#' taxonomies are
#' [available online](https://opaldoc.obiba.org/en/latest/web-user-guide/administration/taxonomies.html).
#'
#' The object may be specifically formatted to be compatible with additional
#' [Maelstrom Research software](https://maelstrom-research.org/page/software),
#' in particular [Opal environments](https://www.obiba.org/pages/products/opal/).
#'
#' @param harmonized_dossier A list containing the harmonized dataset(s).
#' @param taxonomy An optional data frame identifying a variable classification
#' schema.
#' @param dataschema A DataSchema object.
#' @param as_dataschema_mlstr Whether the output DataSchema should be coerced
#' with specific format restrictions for compatibility with other
#' Maelstrom Research software. TRUE by default.
#'
#' @returns
#' A list of data frames containing assessment reports for each harmonized dataset.
#'
#' @examples
#' {
#'
#' # use Rmonize_DEMO provided by the package
#' library(dplyr)
#'
#' glimpse(harmonized_dossier_evaluate(Rmonize_DEMO$harmonized_dossier))
#'
#' }
#'
#' @import dplyr stringr tidyr haven
#' @importFrom crayon bold
#' @importFrom rlang .data
#'
#' @export
harmonized_dossier_evaluate <- function(
    harmonized_dossier,
    dataschema = attributes(harmonized_dossier)$`Rmonize::DataSchema`,
    taxonomy = NULL,
    as_dataschema_mlstr = TRUE){

  # future dev
  # assess harmonized data dictionary
  # exclude impossible from the evaluation

  # ---- argument checks -----------------------------------------------------
  # as_taxonomy() is called for validation only; its return value is discarded.
  if(!is.null(taxonomy)) as_taxonomy(taxonomy)

  # The flag must be a single, non-missing logical. The default is TRUE (see
  # the signature), so the error message states that default.
  if(!is.logical(as_dataschema_mlstr) ||
     length(as_dataschema_mlstr) != 1L ||
     is.na(as_dataschema_mlstr))
    stop(call. = FALSE,
         '`as_dataschema_mlstr` must be TRUE or FALSE (TRUE by default)')

  # ---- evaluation ----------------------------------------------------------
  # The harmonized datasets are pooled, and the pooled dataset is evaluated
  # against the DataSchema, which is passed as the data dictionary.
  pooled_harmonized_dataset <-
    pooled_harmonized_dataset_create(harmonized_dossier)

  report_list <-
    dataset_evaluate(
      dataset = pooled_harmonized_dataset,
      data_dict = dataschema,
      taxonomy = taxonomy,
      as_data_dict_mlstr = as_dataschema_mlstr)

  # ---- relabelling ---------------------------------------------------------
  # Prefix the generic report labels with "Harmonized".
  # NOTE(review): the replacement text "assessement" is misspelled; it is kept
  # as is because it is part of the returned names - confirm before changing.
  # NOTE(review): the renaming is applied to the names of EACH element of
  # `report_list`, not to the names of `report_list` itself. If
  # dataset_evaluate() returns a flat list of data frames, this relabels
  # column names rather than report elements - confirm against
  # dataset_evaluate().
  relabel_as_harmonized <- function(x){
    names(x) <- str_replace(names(x),"Data dictionary summary",
                            "Harmonized Data dictionary summary")
    names(x) <- str_replace(names(x),"Data dictionary assessment",
                            "Harmonized Data dictionary assessement")
    names(x) <- str_replace(names(x),"Dataset assessment",
                            "Harmonized Dataset assessment")
    x
  }

  report_list <- lapply(report_list, relabel_as_harmonized)

  return(report_list)
}
#' @title
#' Generate an assessment report for Data Processing Elements
#'
#' @description
#' `r lifecycle::badge("experimental")`
#' Assesses the content and structure of a Data Processing Elements object and
#' generates reports of the results. This function can be used to evaluate data
#' structure, presence of specific fields, coherence across elements, and data
#' dictionary formats.
#'
#' @details
#' The Data Processing Elements specifies the algorithms used to process input
#' variables into harmonized variables in the DataSchema format. It also
#' contains metadata used to generate documentation of the processing.
#' A Data Processing Elements object is a data frame with specific columns
#' used in data processing: `dataschema_variable`, `input_dataset`,
#' `input_variables`, `Mlstr_harmo::rule_category` and `Mlstr_harmo::algorithm`.
#' To initiate processing, the first entry must be the creation of a harmonized
#' primary identifier variable (e.g., participant unique ID).
#'
#' A taxonomy is a classification schema that can be defined for variable
#' attributes. A taxonomy is usually extracted from an
#' [Opal environment](https://www.obiba.org/pages/products/opal/), and a
#' taxonomy object is a data frame that must contain at least the columns
#' `taxonomy`, `vocabulary`, and `terms`. Additional details about Opal
#' taxonomies are
#' [available online](https://opaldoc.obiba.org/en/latest/web-user-guide/administration/taxonomies.html).
#'
#' @param data_proc_elem A Data Processing Elements object.
#' @param taxonomy An optional data frame identifying a variable classification
#' schema.
#'
#' @returns
#' A list of data frames containing assessment reports.
#'
#' @examples
#' {
#'
#' # use Rmonize_DEMO provided by the package
#'
#' data_proc_elem <- Rmonize_DEMO$`data_processing_elements - final`
#' data_proc_elem_evaluate(data_proc_elem)
#'
#' }
#'
#' @import dplyr fabR
#' @importFrom rlang .data
#' @importFrom crayon bold
#'
#' @noRd
data_proc_elem_evaluate <- function(data_proc_elem, taxonomy = NULL){

  # Coerce the input, then index it with a "Row number" column; that column is
  # what the assessment rows below are keyed on.
  data_proc_elem <-
    add_index(as_data_proc_elem(data_proc_elem), "Row number", .force = TRUE)

  # Validation only: the value returned by as_taxonomy() is not used.
  if(!is.null(taxonomy)) as_taxonomy(taxonomy)

  message(
"- DATA PROCESSING ASSESSMENT ------------------------------------------------")

  # The report always carries the (indexed) Data Processing Elements.
  report <- list(`Data Processing Elements summary` = data_proc_elem)

  # Placeholders for checks that are not implemented yet: they contribute an
  # empty "Row number" column and no rows.
  empty_findings       <- tibble("Row number" = integer(0))
  test_duplicated_rule <- empty_findings
  test_possible_ruling <- empty_findings

  message(" Assess the rule category declared")

  # Rule categories accepted in `Mlstr_harmo::rule_category`.
  known_rule_categories <- c(
    "add_variable",
    "case_when",
    "direct_mapping",
    "id_creation",
    "impossible",
    "merge_variable",
    "operation",
    "other",
    "paste",
    "recode",
    "rename",
    "undetermined")

  # `value` keeps the declared category only when it is not a known one;
  # known categories become NA and are dropped (as are missing categories).
  flagged_categories <-
    mutate(
      data_proc_elem,
      value =
        ifelse(
          .data[["Mlstr_harmo::rule_category"]] %in% known_rule_categories,
          NA_character_,
          .data[["Mlstr_harmo::rule_category"]]))

  flagged_categories <- filter(flagged_categories, !is.na(.data[["value"]]))

  test_names_harmo_rule <-
    flagged_categories %>%
    mutate(condition = "[ERR] - Rule category name doesn't exist") %>%
    select("Row number","value","condition")

  # Stack every check, order by row, then turn all columns into character and
  # drop duplicated findings.
  all_findings <-
    bind_rows(test_names_harmo_rule, test_duplicated_rule) %>%
    bind_rows(test_possible_ruling) %>%
    select("Row number", matches("value"), matches("condition")) %>%
    arrange(.data[["Row number"]])

  assessment <-
    tibble(distinct(mutate(all_findings, across(everything(), as.character))))

  message(" Generate report")

  # The assessment element is only present when something was flagged.
  if(nrow(assessment) == 0){
    message("\n The Data Processing Elements contains no error/warning.")
  }else{
    report$`Data Processing Elements assessment` <- assessment
  }

  message(bold(
"
- WARNING MESSAGES (if any): --------------------------------------------\n"))

  return(report)

  # future dev (kept for reference, not executed)
  # dossier_name <- tibble(dossier = as.character(), dataset = as.character())
  # for(i in names(dossier)) for(j in names(dossier[[i]])){
  # dossier_name <- dossier_name %>% add_row(dossier = i, dataset = j)}
  #
  # dpe_name <- tibble(dossier = as.character(), dataset = as.character())
  # for(i in names(data_proc_elem)) for(j in names(data_proc_elem[[i]])){
  # dpe_name <- dpe_name %>% add_row(dossier = i, dataset = j)}
  #
  # no_dpe <-
  # anti_join(dossier_name, dpe_name,by = c("dossier", "dataset")) %>%
  # group_by(.data$`dossier`) %>%
  # summarise(dataset = paste(dataset, collapse = " - ")) %>%
  # ungroup() %>%
  # unite("value", .data$`dossier`, .data$`dataset`, sep = " : ") %>%
  # summarise(value = paste(.data$`value`, collapse = " \n")) %>% pull
  #
  # no_dossier <-
  # anti_join(dpe_name, dossier_name,by = c("dossier", "dataset")) %>%
  # group_by(.data$`dossier`) %>%
  # summarise(dataset = paste(dataset, collapse = " - ")) %>%
  # ungroup() %>%
  # unite("value", .data$`dossier`, .data$`dataset`, sep = " : ") %>%
  # summarise(value = paste(.data$`value`, collapse = " \n")) %>% pull
  #
  # if(inner_join(dossier_name, dpe_name,by = c("dossier", "dataset")) %>%
  # nrow == 0){
  # stop(
  # "
  # The harmonization process has been interrupted because some mismatch between
  # dataset(s) and Data Processing Elements have been found. Please make
  # sure Data Processing Elements (such as input_dataset) match names
  # of your dataset(s)
  #
  # input dataset(s) names:\n",no_dpe,"\n",
  # "\n and input_dataset(s) in dataprocessing elements:\n",
  # no_dossier,
  # "\n\n" )}
  #
  # if(nchar(no_dpe)){
  # warning(
  # "\nNo Data Processing Elements found for:\n",no_dpe,
  # "\nThese dataset will not be harmonized.\n", call. = FALSE)
  #
  # }
  #
  # if(nchar(no_dossier)){
  # warning(
  # "\nNo dataset found for:\n",no_dossier,
  # "\nThese Data Processing Elements have not been processed.\n",
  # call. = FALSE)
  #
  # }
}
#' @title
#' Generate an assessment report for a DataSchema
#'
#' @description
#' Assesses the content and structure of a DataSchema object and generates
#' reports of the results. This function can be used to evaluate data structure,
#' presence of specific fields, coherence across elements, and data dictionary
#' formats.
#'
#' @details
#' A DataSchema is the list of core variables to generate across datasets and
#' related metadata. A DataSchema object is a list of data frames with elements
#' named 'Variables' (required) and 'Categories' (if any). The 'Variables'
#' element must contain at least the `name` column, and the 'Categories'
#' element must contain at least the `variable` and `name` columns to be usable
#' in any function. In 'Variables' the `name` column must also have unique
#' entries, and in 'Categories' the combination of `variable` and `name` columns
#' must also be unique.
#'
#' A taxonomy is a classification schema that can be defined for variable
#' attributes. A taxonomy is usually extracted from an
#' [Opal environment](https://www.obiba.org/pages/products/opal/), and a
#' taxonomy object is a data frame that must contain at least the columns
#' `taxonomy`, `vocabulary`, and `terms`. Additional details about Opal
#' taxonomies are
#' [available online](https://opaldoc.obiba.org/en/latest/web-user-guide/administration/taxonomies.html).
#'
#' @param dataschema A DataSchema object.
#' @param taxonomy An optional data frame identifying a variable classification
#' schema.
#'
#' @returns
#' A list of data frames containing assessment reports.
#'
#' @examples
#' {
#'
#' # use Rmonize_DEMO provided by the package
#'
#' library(dplyr)
#' library(madshapR) # data_dict_filter
#'
#' dataschema <-
#' Rmonize_DEMO$`dataschema - final` %>%
#' data_dict_filter("name == 'adm_unique_id'")
#'
#' dataschema_evaluate(dataschema)
#'
#' }
#'
#' @import dplyr haven
#' @importFrom rlang .data
#'
#' @export
dataschema_evaluate <- function(dataschema, taxonomy = NULL){

  # (disabled) explicit coercion of the input before evaluation:
  # dataschema <-
  # as_dataschema(dataschema,as_dataschema_mlstr = TRUE) %>%
  # as_data_dict_mlstr()

  # The DataSchema is evaluated as a data dictionary, with
  # `as_data_dict_mlstr` always set to TRUE.
  report <- data_dict_evaluate(dataschema,taxonomy,as_data_dict_mlstr = TRUE)

  # Prefix the generic report labels with "Harmonized". The two replacements
  # are applied one after the other on the element names.
  # NOTE(review): "assessement" is misspelled in the replacement text; kept as
  # is because it is part of the returned names.
  report_labels <- names(report)
  report_labels <-
    str_replace(report_labels,"Data dictionary summary",
                "Harmonized Data dictionary summary")
  report_labels <-
    str_replace(report_labels,"Data dictionary assessment",
                "Harmonized Data dictionary assessement")
  names(report) <- report_labels

  # (disabled) dossier / Data Processing Elements cross-checks, kept for
  # reference:
  # dossier_name <- tibble(dossier = as.character(), dataset = as.character())
  # for(i in names(dossier)) for(j in names(dossier[[i]])){
  # dossier_name <- dossier_name %>% add_row(dossier = i, dataset = j)}
  #
  # dpe_name <- tibble(dossier = as.character(), dataset = as.character())
  # for(i in names(data_proc_elem)) for(j in names(data_proc_elem[[i]])){
  # dpe_name <- dpe_name %>% add_row(dossier = i, dataset = j)}
  #
  # no_dpe <-
  # anti_join(dossier_name, dpe_name,by = c("dossier", "dataset")) %>%
  # group_by(.data$`dossier`) %>%
  # summarise(dataset = paste(dataset, collapse = " - ")) %>%
  # ungroup() %>%
  # unite("value", .data$`dossier`, .data$`dataset`, sep = " : ") %>%
  # summarise(value = paste(.data$`value`, collapse = " \n")) %>% pull
  #
  # no_dossier <-
  # anti_join(dpe_name, dossier_name,by = c("dossier", "dataset")) %>%
  # group_by(.data$`dossier`) %>%
  # summarise(dataset = paste(dataset, collapse = " - ")) %>%
  # ungroup() %>%
  # unite("value", .data$`dossier`, .data$`dataset`, sep = " : ") %>%
  # summarise(value = paste(.data$`value`, collapse = " \n")) %>% pull
  #
  # if(inner_join(dossier_name, dpe_name,by = c("dossier", "dataset")) %>%
  # nrow == 0){
  # stop(
  # "
  # The harmonization process has been interrupted because some mismatch between
  # dataset(s) and Data Processing Elements have been found. Please make
  # sure Data Processing Elements (such as input_dataset) match names
  # of your datasets
  #
  # dossier(s) and input dataset(s) names:\n",no_dpe,"\n",
  # "\n and input_dataset(s) in dataprocessing elements:\n",
  # no_dossier,
  # "\n\n" )}
  #
  # if(nchar(no_dpe)){
  # warning(
  # "\nNo Data Processing Elements found for:\n",no_dpe,
  # "\nThese dataset will not be harmonized.\n", call. = FALSE)
  #
  # }
  #
  # if(nchar(no_dossier)){
  # warning(
  # "\nNo dataset found for:\n",no_dossier,
  # "\nThese Data Processing Elements have not been processed.\n",
  # call. = FALSE)
  #
  # }

  return(report)
}