## pls-modeling.R
## Perform calibration sampling based on spectral PCA
## or random split -------------------------------------------------------------
split_data_q <- function(
spec_chem,
split_method,
evaluation_method = "test_set",
ratio_val,
ken_sto_pc = 2,
print = TRUE,
invert = FALSE, env = parent.frame()) {
MIR <- model <- type <- PC1 <- PC2 <- NULL
# Evaluate the invert argument in the parent function (fit_pls)
invert <- eval(invert, envir = parent.frame())
# Evaluate the validation argument in the parent function (fit_pls)
evaluation_method <- eval(evaluation_method, envir = parent.frame())
# Slice based on sample_id if spectral data is in tibble class
if (tibble::is_tibble(spec_chem)) {
spec_chem <- spec_chem %>%
dplyr::group_by(!!rlang::sym("sample_id")) %>%
dplyr::slice(1L)
}
if (evaluation_method == "test_set") {
# pc = 0.99 before !!!
ken_sto_pc <- eval(ken_sto_pc, envir = parent.frame())
if (invert == FALSE) {
## Select calibration set by the Kennard-Stone algorithm
# Check if tibble; if yes, bind the list of data.tables into
# one data.table for the spectral data
if (tibble::is_tibble(spec_chem)) {
spc_pre <- as.matrix(data.table::rbindlist(spec_chem$spc_pre))
# k = number of samples to select
# ken_sto_pc = if provided, the number of principal components
# (see ?kenStone)
sel <- prospectr::kenStone(
X = spc_pre,
k = round((1 - ratio_val) * nrow(spec_chem)),
pc = substitute(ken_sto_pc)
)
} else {
sel <- prospectr::kenStone(
X = spec_chem$MIR,
k = round((1 - ratio_val) * nrow(spec_chem)),
pc = substitute(ken_sto_pc)
)
}
# Split MIR data into calibration and validation sets using
# the results of Kennard-Stone calibration sampling;
# select by row index of calibration samples
val_set <- spec_chem[-sel$model, ]
cal_set <- spec_chem[sel$model, ]
# Optionally split the calibration (train) and validation (test) sets
# randomly, using a function from the modelr package
# !!! Important note: the option to split the calibration and validation
# sets randomly is still experimental, and a modification for the
# PC space projection is not yet implemented for graphical output;
# the p_pc ggplot2 output needs to be updated for split_method = "random"
if (split_method == "random") {
# Split data into test and training sets using the modelr package
df_split <- modelr::crossv_mc(spec_chem, n = 1, test = ratio_val)
# Select train of df_split and convert back into tibble,
# assign to calibration set
cal_set <- tibble::as_tibble(df_split[1, ][["train"]][[1]])
# Select test of df_split and convert back into tibble,
# assign to validation set
val_set <- tibble::as_tibble(df_split[1, ][["test"]][[1]])
}
sel_df_cal <- data.frame(sel$pc[sel$model, 1:2])
sel_df_val <- data.frame(sel$pc[-sel$model, 1:2])
} else {
if (tibble::is_tibble(spec_chem)) {
spc_pre <- as.matrix(data.table::rbindlist(spec_chem$spc_pre))
sel <- prospectr::kenStone(
X = spc_pre,
k = round(ratio_val * nrow(spec_chem)),
pc = substitute(ken_sto_pc)
)
} else {
## Select validation set by the Kennard-Stone algorithm
sel <- prospectr::kenStone(
X = spec_chem$MIR,
k = round(ratio_val * nrow(spec_chem)),
pc = substitute(ken_sto_pc)
)
}
sel_df_cal <- data.frame(sel$pc[-sel$model, 1:2])
sel_df_val <- data.frame(sel$pc[sel$model, 1:2])
# Split MIR data into calibration and validation sets using
# the results of Kennard-Stone calibration sampling;
# select by row index of validation samples (invert = TRUE)
val_set <- spec_chem[sel$model, ]
cal_set <- spec_chem[-sel$model, ]
}
# Add additional columns to the calibration and validation sets for plotting
sel_df_cal$type <- as.factor(
rep("calibration", nrow(sel_df_cal))
)
sel_df_val$type <- as.factor(
rep("validation", nrow(sel_df_val))
)
sel_df <- rbind(sel_df_cal, sel_df_val)
# Compute the PC1 to PC2 aspect ratio (currently unused; the plot below
# applies ggplot2::coord_fixed(ratio = 1))
ratio <- with(sel_df, diff(range(PC1)) / diff(range(PC2)))
# Save graph showing the selected calibration and validation samples
# for the first two principal components (pc)
p_pc <- ggplot2::ggplot(data = sel_df) +
ggplot2::geom_point(
ggplot2::aes(x = PC1, y = PC2, shape = type),
size = 4
) +
ggplot2::coord_fixed(ratio = 1) +
ggplot2::scale_shape_manual(values = c(1, 19)) +
ggplot2::scale_colour_manual(values = c("black", "red")) +
ggplot2::theme_bw() +
ggplot2::theme(legend.title = ggplot2::element_blank())
# Collect outputs in a list
list_out <- list(
calibration = cal_set,
validation = val_set,
p_pc = p_pc
)
list_out
# Check number of observations (rows) for calibration set
# nrow(cal_set)
} else {
cal_set <- spec_chem
list(
calibration = cal_set,
validation = NULL,
p_pc = NULL
)
}
}
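## A minimal usage sketch (not run); `spec_chem_demo` is a hypothetical
## tibble with a `sample_id` column and preprocessed spectra in a `spc_pre`
## list-column, as produced earlier in the simplerspec workflow:
if (FALSE) {
  splits <- split_data_q(
    spec_chem = spec_chem_demo,
    split_method = "ken_stone",
    evaluation_method = "test_set",
    ratio_val = 1 / 3,
    ken_sto_pc = 2
  )
  # List with calibration and validation tibbles and a PC1 vs. PC2 plot
  str(splits, max.level = 1)
  splits$p_pc
}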
# trainControl generating helper function
control_train_q <- function(x, response, resampling_seed,
env = parent.frame()) {
calibration <- NULL
# List of calibration and validation samples
# set up a cross-validation scheme
# create 10 folds that we will keep for the different
# modeling approaches to allow comparison
# randomly break the data into 10 partitions
# note that k is the total number of samples for leave-one-out
# use the substitute() function for non-standard evaluation of the
# response argument (it captures the code used to compute the value
# rather than the value itself); see chapter 13.1 "Capturing expressions"
# in Advanced R (Hadley Wickham), p. 270
response <- eval(response, x$calibration, env)
# Set seed for creating resampling indices
set.seed(eval(resampling_seed, env))
idx <- caret::createFolds(y = response, k = 10, returnTrain = TRUE)
# inject the index in the trainControl object
caret::trainControl(
method = "cv", index = idx,
savePredictions = TRUE, selectionFunction = "oneSE"
)
}
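## Because the fold indices are created once and injected into trainControl,
## the same resampling partitions can be reused across models for a fair
## comparison. A sketch (not run), assuming `cal_demo` is a hypothetical
## calibration tibble with a `total_carbon` reference column and `spc_demo`
## a matrix of preprocessed spectra:
if (FALSE) {
  tr_control <- control_train_q(
    x = list(calibration = cal_demo),
    response = quote(total_carbon),
    resampling_seed = 123
  )
  # Same folds for both models
  pls_fit <- caret::train(
    x = spc_demo, y = cal_demo$total_carbon,
    method = "pls", trControl = tr_control
  )
  rf_fit <- caret::train(
    x = spc_demo, y = cal_demo$total_carbon,
    method = "rf", trControl = tr_control
  )
}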
## Adapt model tuning to leave-one-out cross-validation ========================
## trainControl generating helper function
control_train_loocv_q <- function(x, response, env = parent.frame()) {
calibration <- NULL
# r: response
response <- eval(response, x$calibration, env)
# Set up leave-one-out cross-validation
caret::trainControl(
method = "LOOCV", savePredictions = TRUE,
selectionFunction = "oneSE"
)
}
## Adapt model tuning to repeated k-fold cross-validation ======================
## trainControl generating helper function
control_train_rcv_q <- function(x, response, resampling_seed,
env = parent.frame()) {
calibration <- NULL
# r: response
response <- eval(response, x$calibration, env)
# Set seed for creating resampling indices
set.seed(eval(resampling_seed, env))
# Set up 5 times repeated 10-fold cross-validation
idx <- caret::createMultiFolds(y = response, k = 10, times = 5) # update ***
# Inject the index in the trainControl object
caret::trainControl(
method = "repeatedcv", index = idx,
savePredictions = TRUE, selectionFunction = "oneSE"
)
}
## Fitting models without parameter tuning =====================================
# 5.9; https://topepo.github.io/caret/model-training-and-tuning.html
## trainControl generating helper function
control_train_none_q <- function(x, response, resampling_seed,
env = parent.frame()) {
calibration <- NULL
# r: response
response <- eval(response, x$calibration, env)
# Set seed for creating resampling indices
set.seed(eval(resampling_seed, env))
# Set trainControl argument to "none" so that caret::train will only fit
# one model to the entire training set;
# use a fixed number of PLS components instead
idx <- caret::createFolds(y = response, k = 10, returnTrain = TRUE) # update ***
# inject the index in the trainControl object
caret::trainControl(
method = "none", index = idx, savePredictions = TRUE,
selectionFunction = "oneSE"
)
}
## Standard evaluation version of the trainControl helper function;
## the default seed matches fit_pls()
control_train <- function(x, response, resampling_seed = 123,
env = parent.frame()) {
control_train_q(x, substitute(response), substitute(resampling_seed), env)
}
# Fit a PLS regression model using the caret package ---------------------------
## Train a PLS regression model
train_pls_q <- function(
x,
evaluation_method = "test_resampling",
response, tr_control, env = parent.frame(),
pls_ncomp_max = 20, ncomp_fixed = 5,
center, scale, tuning_method = "resampling") {
# Fit a partial least squares (PLS) regression model;
# optionally center and scale the MIR predictors
calibration <- MIR <- NULL
r <- eval(response, x$calibration, env)
# ? Is it really necessary to evaluate this in the parent frame?
pls_ncomp_max <- eval(pls_ncomp_max, envir = parent.frame())
# Evaluate fixed number of PLS regression components
# from ncomp_fixed object in parent frame (fit_pls function)
ncomp_fixed <- eval(ncomp_fixed, envir = parent.frame())
# Test whether the spectral object has the class "tibble"
if (!tibble::is_tibble(x$calibration)) {
stop("spec_chem needs to be of class tibble")
}
spc_pre <- data.table::rbindlist(x$calibration$spc_pre)
if (scale == TRUE && center == TRUE) {
if (tuning_method == "resampling") {
# Fit model with parameter tuning
pls_model <- caret::train(
x = spc_pre, y = r,
method = "pls",
tuneLength = pls_ncomp_max,
trControl = tr_control,
preProcess = c("center", "scale")
)
} else if (tuning_method == "none") {
# Fit model without parameter tuning
pls_model <- caret::train(
x = spc_pre, y = r,
method = "pls",
trControl = tr_control,
preProcess = c("center", "scale"),
tuneGrid = data.frame(ncomp = ncomp_fixed)
)
}
} else {
# No centering and scaling; note that only the resampling tuning path
# is handled in this branch
pls_model <- caret::train(
x = spc_pre, y = r,
method = "pls",
tuneLength = pls_ncomp_max,
trControl = tr_control
)
}
# Return the fitted caret::train object explicitly
pls_model
}
## Standard evaluation version for training a PLS regression model
train_pls <- function(
x, response, evaluation_method = "resampling",
tr_control, center = TRUE, scale = TRUE,
env = parent.frame()) {
train_pls_q(
x = x, evaluation_method = substitute(evaluation_method),
response = substitute(response), tr_control = tr_control,
center = center, scale = scale, env = env
)
}
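## Usage sketch (not run): `splits` as returned by split_data_q() and
## `tr_control` as returned by one of the trainControl helpers above;
## `total_carbon` is a hypothetical reference column in the calibration
## tibble:
if (FALSE) {
  pls_model <- train_pls_q(
    x = splits,
    response = quote(total_carbon),
    tr_control = tr_control,
    center = TRUE, scale = TRUE,
    pls_ncomp_max = 20
  )
  pls_model$bestTune # number of PLS components chosen by the "oneSE" rule
}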
# Fit a random forest model using the caret package ----------------------------
## Train a random forest model
train_rf_q <- function(
x,
validation = TRUE, evaluation_method = "resampling",
response, tr_control, ntree_max = 500, env = parent.frame()) {
# Fit a random forest (rf) model;
# center and scale the MIR predictors
calibration <- MIR <- NULL
response <- eval(response, x$calibration, env)
ntree_max <- eval(ntree_max, envir = parent.frame())
if (tibble::is_tibble(x$calibration)) {
spc_pre <- data.table::rbindlist(x$calibration$spc_pre)
rf_model <- caret::train(
x = spc_pre, y = response,
method = "rf",
ntree = ntree_max,
trControl = tr_control,
preProcess = c("center", "scale")
)
} else {
rf_model <- caret::train(
x = x$calibration$MIR, y = response,
method = "rf",
ntree = ntree_max,
trControl = tr_control,
preProcess = c("center", "scale")
)
}
rf_model
}
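## Usage sketch (not run), analogous to the PLS helper; `splits`,
## `tr_control`, and `total_carbon` as in the hypothetical PLS example above:
if (FALSE) {
  rf_model <- train_rf_q(
    x = splits,
    response = quote(total_carbon),
    tr_control = tr_control,
    ntree_max = 500
  )
}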
# Evaluate model performance (validation and cross-validation) -----------------
## Helper function to transform repeated k-fold cross-validation hold-out
## predictions
transform_cvpredictions <- function(cal_index, predobs_cv) {
predobs_cv <- dplyr::full_join(cal_index, predobs_cv, by = "rowIndex") %>%
dplyr::group_by(!!rlang::sym("sample_id")) %>%
# Average observed and predicted values
dplyr::mutate(
"obs" = mean(!!rlang::sym("obs")),
"pred_sd" = sd(!!rlang::sym("pred"))
) %>%
# Add 95% confidence interval for mean hold-out predictions from
# repeated k-fold cross-validation (dplyr::funs() is deprecated,
# so call sem_ci() directly within mutate())
dplyr::mutate(
"pred_sem_ci" = sem_ci(!!rlang::sym("pred"))
) %>%
# Add mean hold-out predictions from repeated k-fold cross-validation
dplyr::mutate("pred" = mean(!!rlang::sym("pred"))) %>%
# Slice data set to only have one row per sample_id
dplyr::slice(1L)
}
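## Note: sem_ci() is defined elsewhere in the package. A minimal sketch of
## such a helper, assuming it returns the half-width of a t-based 95%
## confidence interval for the mean of the repeated hold-out predictions:
sem_ci_sketch <- function(x, level = 0.95) {
  n <- sum(!is.na(x))
  # t quantile times the standard error of the mean
  qt(1 - (1 - level) / 2, df = n - 1) * sd(x, na.rm = TRUE) / sqrt(n)
}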
## Evaluate PLS performance
evaluate_model_q <- function(
x, model, response,
evaluation_method, tuning_method, resampling_method,
print = TRUE, env = parent.frame()) {
# Set global variables to NULL to avoid R CMD check notes
MIR <- object <- dataType <- obs <- pred_sem_ci <- pred <- NULL
ncomp <- finalModel <- rmse <- r2 <- rpd <- n <- calibration <- NULL
# Collect fitted object into a list
list_models <- list("final_model" = model)
# Evaluate validation argument in parent.frame !!!
evaluation_method <- eval(evaluation_method, envir = parent.frame())
# Evaluate tuning_method argument in parent.frame
tuning_method <- eval(tuning_method, envir = parent.frame())
# Evaluate resampling_method argument in parent.frame
resampling_method <- eval(resampling_method, envir = parent.frame())
# Extract best tuning parameters and associated cv predictions
if (evaluation_method == "test_set") {
# Calculate training (calibration) and test (validation) data
# predictions based on pls model with calibration data
r <- eval(response, x$validation, env)
if (!tibble::is_tibble(x$validation)) {
stop("Spectra and reference data need to be provided as a tibble
(class `tbl_df`, `tbl`, `data.frame`)")
}
spc_pre <- data.table::rbindlist(x$validation$spc_pre)
predobs <- caret::extractPrediction(list_models,
testX = spc_pre, testY = r
) # update ***
# Append sample_id column to the predobs data frame;
# combine sample_id from the calibration and validation sets
predobs$sample_id <- c(
x$calibration$sample_id, x$validation$sample_id
)
# Create new data frame column <object>
predobs$object <- predobs$model
# Replace levels "Training" and "Test" in dataType column
# by "Calibration" and "Validation" (rename levels of factor)
predobs$dataType <- plyr::revalue(
predobs$dataType,
c("Test" = "Validation", "Training" = "Calibration")
)
# Change the order of rows in the data frame
# Calibration as first level (show Calibration in ggplot graph
# on left panel)
predobs$dataType <- factor(predobs$dataType,
levels = c("Calibration", "Validation")
)
# Calculate model performance indices by model and dataType;
# uses the plyr package and the function summary_df() of SPECmisc.R
stats <- plyr::ddply(
predobs, c("model", "dataType"),
function(x) summary_df(x, "obs", "pred")
)
# Check whether method = "none" argument is selected in train();
# this is the case when ncomp_fixed argument in fit_pls() is
# evaluated
# Checking for the existence of a <pred> element in the train function output
# list can be dangerous and doesn't work in all cases when using
# e.g. is.element('pred', x) or is.null(x$pred);
# Problems can occur e.g. if a list element contains NULL element;
# see
# http://stackoverflow.com/questions/7719741/how-to-test-if-list-element-exists
} else if (evaluation_method == "resampling" && tuning_method == "resampling") {
# Good discussion on which cross-validation results are returned from caret
# Extract best tuning parameters and associated cv predictions
# http://stats.stackexchange.com/questions/219154/how-does-cross-validation-in-train-caret-precisely-work
# Alternative solution for one model: conformal::GetCVPreds(model) function
# see https://github.com/cran/conformal/blob/master/R/misc.R
predobs_cv <- plyr::ldply(list_models,
function(x) dplyr::inner_join(x$pred, x$bestTune, by = "ncomp"),
.id = "model"
)
# Extract auto-prediction
predobs <- caret::extractPrediction(list_models)
# !!! new ---
# Replace levels "Training" dataType column
# by "Calibration" (rename levels of factor)
predobs$dataType <- plyr::revalue(
predobs$dataType,
c("Training" = "Calibration")
)
# Append sample_id column to predobs data.frame
# extract sample_id from calibration set
predobs$sample_id <- x$calibration$sample_id
# Create rowIndex for calibration tibble
x$calibration$rowIndex <- 1:nrow(x$calibration)
# Generate sample_id column for rowIndex of pred list element of
# train object; select only rowIndex and sample_id of calibration tibble
vars_indexing <- c("rowIndex", "sample_id")
cal_index <- dplyr::select(
x$calibration,
!!!rlang::syms(vars_indexing)
)
# Transform cross-validation hold-out predictions --------------------------
predobs_cv <- transform_cvpredictions(
cal_index = cal_index,
predobs_cv = predobs_cv
)
predobs_cv$object <- predobs_cv$model
predobs_cv$model <- factor(predobs_cv$model)
predobs_cv$dataType <- factor("Cross-validation")
vars_keep <- c(
"obs", "pred", "pred_sd", "pred_sem_ci",
"model", "dataType", "object"
)
predobs_cv <- dplyr::select(
predobs_cv,
# !!! sample_id newly added
!!!rlang::syms(vars_keep)
)
# Add column pred_sd to the predobs data frame (assign NA) so that
# column pred_sd is retained after dplyr::bind_rows
predobs$pred_sd <- NA
# dplyr::bind_rows(predobs, predobs_cv) doesn't work here because some
# columns are coerced to numeric and the resulting data frame has only
# two rows
# pb_2018-11-09: Model evaluation graph should only show cross-validation
# results when arguments `evaluation_method` == "resampling" &&
# `tuning_method` == "resampling"
predobs <- predobs_cv
# predobs <- dplyr::bind_rows(predobs, predobs_cv)
# Calculate model performance indices by model and dataType;
# uses the plyr package and the function summary_df() of SPECmisc.R
stats <- suppressWarnings(plyr::ddply(
predobs, c("model", "dataType"),
function(x) summary_df(x, "obs", "pred")
))
}
# Add number of components to stats; from finalModel list item
# from train() function output (function from caret package)
stats$ncomp <- rep(model$finalModel$ncomp, nrow(stats))
# !!! Experimental: return stats
# return(stats)
# Add range of observed values for validation and calibration;
# get range from predicted vs. observed data frame
# stored in object predobs
obs_cal <- subset(predobs, dataType == "Calibration")$obs
# Get name of predicted variable; see p. 261 of book
# "Advanced R" (Hadley Wickham)
response_name <- deparse(response)
if (evaluation_method == "test_set") {
# Assign validation set to separate data frame
obs_val <- subset(predobs, dataType == "Validation")$obs
# before: deparse(substitute(variable))
df_range <- data.frame(
response = rep(response_name, 2),
dataType = c("Calibration", "Validation"),
min_obs = c(range(obs_cal)[1], range(obs_val)[1]),
median_obs = c(median(obs_cal), median(obs_val)),
max_obs = c(range(obs_cal)[2], range(obs_val)[2]),
mean_obs = c(mean(obs_cal), mean(obs_val)),
CV = c(
sd(obs_cal) / mean(obs_cal) * 100,
sd(obs_val) / mean(obs_val) * 100
)
)
} else if (evaluation_method == "resampling" && tuning_method == "resampling") {
# Assign cross-validation set to separate data frame
obs_val <- subset(predobs, dataType == "Cross-validation")$obs
df_range <- data.frame(
response = rep(response_name, 2),
dataType = factor(c("Calibration", "Cross-validation")),
min_obs = c(range(obs_cal)[1], range(obs_val)[1]),
median_obs = c(median(obs_cal), median(obs_val)),
max_obs = c(range(obs_cal)[2], range(obs_val)[2]),
mean_obs = c(mean(obs_cal), mean(obs_val)),
CV = c(
sd(obs_cal) / mean(obs_cal) * 100,
sd(obs_val) / mean(obs_val) * 100
)
)
}
# Join stats with range data frame (df_range)
stats <- suppressWarnings(dplyr::inner_join(stats, df_range, by = "dataType"))
annotation <- plyr::mutate(stats,
rmse = as.character(as.expression(paste0(
"RMSE == ",
round(rmse, 2)
))),
r2 = as.character(as.expression(paste0(
"italic(R)^2 == ",
round(r2, 2)
))),
rpd = as.character(as.expression(paste(
"RPD == ",
round(rpd, 2)
))),
n = as.character(as.expression(paste0("italic(n) == ", n))),
ncomp = as.character(as.expression(paste0(
"ncomp == ",
ncomp
)))
)
# Plot predicted vs. observed values and model indexes
# update label, xlim, and ylim ***
# Add label number of samples to facet_grid using a
# labeling function
# ! Update labeller API:
# https://github.com/hadley/ggplot2/commit/ef33dc7
# http://sahirbhatnagar.com/facet_wrap_labels
# Prepare lookup character vector
make_label <- function(x, evaluation_method = "test_set",
resampling_method = "kfold_cv") {
dataType <- n <- NULL
if (evaluation_method == "test_set") {
c(
`Calibration` = paste0(
"Calibration", "~(",
x[x$dataType == "Calibration", ]$n, ")"
),
`Validation` = paste0(
"Validation", "~(",
x[x$dataType == "Validation", ]$n, ")"
)
)
} else if (evaluation_method == "resampling" &&
resampling_method == "rep_kfold_cv") {
c(
`Calibration` = paste0(
"Calibration", "~(",
x[x$dataType == "Calibration", ]$n, ")"
),
`Cross-validation` = paste0(
"5%*%repeated~10*-fold~CV", "~(",
x[x$dataType == "Cross-validation", ]$n, ")"
)
)
} else {
c(
`Calibration` = paste0(
"Calibration", "~(",
x[x$dataType == "Calibration", ]$n, ")"
),
`Cross-validation` = paste0(
"10*-fold~CV", "~(",
x[x$dataType == "Cross-validation", ]$n, ")"
)
)
}
}
if (evaluation_method == "test_set") {
label_validation <- make_label(
x = annotation,
evaluation_method = "test_set"
)
} else if (evaluation_method == "resampling" &&
resampling_method == "rep_kfold_cv") {
label_validation <- make_label(
x = annotation,
evaluation_method = "resampling", resampling_method = "rep_kfold_cv"
)
} else {
label_validation <- make_label(
x = annotation,
evaluation_method = "resampling"
)
}
# Rename labels on the fly with a lookup character vector
to_string <- ggplot2::as_labeller(
x = label_validation, ggplot2::label_parsed
)
# Create model evaluation plot -----------------------------------------------
## ggplot graph for model comparison
## (arranged later in panels)
x_label <- paste0(
"Observed ",
as.character(response_name)
)
y_label <- paste0(
"Predicted ",
as.character(response_name)
)
## Create x and y minimum and maximum for plotting range; use either
## observed or predicted data, depending on what minimum and maximum values
## are
xy_min <- if (min(predobs$obs) < min(predobs$pred)) {
predobs$obs
} else {
predobs$pred
}
xy_max <- if (max(predobs$obs) > max(predobs$pred)) {
predobs$obs
} else {
predobs$pred
}
xy_range <- ifelse(diff(range(xy_min)) > diff(range(xy_max)),
diff(range(xy_min)), diff(range(xy_max))
)
if (model$method == "pls") {
p_model <- ggplot2::ggplot(data = predobs) +
ggplot2::geom_point(ggplot2::aes(x = obs, y = pred),
shape = 1, size = 2, alpha = 1 / 2, data = predobs
) +
ggplot2::geom_text(
data = annotation,
ggplot2::aes(x = Inf, y = -Inf, label = ncomp), size = 5,
hjust = 1.15, vjust = -4.5, parse = TRUE
) + # !!! additional label
ggplot2::geom_text(
data = annotation,
ggplot2::aes(x = Inf, y = -Inf, label = r2), size = 5,
hjust = 1.15, vjust = -3, parse = TRUE
) +
ggplot2::geom_text(
data = annotation,
ggplot2::aes(x = Inf, y = -Inf, label = rmse), size = 5,
hjust = 1.12, vjust = -2.5, parse = TRUE
) +
ggplot2::geom_text(
data = annotation,
ggplot2::aes(x = Inf, y = -Inf, label = rpd), size = 5,
hjust = 1.15, vjust = -1.25, parse = TRUE
) +
ggplot2::facet_grid(~dataType,
labeller = ggplot2::as_labeller(to_string)
) +
ggplot2::geom_abline(col = "red") +
ggplot2::labs(x = x_label, y = y_label) +
ggplot2::xlim(c(
min(xy_min) - 0.05 * xy_range,
max(xy_max) + 0.05 * xy_range
)) +
ggplot2::ylim(c(
min(xy_min) - 0.05 * xy_range,
max(xy_max) + 0.05 * xy_range
)) +
ggplot2::coord_fixed() +
ggplot2::theme_bw() +
ggplot2::theme(strip.background = ggplot2::element_rect(fill = "white"))
if (evaluation_method == "resampling") {
p_model <- p_model +
ggplot2::geom_errorbar(
ggplot2::aes(
x = obs, ymin = pred - pred_sem_ci,
ymax = pred + pred_sem_ci
),
width = 0, data = predobs, inherit.aes = FALSE
)
}
} else {
p_model <- ggplot2::ggplot(data = predobs) +
ggplot2::geom_point(ggplot2::aes(x = obs, y = pred),
shape = 1, size = 2, alpha = 1 / 2
) + # without ncomp label
ggplot2::geom_text(
data = annotation,
ggplot2::aes(x = Inf, y = -Inf, label = r2), size = 5,
hjust = 1.15, vjust = -3, parse = TRUE
) +
ggplot2::geom_text(
data = annotation,
ggplot2::aes(x = Inf, y = -Inf, label = rmse), size = 5,
hjust = 1.12, vjust = -2.5, parse = TRUE
) +
ggplot2::geom_text(
data = annotation,
ggplot2::aes(x = Inf, y = -Inf, label = rpd), size = 5,
hjust = 1.15, vjust = -1.25, parse = TRUE
) +
ggplot2::facet_grid(~dataType,
labeller = ggplot2::as_labeller(to_string)
) +
ggplot2::geom_abline(col = "red") +
ggplot2::labs(x = x_label, y = y_label) +
ggplot2::xlim(c(
min(xy_min) - 0.05 * xy_range,
max(xy_max) + 0.05 * xy_range
)) +
ggplot2::ylim(c(
min(xy_min) - 0.05 * xy_range,
max(xy_max) + 0.05 * xy_range
)) +
ggplot2::coord_fixed() +
ggplot2::theme_bw()
}
if (print == TRUE) {
print(p_model)
}
list(stats = stats, p_model = p_model, predobs = predobs)
}
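## Note: summary_df() is a summary helper defined elsewhere in the package
## (SPECmisc.R). A minimal sketch of the evaluation statistics it is
## expected to return (n, RMSE, R^2, and RPD = sd(obs) / RMSE), under those
## assumptions:
summary_df_sketch <- function(df, obs = "obs", pred = "pred") {
  o <- df[[obs]]
  p <- df[[pred]]
  # Root mean squared error of prediction
  rmse <- sqrt(mean((o - p)^2))
  data.frame(
    n = length(o),
    rmse = rmse,
    r2 = cor(o, p)^2,
    rpd = sd(o) / rmse
  )
}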
## PLS regression modeling in one function ======================
#' @name fit_pls
#' @title Calibration sampling, model tuning, and PLS regression
#' @description Perform calibration sampling and use selected
#' calibration set for model tuning
#' @param spec_chem Tibble that contains spectra, metadata, and chemical
#' reference data as list-columns. The tibble supplied to \code{spec_chem} can
#' be generated by the \code{join_chem_spc()} function.
#' @param response Response variable as symbol or name
#' (without quotes, no character string). The provided response symbol needs to be
#' a column name in the \code{spec_chem} tibble.
#' @param variable Deprecated and replaced by \code{response}.
#' @param center Logical whether to perform mean centering of each spectrum column
#' (e.g. wavenumber or wavelength) after common spectrum preprocessing. Default is
#' \code{center = TRUE}
#' @param scale Logical whether to perform standard deviation scaling
#' of each spectrum column (e.g. wavenumber or wavelength) after common
#' spectrum preprocessing. Default is \code{scale = TRUE}
#' @param evaluation_method Character string stating evaluation method.
#' Either \code{"test_set"} (default) or \code{"resampling"}. \code{"test_set"}
#' will split the data into a calibration (training) and validation (test) set,
#' and evaluate the final model by predicting on the validation set.
#' If \code{"resampling"}, the finally selected model will be evaluated based
#' on the cross-validation hold-out predictions.
#' @param validation Deprecated and replaced by \code{evaluation_method}.
#' Default is \code{TRUE}.
#' @param split_method Method for splitting the data into an independent test
#' set. Default is \code{"ken_stone"}, which selects calibration samples
#' based on the Kennard-Stone sampling algorithm applied to the preprocessed
#' spectra. The proportion of validation samples to the total number of
#' samples can be specified in the argument \code{ratio_val}.
#' \code{split_method = "random"} will create a single random split.
#' @param ratio_val Ratio of validation (test) samples to
#' total number of samples (calibration (training) and validation (test)).
#' @param ken_sto_pc Number of principal components used for calculating the
#' Mahalanobis distance on PCA scores within the Kennard-Stone algorithm.
#' Default is \code{ken_sto_pc = 2}, which will use the first two PCA
#' components.
#' @param pc Deprecated; renamed to \code{ken_sto_pc}.
#' @param invert Logical; if \code{TRUE}, the samples selected by the
#' Kennard-Stone algorithm form the validation set instead of the
#' calibration set.
#' @param tuning_method Character specifying tuning method. Tuning method
#' affects how caret selects a final tuning value set from a list of candidate
#' values. Possible values are \code{"resampling"}, which will use a
#' specified resampling method such as repeated k-fold cross-validation (see
#' argument \code{resampling_method}) and the generated performance profile
#' based on the hold-out predictions to decide on the final tuning values
#' that lead to optimal model performance. The value \code{"none"} will force
#' caret to compute a final model for a predefined candidate PLS tuning
#' parameter, the number of PLS components. In this case, the value
#' supplied by \code{ncomp_fixed} is used to set model complexity at
#' a fixed number of components.
#' @param resampling_method Character specifying resampling method. Currently,
#' \code{"kfold_cv"} (default, performs 10-fold cross-validation),
#' \code{"rep_kfold_cv"} (performs 5-times repeated 10-fold cross-validation),
#' \code{"loocv"} (performs leave-one-out cross-validation), and \code{"none"}
#' (if \code{resampling_method = "none"}) are supported.
#' @param resampling_seed Random seed (integer) that will be used for generating
#' resampling indices, which will be supplied to \code{caret::trainControl}.
#' This makes sure that modeling results are constant when re-fitting.
#' Default is \code{resampling_seed = 123}.
#' @param cv Deprecated. Use \code{resampling_method} instead.
#' @param pls_ncomp_max Maximum number of PLS components that are evaluated
#' by \code{caret::train}. caret will aggregate a performance profile using
#' resampling for an integer sequence from 1 to \code{pls_ncomp_max}.
#' @param ncomp_fixed Integer of fixed number of PLS components. Will only be
#' used when \code{tuning_method = "none"} and \code{resampling_method = "none"}
#' are used.
#' @param print Logical; whether model evaluation graphs should be
#' printed.
#' @param env Environment in which the function is evaluated. Default is
#' \code{parent.frame()}.
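#' @examples
#' \dontrun{
#' # Hypothetical workflow; `spec_chem` joins preprocessed spectra and
#' # chemical reference data, and `total_carbon` stands in for a reference
#' # column in that tibble
#' out <- fit_pls(
#'   spec_chem = spec_chem,
#'   response = total_carbon,
#'   evaluation_method = "test_set",
#'   split_method = "ken_stone",
#'   ratio_val = 1 / 3,
#'   resampling_method = "rep_kfold_cv"
#' )
#' out$stats # calibration and validation summary statistics
#' out$p_model # predicted vs. observed plot
#' }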
#' @export
# Note: check non standard evaluation, argument passing...
fit_pls <- function(
spec_chem,
response, variable = NULL, # variable deprecated, will not work
center = TRUE, scale = TRUE, # center and scale all predictors (wavenumbers)
evaluation_method = "test_set", validation = TRUE, # validation depreciated
split_method = "ken_stone",
ratio_val = 1 / 3, # is only used if evaluation_method = "test_set"
ken_sto_pc = 2, pc, # only if split_method = "ken_stone"; number of components
# used for calculating Mahalanobis distance on PCA scores. pc is deprecated.
invert = TRUE, # only if split_method = "ken_stone"
tuning_method = "resampling",
resampling_method = "kfold_cv", cv = NULL, # cv depreciated
resampling_seed = 123, # Seed for creating resampling indices
pls_ncomp_max = 20, # Maximal number of PLS components used by model tuning
ncomp_fixed = 5, # only fit and evaluate one model, if tuning_method = "none"
print = TRUE, # print model summary and evaluation graphs
env = parent.frame()) {
calibration <- 0
# Warning messages and reassignment for deprecated arguments -----------------
# Deprecate argument variable; use the more specific term response
# for the quantity to be predicted by spectral modeling
if (!is.null(variable)) {
stop("argument variable has been replaced by response since simplerspec 0.1.0")
}
# 20170602: revise argument name and values of validation;
# Replace validation = TRUE or FALSE with
# new argument evaluation_method = "test_set" or "resampling"
if (!missing(validation)) {
warning("argument validation is deprecated; please use evaluation_method instead.",
call. = FALSE
)
# Map the old logical argument onto the new character values
evaluation_method <- if (isTRUE(validation)) "test_set" else "resampling"
}
# Deprecate argument pc; use the more consistent and verbose argument
# ken_sto_pc
if (!missing(pc)) {
warning("argument pc is deprecated; please use ken_sto_pc instead.",
call. = FALSE
)
ken_sto_pc <- pc
}
# Deprecate argument cv; use the more consistent and verbose argument
# resampling_method
if (!missing(cv)) {
warning("argument cv is deprecated; please use resampling_method instead.",
call. = FALSE
)
resampling_method <- cv
}
# Change values for resampling_method argument
if (resampling_method == "LOOCV") {
warning("value 'LOOCV' (leave one out cross-validation) for argument resampling_method is depreciated; please use value 'loocv' instead.")
resampling_method <- "loocv"
}
if (resampling_method == "repeatedcv") {
warning("value 'repeatedcv' (repeated k-fold cross-validation) for argument resampling_method is depreciated; please use value 'rep_kfold_cv' instead.")
resampling_method <- "rep_kfold_cv"
}
# Perform calibration sampling -----------------------------------------------
list_sampled <- split_data_q(
spec_chem, split_method,
ratio_val = ratio_val,
ken_sto_pc = substitute(ken_sto_pc),
evaluation_method = substitute(evaluation_method),
invert = substitute(invert)
)
# Check on method for cross-validation to be used in caret model tuning ------
if (resampling_method == "loocv") {
# leave-one-out cross-validation
tr_control <- control_train_loocv_q(
x = list_sampled,
response = substitute(response), env = env
)
} else if (resampling_method == "rep_kfold_cv") {
# repeated k-fold cross-validation
tr_control <- control_train_rcv_q(
x = list_sampled,
response = substitute(response),
resampling_seed = substitute(resampling_seed), env = env
)
} else if (resampling_method == "none") {
# no resampling; calls caret::train(..., method = "none");
# fixed number of PLS components; tuning_method argument has also
# to be set to "none"
tr_control <- control_train_none_q(
x = list_sampled,
response = substitute(response),
resampling_seed = substitute(resampling_seed), env = env
)
} else if (resampling_method == "kfold_cv") {
# k-fold cross validation
tr_control <- control_train_q(
x = list_sampled,
response = substitute(response),
resampling_seed = substitute(resampling_seed), env = env
)
}
# Fit a pls calibration model; pls object is output from caret::train() ------
if (tuning_method == "resampling") {
pls <- train_pls_q(
x = list_sampled,
evaluation_method = "test_set",
response = substitute(response), tr_control = tr_control,
center = center, scale = scale,
pls_ncomp_max = substitute(pls_ncomp_max), env
)
} else if (tuning_method == "none") {
pls <- train_pls_q(
x = list_sampled,
evaluation_method = "test_set",
response = substitute(response), tr_control = tr_control,
center = center, scale = scale, tuning_method = "none",
ncomp_fixed = substitute(ncomp_fixed), env
)
}
# Evaluate model accuracy (predicted vs. observed) ---------------------------
stats <- evaluate_model_q(
x = list_sampled, model = pls,
response = substitute(response),
evaluation_method = substitute(evaluation_method),
tuning_method = substitute(tuning_method),
resampling_method = substitute(resampling_method),
env = parent.frame()
)
list(
data = list_sampled, p_pc = list_sampled$p_pc,
model = pls, stats = stats$stats, p_model = stats$p_model,
predobs = stats$predobs
)
}
## Old function name of `fit_pls`: `pls_ken_stone`
#' @rdname fit_pls
#' @export
pls_ken_stone <- fit_pls
## Random forest modeling in one function =======================
#' @title Calibration sampling, random forest model tuning, and evaluation
#' @description Perform calibration sampling and use selected
#' calibration set for model tuning
#' @param spec_chem Tibble that contains spectra, metadata, and chemical
#' reference data as list-columns. The tibble supplied to \code{spec_chem} can
#' be generated by the \code{join_chem_spc()} function.
#' @param response Response variable as symbol or name
#' (without quotes, no character string). The provided response symbol needs to be
#' a column name in the \code{spec_chem} tibble.
#' @param variable Deprecated and replaced by \code{response}.
#' @param evaluation_method Character string stating evaluation method.
#' Either \code{"test_set"} (default) or \code{"resampling"}. \code{"test_set"}
#' will split the data into a calibration (training) and validation (test) set,
#' and evaluate the final model by predicting on the validation set.
#' If \code{"resampling"}, the finally selected model will be evaluated based
#' on the cross-validation hold-out predictions.
#' @param validation Deprecated and replaced by \code{evaluation_method}.
#' Default is \code{TRUE}.
#' @param split_method Method for splitting the data into an independent test
#' set. Default is \code{"ken_stone"}, which selects calibration samples
#' based on the Kennard-Stone sampling algorithm applied to the preprocessed
#' spectra. The proportion of validation samples to the total number of
#' samples can be specified in the argument \code{ratio_val}.
#' \code{split_method = "random"} will create a single random split.
#' @param ratio_val Ratio of validation (test) samples to
#' total number of samples (calibration (training) and validation (test)).
#' @param ken_sto_pc Number of principal components used for calculating the
#' Mahalanobis distance on PCA scores within the Kennard-Stone algorithm.
#' Default is \code{ken_sto_pc = 2}, which will use the first two PCA
#' components.
#' @param pc Deprecated; renamed to \code{ken_sto_pc}.
#' @param invert Logical; if \code{TRUE}, the samples selected by the
#' Kennard-Stone algorithm form the validation set instead of the
#' calibration set.