/
sparseRBIC_sampsplit.R
110 lines (88 loc) · 3.07 KB
/
sparseRBIC_sampsplit.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#' Sample split procedure for stepwise regression
#'
#' Runs multiple on models selection procedures using RBIC to
#' achieve valid inferential results post-selection
#'
#' @param srbic_fit An object fitted by sparseRBIC_step
#' @param S Number of splitting iterations
#' @param quiet Should the display of a progress bar be silenced?
#'
#' @importFrom dplyr full_join
#' @return a list containing:
#'
#' \item{results}{a tibble containing coefficients, p-values, selection pct}
#' \item{splits}{a tibble of different split-based coefficients}
#'
#' @export
sparseRBIC_sampsplit <- function(srbic_fit, S = 100, quiet = FALSE) {
if(!quiet)
message("Note: sparseRBIC_sampsplit is currently experimental and may not behave as expected.")
n <- nrow(srbic_fit$data)
phi <- list()[1:S]
if(!quiet)
pb<-dplyr::progress_estimated(S)
for(s in 1:S) {
idx <- sample(1:n, ceiling(n/2))
bdata <- bake(srbic_fit$srprep, srbic_fit$data, everything(), -all_outcomes())
bX <- bdata[idx,]
by <- srbic_fit$srprep$outcome[idx]
## Run stepwise selection with same options as original
bsrbic_fit <- sparseRBIC_step(
pre_process = FALSE, model_matrix = bX, y = by,
direction = "forward",
family = srbic_fit$info$family,
cumulative_k = srbic_fit$info$cumulative_k,
cumulative_poly = srbic_fit$info$cumulative_poly,
pool = srbic_fit$info$pool,
poly_prefix = srbic_fit$info$poly_prefix,
int_sep = srbic_fit$info$int_sep,
message = FALSE
)
## Fit selected model on new data
b <- coef(bsrbic_fit$fit)
active <- gsub("`", "", names(b)[-1])
bX2 <- bdata[-idx,]
by2 <- srbic_fit$srprep$outcome[-idx]
ff <- as.formula(paste0("y~", paste0(active, collapse = " + ")))
new_fit <- glm(ff, data = data.frame(bX2, y = by2))
summ <- summary(new_fit)$coef
phi[[s]] <- tibble(names(coef(bsrbic_fit)), summ[,4])
names(phi[[s]]) <- c("Coefficient", paste0("s", s))
if(!quiet)
pb$tick()$print()
}
phi <- Reduce("my_join", phi)
phi_hat <- as.matrix(phi[,-1]) %>%
apply(1, median)
# phi_ci <- as.matrix(phi[,-1]) %>%
# apply(1, quantile, probs= c(0.025, .975)) %>%
# t() %>%
# as.data.frame()
bootstraps <- t(phi[,-1])
colnames(bootstraps) <- gsub("`","",phi[[1]])
bootstraps <- as_tibble(bootstraps, .name_repair = "minimal")
list(
results = tibble(
Coefficient = gsub("`","",phi[[1]]),
Median_p = phi_hat,
Mean_p = as.matrix(phi[,-1]) %>%
apply(1, mean),
Gmean_p = as.matrix(phi[,-1]) %>%
apply(1, function(x) exp(mean(log(x)))),
# Bootstrap_SE = get_bse(Ni, phi, B, phi_hat),
# p = 2 * pnorm(-abs(Bootstrap_Mean / Bootstrap_SE)),
"% selected" = 100*apply(phi[,-1], 1, function(x) mean(x != 1))
),
splits = bootstraps
)
}
my_join <- function(x, y) {
xy <- dplyr::full_join(x,y, by = "Coefficient")
xy[is.na(xy)] <- 1
xy
}
get_bse <- function(Ni, phi, B, phi_hat) {
## Calculate Efron smoothed SD
sd1 <- sqrt(((t(Ni %*% t(as.matrix(phi[,-1]) - phi_hat)) / B) ^ 2) %*% rep(1, nrow(Ni)) )
as.vector(sd1)
}