-
Notifications
You must be signed in to change notification settings - Fork 1
/
simulation.R
272 lines (257 loc) · 9.2 KB
/
simulation.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
#' Simulate Likert Scale Item Responses
#'
#' Simulates Likert scale item responses based on a specified number
#' of response categories and the centered parameters of the latent variable.
#'
#' @param n_levels number of response categories for the Likert scale item.
#' @param cp centered parameters of the latent variable.
#' Named vector including mean (`mu`), standard deviation (`sd`),
#' and skewness (`skew`). Skewness must be between -0.95 and 0.95.
#' @return A named vector of probabilities for each response category.
#' @examples
#' cp <- c(mu = 0, sd = 1, skew = 0.5)
#' simulate_likert(n_levels = 5, cp = cp)
#' cp2 <- c(mu = 1, sd = 2, skew = -0.3)
#' simulate_likert(n_levels = 7, cp = cp2)
#' @details
#' The simulation process uses the following model detailed by
#' Boari and Nai-Ruscone. Let \eqn{X} be the continuous variable of interest,
#' measured using Likert scale questions with \eqn{K} response categories. The
#' observed discrete variable \eqn{Y} is defined as follows:
#' \deqn{
#' Y = k, \quad \text{ if } \ \ x_{k - 1} < X \leq x_{k}
#' \quad \text{ for } \ \ k = 1, \dots, K
#' }
#' where \eqn{x_{k}}, \eqn{k = 0, \dots, K} are endpoints defined in the domain
#' of \eqn{X} such that:
#' \deqn{
#' -\infty = x_{0} < x_{1} < \dots < x_{K - 1} < x_{K} = \infty.
#' }
#' The endpoints dictate the transformation of the density
#' \eqn{f_{X}} of \eqn{X} into a discrete probability distribution:
#' \deqn{
#' \text{Pr}(Y = k) = \int_{x_{k - 1}}^{x_{k}} f_{X}(x) \, dx
#' \quad \text{ for } \ \ k = 1, \dots, K.
#' }
#'
#' The continuous latent variable is modeled using a skew normal distribution.
#' The function \code{simulate_likert} performs the following steps:
#' * Ensures the centered parameters are within the acceptable range.
#' * Converts the centered parameters to direct parameters.
#' * Defines the density function for the skew normal distribution.
#' * Computes the probabilities for each response category
#' using optimal endpoints.
#'
#' @references
#' Boari, G. and Nai Ruscone, M. (2015).
#' A procedure simulating Likert scale item responses.
#' \emph{Electronic Journal of Applied Statistical Analysis} \bold{8(3)},
#' 288–297.
#' \doi{10.1285/i20705948v8n3p288}
#'
#' @seealso \code{\link{discretize_density}} for details on how to calculate
#' the optimal endpoints.
#' @export
simulate_likert <- function(n_levels, cp) {
validate_skewness(cp[["skew"]])
dp <- convert_params(cp)
density_fn <- function(x) {
density_sn(x, dp[["xi"]], dp[["omega"]], dp[["alpha"]])
}
endp <- calc_endpoints(n_levels, cp[["skew"]])
prob <- calc_probs(density_fn, endp)
names(prob) <- paste(seq_len(n_levels))
return(prob)
}
#' Generate Random Responses
#'
#' Generates an array of random responses to Likert-type questions based on
#' specified latent variables.
#'
#' @param size number of observations.
#' @param n_items number of Likert scale items (number of questions).
#' @param n_levels number of response categories for each item.
#' Integer or vector of integers.
#' @param mean means of the latent variables.
#' Numeric or vector of numerics. Defaults to 0.
#' @param sd standard deviations of the latent variables.
#' Numeric or vector of numerics. Defaults to 1.
#' @param skew marginal skewness of the latent variables.
#' Numeric or vector of numerics. Defaults to 0.
#' @param corr correlations between latent variables.
#' Can be a single numeric value representing the same correlation for
#' all pairs, or an actual correlation matrix. Defaults to 0.
#'
#' @return A matrix of random responses with dimensions \code{size} by
#' \code{n_items}. The column names are \code{Y1, Y2, ..., Yn} where
#' \code{n} is the number of items. Each entry in the matrix represents
#' a Likert scale response, ranging from 1 to \code{n_levels}.
#' @examples
#' # Generate responses for a single item with 5 levels
#' rlikert(size = 10, n_items = 1, n_levels = 5)
#'
#' # Generate responses for three items with different levels and parameters
#' rlikert(
#' size = 10, n_items = 3, n_levels = c(4, 5, 6),
#' mean = c(0, -1, 0), sd = c(0.8, 1, 1), corr = 0.5
#' )
#'
#' # Generate responses with a correlation matrix
#' corr <- matrix(c(
#' 1.00, -0.63, -0.39,
#' -0.63, 1.00, 0.41,
#' -0.39, 0.41, 1.00
#' ), nrow = 3)
#' data <- rlikert(
#' size = 1000, n_items = 3, n_levels = c(4, 5, 6),
#' mean = c(0, -1, 0), sd = c(0.8, 1, 1), corr = corr
#' )
#'
#' @export
rlikert <- function(size, n_items, n_levels,
mean = 0, sd = 1, skew = 0, corr = 0) {
# If there's only one item, generate responses directly
if (n_items == 1) {
return(generate_responses(size, n_levels, mean, sd, skew))
}
# Ensure input values are vectors of length n_items
mean <- rep(mean, length.out = n_items)
sd <- rep(sd, length.out = n_items)
skew <- rep(skew, length.out = n_items)
n_levels <- rep(n_levels, length.out = n_items)
# Determine the correlation case
corr_case <- handle_corr_case(corr)
if (corr_case == 1) {
# Generate multiple univariate item responses without correlations
data <- matrix(nrow = size, ncol = n_items)
for (i in seq_len(n_items)) {
data[, i] <- generate_responses(
size, n_levels[i],
mean[i], sd[i], skew[i]
)
}
colnames(data) <- paste0("Y", seq_len(n_items))
return(data)
}
# Generate the correlation matrix
corr_matrix <- generate_corr_matrix(corr, corr_case, n_items)
sigma <- cor2cov(corr_matrix, sd)
# Generate latent variables
latent_variables <- generate_latent_variables(size, mean, sigma, skew)
# Discretize latent variables to generate responses
data <- matrix(nrow = size, ncol = n_items)
for (i in seq_len(n_items)) {
endp <- calc_endpoints(n_levels[i], skew[i])
data[, i] <- findInterval(latent_variables[, i], endp)
}
colnames(data) <- paste0("Y", seq_len(n_items))
return(data)
}
#' Calculate Optimal Endpoints
#'
#' Returns the optimal endpoints of intervals that transform
#' a neutral density into discrete probability distribution.
#'
#' @param n_levels integer. Number of response categories.
#' @param skew numeric. Skewness parameter of the latent variable.
#'
#' @return A numeric vector of optimal endpoints.
#' @noRd
calc_endpoints <- function(n_levels, skew) {
dp <- convert_params(c("mu" = 0, "sd" = 1, "skew" = skew))
density_fn <- function(x) {
density_sn(x, dp[["xi"]], dp[["omega"]], dp[["alpha"]])
}
endp <- discretize_density(density_fn, n_levels)[["endp"]]
return(endp)
}
#' Generate Responses
#'
#' Generates random responses for a single Likert scale item.
#'
#' @param size integer. Number of observations.
#' @param n_levels integer. Number of response categories.
#' @param mean numeric. Mean of the latent variable.
#' @param sd numeric. Standard deviation of the latent variable.
#' @param skew numeric. Skewness parameter of the latent variable.
#'
#' @return A numeric vector of simulated responses.
#' @noRd
generate_responses <- function(size, n_levels, mean, sd, skew) {
cp <- c("mu" = mean, "sd" = sd, "skew" = skew)
prob <- simulate_likert(n_levels, cp)
sample(seq_len(n_levels), size, replace = TRUE, prob = prob)
}
#' Handle Correlation Input Case
#'
#' Determines the type of correlation input provided.
#'
#' @param corr numeric or character or matrix. Correlation input.
#'
#' @return An integer representing the correlation case.
#' @noRd
handle_corr_case <- function(corr) {
if (is.numeric(corr) && !is.matrix(corr)) {
if (corr == 0) {
return(1)
}
if (corr != 0) {
return(3)
}
}
if (is.character(corr) && corr == "random") {
return(2)
}
if (is.matrix(corr)) {
return(4)
}
stop("Invalid correlation input")
}
#' Generate Correlation Matrix
#'
#' Generates the correlation matrix based on the correlation case.
#'
#' @param corr numeric or matrix. Correlation input.
#' @param corr_case integer. Correlation case identifier.
#' @param n_items integer. Number of Likert scale items.
#'
#' @return A numeric matrix representing the correlation matrix.
#' @noRd
generate_corr_matrix <- function(corr, corr_case, n_items) {
if (corr_case == 2) {
return(generate_rand_corr_matrix(n_items))
}
if (corr_case == 3) {
if (corr > 1 | corr < -1) stop("Correlation must be between -1 and 1.")
corr_matrix <- matrix(corr, n_items, n_items)
diag(corr_matrix) <- 1
return(corr_matrix)
}
if (corr_case == 4) {
return(corr)
}
}
#' Generate Latent Variables
#'
#' Generates latent variables based on the specified parameters.
#'
#' @param size number of observations.
#' @param mean means of the latent variables. Numeric vector.
#' @param sigma covariance matrix of the latent variables.
#' @param skew skewness parameters of the latent variables. Numeric vector.
#'
#' @return A matrix of generated latent variables.
#' @noRd
generate_latent_variables <- function(size, mean, sigma, skew) {
if (any(skew != 0)) { # at least one latent variable is skewed
if (!requireNamespace("sn", quietly = TRUE)) {
stop("Package \"sn\" must be installed.
Please run:\n\n\tinstall.packages(\"sn\")\n\n")
}
cp_list <- list(mean = mean, var.cov = sigma, skew = skew)
dp_list <- sn::cp2dp(cp_list, family = "SN")
return(sn::rmsn(size, dp = dp_list))
} else {
return(mvtnorm::rmvnorm(size, mean, sigma))
}
}