/
multipleGroup.R
457 lines (457 loc) · 19.2 KB
/
multipleGroup.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
#' Multiple Group Estimation
#'
#' \code{multipleGroup} performs a full-information
#' maximum-likelihood multiple group analysis for any combination of dichotomous and polytomous
#' data under the item response theory paradigm using either Cai's (2010)
#' Metropolis-Hastings Robbins-Monro (MHRM) algorithm or with an EM algorithm approach. This
#' function may be used for detecting differential item functioning (DIF), thought the
#' \code{\link{DIF}} function may provide a more convenient approach. If the grouping
#' variable is not specified then the \code{dentype} input can be modified to fit
#' mixture models to estimate any latent group components.
#'
#' By default the estimation in \code{multipleGroup} assumes that the models are maximally
#' independent, and therefore could initially be performed by sub-setting the data and running
#' identical models with \code{mirt} and aggregating the results (e.g., log-likelihood).
#' However, constrains may be automatically imposed across groups by invoking various
#' \code{invariance} keywords. Users may also supply a list of parameter equality constraints
#' to by \code{constrain} argument, of define equality constraints using the
#' \code{\link{mirt.model}} syntax (recommended).
#'
#' @return function returns an object of class \code{MultipleGroupClass}
#' (\link{MultipleGroupClass-class}).
#'
#' @aliases multipleGroup
#' @param data a \code{matrix} or \code{data.frame} that consists of
#' numerically ordered data, with missing data coded as \code{NA}
#' @param model string to be passed to, or a model object returned from, \code{\link{mirt.model}}
#' declaring how the global model is to be estimated (useful to apply constraints here)
#' @param group a \code{character} or \code{factor} vector indicating group membership. If a \code{character}
#' vector is supplied this will be automatically transformed into a \code{\link{factor}} variable.
#' As well, the first level of the (factorized) grouping variable will be treated as the "reference" group
#' @param itemtype can be same type of input as is documented in \code{\link{mirt}}, however may also be a
#' \code{ngroups} by \code{nitems} matrix specifying the type of IRT models for each group, respectively.
#' Rows of this input correspond to the levels of the \code{group} input. For mixture models the rows correspond
#' to the respective mixture grouping variables to be constructed, and the IRT models should be within these
#' mixtures
#' @param invariance a character vector containing the following possible options:
#' \describe{
#' \item{\code{'free_mean'} or \code{'free_means'}}{freely estimate all latent means in all focal groups
#' (reference group constrained to a vector of 0's)}
#' \item{\code{'free_var'}, \code{'free_vars'}, \code{'free_variance'}, or \code{'free_variances'}}{
#' freely estimate all latent variances in focal groups
#' (reference group variances all constrained to 1)}
#' \item{\code{'slopes'}}{to constrain all the slopes to be equal across all groups}
#' \item{\code{'intercepts'}}{to constrain all the intercepts to be equal across all
#' groups, note for nominal models this also includes the category specific slope parameters}
#' }
#'
#' Additionally, specifying specific item name bundles (from \code{colnames(data)}) will
#' constrain all freely estimated parameters in each item to be equal across groups. This is
#' useful for selecting 'anchor' items for vertical and horizontal scaling, and for detecting
#' differential item functioning (DIF) across groups
#' @param method a character object that is either \code{'EM'}, \code{'QMCEM'}, or \code{'MHRM'}
#' (default is \code{'EM'}). See \code{\link{mirt}} for details
#' @param dentype type of density form to use for the latent trait parameters. Current options include
#' all of the methods described in \code{\link{mirt}}, as well as
#'
#' \itemize{
#' \item \code{'mixture-#'} estimates mixtures of Gaussian distributions,
#' where the \code{#} placeholder represents the number of potential grouping variables
#' (e.g., \code{'mixture-3'} will estimate 3 underlying classes). Each class is
#' assigned the group name \code{MIXTURE_#}, where \code{#} is the class number.
#' Note that internally the mixture coefficients are stored as log values where
#' the first mixture group coefficient is fixed at 0
#' }
#' @param ... additional arguments to be passed to the estimation engine. See \code{\link{mirt}}
#' for details and examples
#'
#' @author Phil Chalmers \email{rphilip.chalmers@@gmail.com}
#' @references
#' Chalmers, R., P. (2012). mirt: A Multidimensional Item Response Theory
#' Package for the R Environment. \emph{Journal of Statistical Software, 48}(6), 1-29.
#' \doi{10.18637/jss.v048.i06}
#' @seealso \code{\link{mirt}}, \code{\link{DIF}}, \code{\link{extract.group}}, \code{\link{DRF}}
#' @keywords models
#' @export multipleGroup
#' @examples
#' \dontrun{
#'
#' # single factor
#' set.seed(12345)
#' a <- matrix(abs(rnorm(15,1,.3)), ncol=1)
#' d <- matrix(rnorm(15,0,.7),ncol=1)
#' itemtype <- rep('2PL', nrow(a))
#' N <- 1000
#' dataset1 <- simdata(a, d, N, itemtype)
#' dataset2 <- simdata(a, d, N, itemtype, mu = .1, sigma = matrix(1.5))
#' dat <- rbind(dataset1, dataset2)
#' group <- c(rep('D1', N), rep('D2', N))
#'
#' # marginal information
#' itemstats(dat)
#'
#' # conditional information
#' itemstats(dat, group=group)
#'
#' mod_configural <- multipleGroup(dat, 1, group = group) #completely separate analyses
#' # limited information fit statistics
#' M2(mod_configural)
#'
#' mod_metric <- multipleGroup(dat, 1, group = group, invariance=c('slopes')) #equal slopes
#' # equal intercepts, free variance and means
#' mod_scalar2 <- multipleGroup(dat, 1, group = group,
#' invariance=c('slopes', 'intercepts', 'free_var','free_means'))
#' mod_scalar1 <- multipleGroup(dat, 1, group = group, #fixed means
#' invariance=c('slopes', 'intercepts', 'free_var'))
#' mod_fullconstrain <- multipleGroup(dat, 1, group = group,
#' invariance=c('slopes', 'intercepts'))
#' extract.mirt(mod_fullconstrain, 'time') #time of estimation components
#'
#' # optionally use Newton-Raphson for (generally) faster convergence in the
#' # M-step's, though occasionally less stable
#' mod_fullconstrain <- multipleGroup(dat, 1, group = group, optimizer = 'NR',
#' invariance=c('slopes', 'intercepts'))
#' extract.mirt(mod_fullconstrain, 'time') #time of estimation components
#'
#' summary(mod_scalar2)
#' coef(mod_scalar2, simplify=TRUE)
#' residuals(mod_scalar2)
#' plot(mod_configural)
#' plot(mod_configural, type = 'info')
#' plot(mod_configural, type = 'trace')
#' plot(mod_configural, type = 'trace', which.items = 1:4)
#' itemplot(mod_configural, 2)
#' itemplot(mod_configural, 2, type = 'RE')
#'
#' anova(mod_metric, mod_configural) #equal slopes only
#' anova(mod_scalar2, mod_metric) #equal intercepts, free variance and mean
#' anova(mod_scalar1, mod_scalar2) #fix mean
#' anova(mod_fullconstrain, mod_scalar1) #fix variance
#'
#' # compared all at once (in order of most constrained to least)
#' anova(mod_fullconstrain, mod_scalar2, mod_configural)
#'
#'
#' # test whether first 6 slopes should be equal across groups
#' values <- multipleGroup(dat, 1, group = group, pars = 'values')
#' values
#' constrain <- list(c(1, 63), c(5,67), c(9,71), c(13,75), c(17,79), c(21,83))
#' equalslopes <- multipleGroup(dat, 1, group = group, constrain = constrain)
#' anova(equalslopes, mod_configural)
#'
#' # same as above, but using mirt.model syntax
#' newmodel <- '
#' F = 1-15
#' CONSTRAINB = (1-6, a1)'
#' equalslopes <- multipleGroup(dat, newmodel, group = group)
#' coef(equalslopes, simplify=TRUE)
#'
#' ############
#' # vertical scaling (i.e., equating when groups answer items others do not)
#' dat2 <- dat
#' dat2[group == 'D1', 1:2] <- dat2[group != 'D1', 14:15] <- NA
#' head(dat2)
#' tail(dat2)
#'
#' # items with missing responses need to be constrained across groups for identification
#' nms <- colnames(dat2)
#' mod <- multipleGroup(dat2, 1, group, invariance = nms[c(1:2, 14:15)])
#'
#' # this will throw an error without proper constraints (SEs cannot be computed either)
#' # mod <- multipleGroup(dat2, 1, group)
#'
#' # model still does not have anchors, therefore need to add a few (here use items 3-5)
#' mod_anchor <- multipleGroup(dat2, 1, group,
#' invariance = c(nms[c(1:5, 14:15)], 'free_means', 'free_var'))
#' coef(mod_anchor, simplify=TRUE)
#'
#' # check if identified by computing information matrix
#' mod_anchor <- multipleGroup(dat2, 1, group, pars = mod2values(mod_anchor), TOL=NaN, SE=TRUE,
#' invariance = c(nms[c(1:5, 14:15)], 'free_means', 'free_var'))
#' mod_anchor
#' coef(mod_anchor)
#' coef(mod_anchor, printSE=TRUE)
#'
#'
#' #############
#' # DIF test for each item (using all other items as anchors)
#' itemnames <- colnames(dat)
#' refmodel <- multipleGroup(dat, 1, group = group, SE=TRUE,
#' invariance=c('free_means', 'free_var', itemnames))
#'
#' # loop over items (in practice, run in parallel to increase speed). May be better to use ?DIF
#' estmodels <- vector('list', ncol(dat))
#' for(i in 1:ncol(dat))
#' estmodels[[i]] <- multipleGroup(dat, 1, group = group, verbose = FALSE,
#' invariance=c('free_means', 'free_var', itemnames[-i]))
#' anova(refmodel, estmodels[[1]])
#' (anovas <- lapply(estmodels, function(x, refmodel) anova(refmodel, x),
#' refmodel=refmodel))
#'
#' # family-wise error control
#' p <- do.call(rbind, lapply(anovas, function(x) x[2, 'p']))
#' p.adjust(p, method = 'BH')
#'
#' # same as above, except only test if slopes vary (1 df)
#' # constrain all intercepts
#' estmodels <- vector('list', ncol(dat))
#' for(i in 1:ncol(dat))
#' estmodels[[i]] <- multipleGroup(dat, 1, group = group, verbose = FALSE,
#' invariance=c('free_means', 'free_var', 'intercepts',
#' itemnames[-i]))
#'
#' (anovas <- lapply(estmodels, function(x, refmodel) anova(refmodel, x),
#' refmodel=refmodel))
#'
#' # quickly test with Wald test using DIF()
#' mod_configural2 <- multipleGroup(dat, 1, group = group, SE=TRUE)
#' DIF(mod_configural2, which.par = c('a1', 'd'), Wald=TRUE, p.adjust = 'fdr')
#'
#'
#'
#' #############
#' # Three group model where the latent variable parameters are constrained to
#' # be equal in the focal groups
#'
#' set.seed(12345)
#' a <- matrix(abs(rnorm(15,1,.3)), ncol=1)
#' d <- matrix(rnorm(15,0,.7),ncol=1)
#' itemtype <- rep('2PL', nrow(a))
#' N <- 1000
#' dataset1 <- simdata(a, d, N, itemtype)
#' dataset2 <- simdata(a, d, N, itemtype, mu = .1, sigma = matrix(1.5))
#' dataset3 <- simdata(a, d, N, itemtype, mu = .1, sigma = matrix(1.5))
#' dat <- rbind(dataset1, dataset2, dataset3)
#' group <- rep(c('D1', 'D2', 'D3'), each=N)
#'
#' # marginal information
#' itemstats(dat)
#'
#' # conditional information
#' itemstats(dat, group=group)
#'
#' model <- 'F1 = 1-15
#' FREE[D2, D3] = (GROUP, MEAN_1), (GROUP, COV_11)
#' CONSTRAINB[D2,D3] = (GROUP, MEAN_1), (GROUP, COV_11)'
#'
#' mod <- multipleGroup(dat, model, group = group, invariance = colnames(dat))
#' coef(mod, simplify=TRUE)
#'
#'
#'
#' #############
#' # multiple factors
#'
#' a <- matrix(c(abs(rnorm(5,1,.3)), rep(0,15),abs(rnorm(5,1,.3)),
#' rep(0,15),abs(rnorm(5,1,.3))), 15, 3)
#' d <- matrix(rnorm(15,0,.7),ncol=1)
#' mu <- c(-.4, -.7, .1)
#' sigma <- matrix(c(1.21,.297,1.232,.297,.81,.252,1.232,.252,1.96),3,3)
#' itemtype <- rep('2PL', nrow(a))
#' N <- 1000
#' dataset1 <- simdata(a, d, N, itemtype)
#' dataset2 <- simdata(a, d, N, itemtype, mu = mu, sigma = sigma)
#' dat <- rbind(dataset1, dataset2)
#' group <- c(rep('D1', N), rep('D2', N))
#'
#' # group models
#' model <- '
#' F1 = 1-5
#' F2 = 6-10
#' F3 = 11-15'
#'
#' # define mirt cluster to use parallel architecture
#' if(interactive()) mirtCluster()
#'
#' # EM approach (not as accurate with 3 factors, but generally good for quick model comparisons)
#' mod_configural <- multipleGroup(dat, model, group = group) #completely separate analyses
#' mod_metric <- multipleGroup(dat, model, group = group, invariance=c('slopes')) #equal slopes
#' mod_fullconstrain <- multipleGroup(dat, model, group = group, #equal means, slopes, intercepts
#' invariance=c('slopes', 'intercepts'))
#'
#' anova(mod_metric, mod_configural)
#' anova(mod_fullconstrain, mod_metric)
#'
#' # same as above, but with MHRM (generally more accurate with 3+ factors, but slower)
#' mod_configural <- multipleGroup(dat, model, group = group, method = 'MHRM')
#' mod_metric <- multipleGroup(dat, model, group = group, invariance=c('slopes'), method = 'MHRM')
#' mod_fullconstrain <- multipleGroup(dat, model, group = group, method = 'MHRM',
#' invariance=c('slopes', 'intercepts'))
#'
#' anova(mod_metric, mod_configural)
#' anova(mod_fullconstrain, mod_metric)
#'
#' ############
#' # polytomous item example
#' set.seed(12345)
#' a <- matrix(abs(rnorm(15,1,.3)), ncol=1)
#' d <- matrix(rnorm(15,0,.7),ncol=1)
#' d <- cbind(d, d-1, d-2)
#' itemtype <- rep('graded', nrow(a))
#' N <- 1000
#' dataset1 <- simdata(a, d, N, itemtype)
#' dataset2 <- simdata(a, d, N, itemtype, mu = .1, sigma = matrix(1.5))
#' dat <- rbind(dataset1, dataset2)
#' group <- c(rep('D1', N), rep('D2', N))
#' model <- 'F1 = 1-15'
#'
#' mod_configural <- multipleGroup(dat, model, group = group)
#' plot(mod_configural)
#' plot(mod_configural, type = 'SE')
#' itemplot(mod_configural, 1)
#' itemplot(mod_configural, 1, type = 'info')
#' plot(mod_configural, type = 'trace') # messy, score function typically better
#' plot(mod_configural, type = 'itemscore')
#'
#' fs <- fscores(mod_configural, full.scores = FALSE)
#' head(fs[["D1"]])
#' fscores(mod_configural, method = 'EAPsum', full.scores = FALSE)
#'
#' # constrain slopes within each group to be equal (but not across groups)
#' model2 <- 'F1 = 1-15
#' CONSTRAIN = (1-15, a1)'
#' mod_configural2 <- multipleGroup(dat, model2, group = group)
#' plot(mod_configural2, type = 'SE')
#' plot(mod_configural2, type = 'RE')
#' itemplot(mod_configural2, 10)
#'
#' ############
#' ## empirical histogram example (normal and bimodal groups)
#' set.seed(1234)
#' a <- matrix(rlnorm(50, .2, .2))
#' d <- matrix(rnorm(50))
#' ThetaNormal <- matrix(rnorm(2000))
#' ThetaBimodal <- scale(matrix(c(rnorm(1000, -2), rnorm(1000,2)))) #bimodal
#' Theta <- rbind(ThetaNormal, ThetaBimodal)
#' dat <- simdata(a, d, 4000, itemtype = '2PL', Theta=Theta)
#' group <- rep(c('G1', 'G2'), each=2000)
#'
#' EH <- multipleGroup(dat, 1, group=group, dentype="empiricalhist", invariance = colnames(dat))
#' coef(EH, simplify=TRUE)
#' plot(EH, type = 'empiricalhist', npts = 60)
#'
#' # DIF test for item 1
#' EH1 <- multipleGroup(dat, 1, group=group, dentype="empiricalhist", invariance = colnames(dat)[-1])
#' anova(EH, EH1)
#'
#' #--------------------------------
#' # Mixture model (no prior group variable specified)
#'
#' set.seed(12345)
#' nitems <- 20
#' a1 <- matrix(.75, ncol=1, nrow=nitems)
#' a2 <- matrix(1.25, ncol=1, nrow=nitems)
#' d1 <- matrix(rnorm(nitems,0,1),ncol=1)
#' d2 <- matrix(rnorm(nitems,0,1),ncol=1)
#' itemtype <- rep('2PL', nrow(a1))
#' N1 <- 500
#' N2 <- N1*2 # second class twice as large
#'
#' dataset1 <- simdata(a1, d1, N1, itemtype)
#' dataset2 <- simdata(a2, d2, N2, itemtype)
#' dat <- rbind(dataset1, dataset2)
#' # group <- c(rep('D1', N1), rep('D2', N2))
#'
#' # Mixture Rasch model (Rost, 1990)
#' models <- 'F1 = 1-20
#' CONSTRAIN = (1-20, a1)'
#' mod_mix <- multipleGroup(dat, models, dentype = 'mixture-2', GenRandomPars = TRUE)
#' coef(mod_mix, simplify=TRUE)
#' summary(mod_mix)
#' plot(mod_mix)
#' plot(mod_mix, type = 'trace')
#' itemplot(mod_mix, 1, type = 'info')
#'
#' head(fscores(mod_mix)) # theta estimates
#' head(fscores(mod_mix, method = 'classify')) # classification probability
#' itemfit(mod_mix)
#'
#' # Mixture 2PL model
#' mod_mix2 <- multipleGroup(dat, 1, dentype = 'mixture-2', GenRandomPars = TRUE)
#' anova(mod_mix, mod_mix2)
#' coef(mod_mix2, simplify=TRUE)
#' itemfit(mod_mix2)
#'
#' # Compare to single group
#' mod <- mirt(dat)
#' anova(mod, mod_mix2)
#'
#' ########################################
#' # Zero-inflated 2PL IRT model
#'
#' n <- 1000
#' nitems <- 20
#'
#' a <- rep(2, nitems)
#' d <- rep(c(-2,-1,0,1,2), each=nitems/5)
#' zi_p <- 0.2 # Proportion of people in zero class
#'
#' theta <- rnorm(n, 0, 1)
#' zeros <- matrix(0, n*zi_p, nitems)
#' nonzeros <- simdata(a, d, n*(1-zi_p), itemtype = '2PL',
#' Theta = as.matrix(theta[1:(n*(1-zi_p))]))
#' data <- rbind(nonzeros, zeros)
#'
#' # define class with extreme theta but fixed item parameters
#' zi2PL <- "F = 1-20
#' START [MIXTURE_1] = (GROUP, MEAN_1, -100), (GROUP, COV_11, .00001),
#' (1-20, a1, 1.0), (1-20, d, 0)
#' FIXED [MIXTURE_1] = (GROUP, MEAN_1), (GROUP, COV_11),
#' (1-20, a1), (1-20, d)"
#'
#' # define custom Theta integration grid that contains extreme theta + normal grid
#' technical <- list(customTheta = matrix(c(-100, seq(-6,6,length.out=61))))
#'
#' # fit ZIM-IRT
#' zi2PL.fit <- multipleGroup(data, zi2PL, dentype = 'mixture-2', technical=technical)
#' coef(zi2PL.fit, simplify=TRUE)
#'
#' # classification estimates
#' pi_hat <- fscores(zi2PL.fit, method = 'classify')
#' head(pi_hat)
#' tail(pi_hat)
#'
#' # EAP estimates (not useful for zip class)
#' fs <- fscores(zi2PL.fit)
#' head(fs)
#' tail(fs)
#'
#' }
multipleGroup <- function(data, model = 1, group, itemtype = NULL,
invariance = '', method = 'EM',
dentype = 'Gaussian', ...)
{
Call <- match.call()
dots <- list(...)
if(is.character(model)) model <- mirt.model(model)
if(!is.null(dots$formula))
stop('latent regression models not supported for multiple group yet', call.=FALSE) #TODO
invariance[invariance %in% c("free_mean")] <- 'free_means'
invariance[invariance %in% c("free_vars", 'free_variance', 'free_variances')] <- 'free_var'
constrain <- dots$constrain
invariance.check <- sum(invariance %in% c('free_means', 'free_var'))
if(any(invariance == 'intercepts')) invariance.check <- invariance.check - 1L
if(any(invariance == 'slopes')) invariance.check <- invariance.check - 1L
if(any(invariance %in% colnames(data)))
invariance.check <- invariance.check - 2L
if(!is.null(dots$dentype))
if(dots$dentype == "empiricalhist" && any(invariance.check))
stop('freeing group parameters not meaningful when estimating empirical histograms',
call.=FALSE)
if(invariance.check > 0L && length(constrain) == 0){
warn <- TRUE
if(is(model, 'mirt.model')){
if(any(model$x[,1L] == 'CONSTRAINB'))
warn <- FALSE
}
if(warn)
stop('Model is not identified without further constraints (may require additional
anchoring items).', call.=FALSE)
}
if(grepl('mixture', dentype)) group <- rep('full', nrow(data))
mod <- ESTIMATION(data=data, model=model, group=group, invariance=invariance, method=method,
itemtype=itemtype, dentype=dentype, ...)
if(is(mod, 'MultipleGroupClass') || is(mod, 'MixtureClass'))
mod@Call <- Call
return(mod)
}