#'Fit a Bayesian dynamic GAM to a univariate or multivariate set of time series
#'
#'This function estimates the posterior distribution for Generalised Additive Models (GAMs) that can include
#'smooth spline functions, specified in the GAM formula, as well as latent temporal processes,
#'specified by `trend_model`. Further modelling options include State-Space representations to allow covariates
#'and dynamic processes to occur on the latent 'State' level while also capturing observation-level effects.
#'Prior specifications are flexible and explicitly encourage users to apply
#'prior distributions that actually reflect their beliefs. In addition, model fits can easily be assessed and
#'compared with posterior predictive checks, forecast comparisons and leave-one-out / leave-future-out cross-validation.
#'
#'@importFrom parallel clusterExport stopCluster setDefaultCluster
#'@importFrom stats formula terms rnorm update.formula predict
#'@importFrom rlang missing_arg
#'@param formula A \code{character} string specifying the GAM observation model formula. These are exactly like the formula
#'for a GLM except that smooth terms, `s()`, `te()`, `ti()`, `t2()`, as well as time-varying
#'`dynamic()` terms, can be added to the right hand side
#'to specify that the linear predictor depends on smooth functions of predictors
#'(or linear functionals of these). In `nmix()` family models, the `formula` is used to
#'set up a linear predictor for the detection probability. Details of the formula syntax used by \pkg{mvgam}
#'can be found in \code{\link{mvgam_formulae}}
#'@param trend_formula An optional \code{character} string specifying the GAM process model formula. If
#'supplied, a linear predictor will be modelled for the latent trends to capture process model evolution
#'separately from the observation model. Should not have a response variable specified on the left-hand side
#'of the formula (i.e. a valid option would be `~ season + s(year)`). Also note that you should not use
#'the identifier `series` in this formula to specify effects that vary across time series. Instead you should use
#'`trend`. This will ensure that models in which a `trend_map` is supplied will still work consistently
#'(i.e. by allowing effects to vary across process models, even when some time series share the same underlying
#'process model). This feature is only currently available for `RW()`, `AR()` and `VAR()` trend models.
#'In `nmix()` family models, the `trend_formula` is used to set up a linear predictor for the underlying
#'latent abundance
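#'
#'As a hedged sketch (the variable names below are purely illustrative), a process-model
#'formula allowing a shared seasonal smooth plus an intercept that varies across the
#'latent processes could be written as:
#'```r
#'# note the use of 'trend' rather than 'series' for process-level effects
#'proc_form <- ~ s(season, bs = 'cc', k = 6) + s(trend, bs = 're')
#'```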
#'@param knots An optional \code{list} containing user specified knot values to be used for basis construction.
#'For most bases the user simply supplies the knots to be used, which must match up with the k value supplied
#'(note that the number of knots is not always just `k`). Different terms can use different numbers of knots,
#'unless they share a covariate
#'@param trend_knots As for `knots` above, this is an optional \code{list} of knot values for smooth
#'functions within the `trend_formula`
#'@param data A \code{dataframe} or \code{list} containing the model response variable and covariates
#'required by the GAM \code{formula} and optional \code{trend_formula}. Should include columns:
#'\itemize{
#' \item`series` (a \code{factor} index of the series IDs; the number of levels should be identical
#' to the number of unique series labels (i.e. `n_series = length(levels(data$series))`))
#' \item`time` (\code{numeric} or \code{integer} index of the time point for each observation).
#' For most dynamic trend types available in `mvgam` (see argument `trend_model`), time should be
#' measured in discrete, regularly spaced intervals (i.e. `c(1, 2, 3, ...)`). However you can
#' use irregularly spaced intervals if using `trend_model = CAR(1)`, though note that any
#'temporal intervals that are exactly `0` will be adjusted to a very small number
#'(`1e-12`) to prevent sampling errors. See an example of `CAR()` trends in \code{\link{CAR}}
#' }
#'Should also include any other variables to be included in the linear predictor of \code{formula}
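#'
#'As a minimal hedged sketch (all values purely illustrative), a correctly structured
#'object for two series observed over ten regular time steps could be built as:
#'```r
#'toy_dat <- data.frame(
#'  y = rpois(20, lambda = 3),
#'  season = rep(1:10, times = 2),
#'  time = rep(1:10, times = 2),
#'  series = factor(rep(c('series_1', 'series_2'), each = 10))
#')
#'```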
#'@param data_train Deprecated. Still works in place of \code{data} but users are recommended to use
#'\code{data} instead for more seamless integration into `R` workflows
#'@param newdata Optional \code{dataframe} or \code{list} of test data containing at least `series` and `time`
#'in addition to any other variables included in the linear predictor of \code{formula}. If included, the
#'observations in variable \code{y} will be set to \code{NA} when fitting the model so that posterior
#'simulations can be obtained
#'@param data_test Deprecated. Still works in place of \code{newdata} but users are recommended to use
#'\code{newdata} instead for more seamless integration into `R` workflows
#'@param run_model \code{logical}. If \code{FALSE}, the model is not fitted but instead the function will
#'return the model file and the data / initial values that are needed to fit the model outside of \code{mvgam}
#'@param prior_simulation \code{logical}. If \code{TRUE}, no observations are fed to the model, and instead
#'simulations from prior distributions are returned
#'@param return_model_data \code{logical}. If \code{TRUE}, the list of data that is needed to fit the
#'model is returned, along with the initial values for smooth and AR parameters, once the model is fitted.
#'This will be helpful if users wish to modify the model file to add
#'other stochastic elements that are not currently available in \code{mvgam}.
#'Default is \code{FALSE} to reduce
#'the size of the returned object, unless \code{run_model == FALSE}
#'@param family \code{family} specifying the exponential observation family for the series. Currently supported
#'families are:
#'\itemize{
#' \item`gaussian()` for real-valued data
#' \item`betar()` for proportional data on `(0,1)`
#' \item`lognormal()` for non-negative real-valued data
#' \item`student_t()` for real-valued data
#' \item`Gamma()` for non-negative real-valued data
#' \item`bernoulli()` for binary data
#' \item`poisson()` for count data
#' \item`nb()` for overdispersed count data
#' \item`binomial()` for count data with imperfect detection when the number of trials is known;
#' note that the `cbind()` function must be used to bind the discrete observations and the discrete number
#' of trials
#' \item`beta_binomial()` as for `binomial()` but allows for overdispersion
#' \item`nmix()` for count data with imperfect detection when the number of trials
#' is unknown and should be modeled via a State-Space N-Mixture model.
#' The latent states are Poisson, capturing the 'true' latent
#' abundance, while the observation process is Binomial to account for
#' imperfect detection.
#' See \code{\link{mvgam_families}} for an example of how to use this family}
#'Note that only `nb()` and `poisson()` are available if using `JAGS` as the backend.
#'Default is `poisson()`.
#'See \code{\link{mvgam_families}} for more details
#'@param share_obs_params \code{logical}. If \code{TRUE} and the \code{family}
#'has additional family-specific observation parameters (e.g. variance components in
#'`student_t()` or `gaussian()`, or dispersion parameters in `nb()` or `betar()`),
#'these parameters will be shared across all series. This is handy if you have multiple
#'time series that you believe share some properties, such as being from the same
#'species over different spatial units. Default is \code{FALSE}.
#'@param use_lv \code{logical}. If \code{TRUE}, use dynamic factors to estimate series'
#'latent trends in a reduced dimension format. Only available for
#'`RW()`, `AR()` and `GP()` trend models. Defaults to \code{FALSE}
#'@param n_lv \code{integer} the number of latent dynamic factors to use if \code{use_lv == TRUE}.
#'Cannot be \code{> n_series}. Defaults arbitrarily to \code{min(2, floor(n_series / 2))}
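#'
#'A hedged sketch of a dynamic factor specification (settings chosen only for
#'illustration, with simulated data from `sim_mvgam()`):
#'```r
#'# three series share two latent random walk factors
#'df_mod <- mvgam(y ~ s(season, bs = 'cc', k = 6),
#'                trend_model = RW(),
#'                use_lv = TRUE,
#'                n_lv = 2,
#'                data = sim_mvgam(n_series = 3)$data_train,
#'                run_model = FALSE)
#'```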
#'@param trend_model \code{character} or \code{function} specifying the time series dynamics for the latent trend. Options are:
#'\itemize{
#' \item `None` (no latent trend component; i.e. the GAM component is all that contributes to the linear predictor,
#'and the observation process is the only source of error; similar to what is estimated by \code{\link[mgcv]{gam}})
#' \item `'RW'` or `RW()`
#' \item `'AR1'` or `AR(p = 1)`
#' \item `'AR2'` or `AR(p = 2)`
#' \item `'AR3'` or `AR(p = 3)`
#' \item `'CAR1'` or `CAR(p = 1)`
#' \item `'VAR1'` or `VAR()` (only available in \code{Stan})
#' \item `'PWlogistic'`, `'PWlinear'` or `PW()` (only available in \code{Stan})
#' \item `'GP'` or `GP()` (Gaussian Process with squared exponential kernel;
#'only available in \code{Stan})}
#'
#'For all trend types apart from `GP()`, `CAR()` and `PW()`, moving average and/or correlated
#'process error terms can also be estimated (for example, `RW(cor = TRUE)` will set up a
#'multivariate Random Walk if `n_series > 1`). See [mvgam_trends] for more details
#'@param trend_map Optional `data.frame` specifying which series should depend on which latent
#'trends. Useful for allowing multiple series to depend on the same latent trend process, but with
#'different observation processes. If supplied, a latent factor model is set up by setting
#'`use_lv = TRUE` and using the mapping to set up the shared trends. Needs to have column names
#'`series` and `trend`, with integer values in the `trend` column to state which trend each series
#'should depend on. The `series` column should have a single unique entry for each series in the
#'data (names should perfectly match factor levels of the `series` variable in `data`). See examples
#'for details
#'@param drift \code{logical}. If \code{TRUE}, estimate a drift parameter in the latent trend components. Useful if the latent
#'trend is expected to broadly follow a non-zero slope. Only available for
#'`RW()` and `AR()` trend models. Note that if the latent trend is more or less stationary,
#'the drift parameter can become unidentifiable, especially if an intercept term is included in the GAM linear
#'predictor (which it is by default when calling \code{\link[mgcv]{jagam}}). Drift parameters will also likely
#'be unidentifiable if using dynamic factor models. Therefore this defaults to \code{FALSE}
#'@param noncentred \code{logical}. An experimental feature to use the non-centred parameterisation for autoregressive
#'dynamic trend models. Only available if there is no `trend_formula` and for certain trend models
#'(i.e. `RW`, `AR1`, `AR2`, `AR3`, or `CAR1` with or without `drift`). Not available for moving average or
#'correlated error models
#'@param chains \code{integer} specifying the number of parallel chains for the model. Ignored
#'if `algorithm %in% c('meanfield', 'fullrank', 'pathfinder', 'laplace')`
#'@param burnin \code{integer} specifying the number of warmup iterations of the Markov chain to run
#'to tune sampling algorithms. Ignored
#'if `algorithm %in% c('meanfield', 'fullrank', 'pathfinder', 'laplace')`
#'@param samples \code{integer} specifying the number of post-warmup iterations of the Markov chain to run for
#'sampling the posterior distribution
#'@param thin Thinning interval for monitors. Ignored
#'if `algorithm %in% c('meanfield', 'fullrank', 'pathfinder', 'laplace')`
#'@param parallel \code{logical} specifying whether multiple cores should be used for
#'generating MCMC simulations in parallel. If \code{TRUE}, the number of cores to use will be
#'\code{min(c(chains, parallel::detectCores() - 1))}
#'@param threads \code{integer} Experimental option to use multithreading for within-chain
#'parallelisation in \code{Stan}. We recommend its use only if you are experienced with
#'\code{Stan}'s `reduce_sum` function and have a slow running model that cannot be sped
#'up by any other means. Only available for some families (`poisson()`, `nb()`, `gaussian()`) and
#'when using \code{Cmdstan} as the backend
#'@param priors An optional \code{data.frame} with prior
#'definitions (in JAGS or Stan syntax). If using Stan, this can also be an object of
#'class `brmsprior` (see \code{\link[brms]{prior}} for details). See [get_mvgam_priors] and
#''Details' for more information on changing default prior distributions
#'@param refit Logical indicating whether this is a refit, called using [update.mvgam]. Users should leave
#'as `FALSE`
#'@param lfo Logical indicating whether this is part of a call to [lfo_cv.mvgam]. Returns a
#'lighter version of the model with no residuals and fewer monitored parameters to speed up
#'post-processing. But other downstream functions will not work properly, so users should always
#'leave this set as `FALSE`
#'@param residuals Logical indicating whether to compute series-level randomized quantile residuals and include
#'them as part of the returned object. Defaults to `TRUE`, but you can set to `FALSE` to save
#'computational time and reduce the size of the returned object (users can always add residuals to
#'an object of class `mvgam` using [add_residuals])
#'@param use_stan Logical. If \code{TRUE}, the model will be compiled and sampled using
#'Hamiltonian Monte Carlo with a call to \code{\link[cmdstanr]{cmdstan_model}} or
#'a call to \code{\link[rstan]{stan}}. Note that
#'there are many more options when using `Stan` vs `JAGS`
#'@param backend Character string naming the package to use as the backend for fitting
#'the Stan model (if `use_stan = TRUE`). Options are "cmdstanr" (the default) or "rstan". Can be set globally
#'for the current R session via the \code{"brms.backend"} option (see \code{\link{options}}). Details on
#'the rstan and cmdstanr packages are available at https://mc-stan.org/rstan/ and
#'https://mc-stan.org/cmdstanr/, respectively
#'@param algorithm Character string naming the estimation approach to use.
#' Options are \code{"sampling"} for MCMC (the default), \code{"meanfield"} for
#' variational inference with factorized normal distributions,
#' \code{"fullrank"} for variational inference with a multivariate normal
#' distribution, \code{"laplace"} for a Laplace approximation (only available
#' when using cmdstanr as the backend) or \code{"pathfinder"} for the pathfinder
#' algorithm (only currently available when using cmdstanr as the backend).
#' Can be set globally for the current \R session via the
#' \code{"brms.algorithm"} option (see \code{\link{options}}). Limited testing
#' suggests that `"meanfield"` performs best out of the non-MCMC approximations for
#' dynamic GAMs, possibly because of the difficulties estimating covariances among the
#' many spline parameters and latent trend parameters. But rigorous testing has not
#' been carried out
#'@param autoformat \code{Logical}. Use the `stanc` parser to automatically format the
#'`Stan` code and check for deprecations. Defaults to `TRUE`
#' @param save_all_pars \code{Logical} flag to indicate if draws from all
#' variables defined in Stan's \code{parameters} block should be saved
#' (default is \code{FALSE}).
#'@param max_treedepth positive integer placing a cap on the number of simulation steps evaluated during each iteration when
#'`use_stan == TRUE`. Default is `12`. Increasing this value can sometimes help with exploration of complex
#'posterior geometries, but it is rarely fruitful to go above a `max_treedepth` of `14`
#'@param adapt_delta positive numeric between `0` and `1` defining the target average proposal acceptance probability
#'during Stan's adaptation period, if `use_stan == TRUE`. Default is `0.8`. In general you should not need to change adapt_delta
#'unless you see a warning message about divergent transitions, in which case you can increase adapt_delta from the default
#'to a value closer to `1` (e.g. from `0.95` to `0.99`, or from `0.99` to `0.999`, etc).
#'The step size used by the numerical integrator is a function of `adapt_delta` in that increasing
#'`adapt_delta` will result in a smaller step size and fewer divergences. Increasing `adapt_delta` will
#'typically result in a slower sampler, but it will always lead to a more robust sampler
#'@param silent Verbosity level between `0` and `2`. If `1` (the default), most of the informational
#'messages of compiler and sampler are suppressed. If `2`, even more messages are suppressed. The
#'actual sampling progress is still printed. Set `refresh = 0` to turn this off as well. If using
#'`backend = "rstan"` you can also set open_progress = FALSE to prevent opening additional
#'progress bars.
#'@param jags_path Optional character vector specifying the path to the location of the `JAGS` executable (.exe) to use
#'for modelling if `use_stan == FALSE`. If missing, the path will be recovered from a call to \code{\link[runjags]{findjags}}
#'@param ... Further arguments passed to Stan.
#'For \code{backend = "rstan"} the arguments are passed to
#'\code{\link[rstan]{sampling}} or \code{\link[rstan]{vb}}.
#'For \code{backend = "cmdstanr"} the arguments are passed to the
#'\code{cmdstanr::sample}, \code{cmdstanr::variational},
#'\code{cmdstanr::laplace} or
#'\code{cmdstanr::pathfinder} method
#'@details Dynamic GAMs are useful when we wish to predict future values from time series that show temporal dependence
#'but we do not want to rely on extrapolating from a smooth term (which can sometimes lead to unpredictable and unrealistic behaviours).
#'In addition, smooths can often try to wiggle excessively to capture any autocorrelation that is present in a time series,
#'which exacerbates the problem of forecasting ahead. As GAMs are very naturally viewed through a Bayesian lens, and we often
#'must model time series that show complex distributional features and missing data, parameters for `mvgam` models are estimated
#'in a Bayesian framework using Markov Chain Monte Carlo by default. A general overview is provided
#'in the primary vignettes: `vignette("mvgam_overview")` and `vignette("data_in_mvgam")`.
#'For a full list of available vignettes see `vignette(package = "mvgam")`
#'\cr
#'\cr
#'*Formula syntax*: Details of the formula syntax used by \pkg{mvgam} can be found in
#'\code{\link{mvgam_formulae}}. Note that it is possible to supply an empty formula where
#'there are no predictors or intercepts in the observation model (i.e. `y ~ 0` or `y ~ -1`).
#'In this case, an intercept-only observation model will be set up but the intercept coefficient
#'will be fixed at zero. This can be handy if you wish to fit pure State-Space models where
#'the variation in the dynamic trend controls the average expectation, and/or where intercepts
#'are non-identifiable (as in piecewise trends, see examples below)
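#'
#'A hedged sketch of such a specification (data simulated only for illustration):
#'```r
#'# intercept-free observation model; seasonality and dynamics are handled
#'# entirely by the latent process model
#'ss_mod <- mvgam(y ~ 0,
#'                trend_formula = ~ s(season, bs = 'cc', k = 6),
#'                trend_model = AR(),
#'                family = poisson(),
#'                data = sim_mvgam(n_series = 3)$data_train,
#'                run_model = FALSE)
#'```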
#'\cr
#'\cr
#'*Families and link functions*: Details of families supported by \pkg{mvgam} can be found in
#'\code{\link{mvgam_families}}.
#'\cr
#'\cr
#'*Trend models*: Details of latent trend dynamic models supported by \pkg{mvgam} can be found in
#'\code{\link{mvgam_trends}}.
#'\cr
#'\cr
#'*Priors*: A \code{\link[mgcv]{jagam}} model file is generated from \code{formula} and
#'modified to include any latent
#'temporal processes. Default priors for intercepts and any scale parameters are generated
#'using the same practice as \pkg{brms}. Prior distributions for most important model parameters can be
#'altered by the user to inspect model
#'sensitivities to given priors (see \code{\link{get_mvgam_priors}} for details).
#'Note that latent trends are estimated on the link scale so choose priors
#'accordingly. However more control over the model specification can be accomplished by first using `mvgam` as a
#'baseline, then editing the returned model accordingly. The model file can be edited and run outside
#'of `mvgam` by setting \code{run_model = FALSE} and this is encouraged for complex
#'modelling tasks. Note, no priors are
#'formally checked to ensure they are in the right syntax for the respective
#'probabilistic modelling framework, so it is
#'up to the user to ensure these are correct (i.e. use `dnorm` for normal
#'densities in `JAGS`, with the mean and precision
#'parameterisation; but use `normal` for normal densities in `Stan`, with the mean
#'and standard deviation parameterisation)
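#'
#'As a hedged sketch of this workflow (the particular prior shown is illustrative, not a
#'recommendation), default priors can first be inspected with `get_mvgam_priors()` and then
#'replaced with `brms`-style definitions when using `Stan`:
#'```r
#'sim_dat <- sim_mvgam(n_series = 3)
#'
#'# inspect which priors can be changed for this model setup
#'get_mvgam_priors(y ~ s(season, bs = 'cc', k = 6),
#'                 data = sim_dat$data_train,
#'                 family = poisson(),
#'                 trend_model = AR())
#'
#'# supply a brmsprior for the observation-model coefficients
#'mod_priors <- mvgam(y ~ s(season, bs = 'cc', k = 6),
#'                    data = sim_dat$data_train,
#'                    family = poisson(),
#'                    trend_model = AR(),
#'                    priors = brms::prior(normal(0, 2), class = b),
#'                    run_model = FALSE)
#'```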
#'\cr
#'\cr
#'*Random effects*: For any smooth terms using the random effect basis (\code{\link[mgcv]{smooth.construct.re.smooth.spec}}),
#'a non-centred parameterisation is automatically employed to avoid degeneracies that are common in hierarchical models.
#'Note however that centred versions may perform better for series that are particularly informative, so as with any
#'foray into Bayesian modelling, it is worth building an understanding of the model's assumptions and limitations by following a
#'principled workflow. Also note that models are parameterised using `drop.unused.levels = FALSE` in \code{\link[mgcv]{jagam}}
#'to ensure predictions can be made for all levels of the supplied factor variable
#'\cr
#'\cr
#'*Observation level parameters*: When more than one series is included in \code{data} and an
#'observation family that contains more than one parameter is used, additional observation family parameters
#'(i.e. `phi` for `nb()` or `sigma` for `gaussian()`) are
#'by default estimated independently for each series. But if you wish for the series to share
#'the same observation parameters, set `share_obs_params = TRUE`
#'\cr
#'\cr
#'*Factor regularisation*: When using a dynamic factor model for the trends with `JAGS`, factor precisions are given
#'regularized penalty priors that theoretically allow some factors to be dropped from the model by squeezing the
#'variances of unneeded factors towards zero. This helps protect against selecting more latent factors than are needed to
#'capture dependencies in the data, so it can often be advantageous to set `n_lv` to a slightly larger number. However
#'larger numbers of factors do come with additional computational costs so these should be balanced as well. When using
#'`Stan`, all factors are parameterised with fixed variance parameters
#'\cr
#'\cr
#'*Residuals*: For each series, randomized quantile (i.e. Dunn-Smyth) residuals are calculated for inspecting model diagnostics.
#'If the fitted model is appropriate then Dunn-Smyth residuals will be standard normal in distribution and no
#'autocorrelation will be evident. When a particular observation is missing, the residual is calculated by comparing independent
#'draws from the model's posterior distribution
#'\cr
#'\cr
#'*Using Stan*: `mvgam` is primarily designed to use Hamiltonian Monte Carlo for parameter estimation
#'via the software `Stan` (using either the `cmdstanr` or `rstan` interface).
#'There are great advantages when using `Stan` over Gibbs / Metropolis Hastings samplers, which include the option
#'to estimate smooth latent trends via [Hilbert space approximate Gaussian Processes](https://arxiv.org/abs/2004.11408).
#'This often makes sense for ecological series, which we expect to change smoothly. In `mvgam`, latent squared
#'exponential GP trends are approximated using \code{20} basis functions by default, which saves computational
#'costs compared to fitting full GPs while adequately estimating
#'GP \code{alpha} and \code{rho} parameters. Because of the many advantages of `Stan` over `JAGS`,
#'*further development of the package will only be applied to `Stan`*. This includes the planned addition
#'of more response distributions, plans to handle zero-inflation, and plans to incorporate a greater
#'variety of trend models. Users are strongly encouraged to opt for `Stan` over `JAGS` in any new workflows
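#'
#'A hedged sketch of requesting an approximate GP trend (data simulated only for
#'illustration):
#'```r
#'# squared exponential GP trend, approximated with Hilbert space basis functions
#'gp_mod <- mvgam(y ~ s(season, bs = 'cc', k = 6),
#'                trend_model = GP(),
#'                family = poisson(),
#'                data = sim_mvgam(n_series = 3)$data_train,
#'                run_model = FALSE)
#'```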
#'@author Nicholas J Clark
#'@references Nicholas J Clark & Konstans Wells (2023). Dynamic generalised additive models (DGAMs) for forecasting discrete ecological time series.
#'Methods in Ecology and Evolution. 14:3, 771-784.
#'@seealso \code{\link[mgcv]{jagam}}, \code{\link[mgcv]{gam}}, \code{\link[mgcv]{gam.models}}
#'@return A \code{list} object of class \code{mvgam} containing model output, the text representation of the model file,
#'the mgcv model output (for easily generating simulations at
#'unsampled covariate values), Dunn-Smyth residuals for each series and key information needed
#'for other functions in the package. See \code{\link{mvgam-class}} for details.
#'Use `methods(class = "mvgam")` for an overview on available methods.
#'
#'@examples
#'\donttest{
#'# Simulate a collection of three time series that have shared seasonal dynamics
#'# and independent AR1 trends, with a Poisson observation process
#'set.seed(0)
#'dat <- sim_mvgam(T = 80,
#' n_series = 3,
#' mu = 2,
#' trend_model = AR(p = 1),
#' prop_missing = 0.1,
#' prop_trend = 0.6)
#'
#'# Plot key summary statistics for a single series
#'plot_mvgam_series(data = dat$data_train, series = 1)
#'
#'# Plot all series together
#'plot_mvgam_series(data = dat$data_train, series = 'all')
#'
#'# Formulate a model using Stan where series share a cyclic smooth for
#'# seasonality and each series has an independent AR1 temporal process;
#'# Set run_model = FALSE to inspect the returned objects
#'mod1 <- mvgam(formula = y ~ s(season, bs = 'cc', k = 6),
#' data = dat$data_train,
#' trend_model = AR(),
#' family = poisson(),
#' use_stan = TRUE,
#' run_model = FALSE)
#'
#' # View the model code in Stan language
#' code(mod1)
#'
#' # Now fit the model, noting that 'noncentred = TRUE' will likely give performance gains
#' mod1 <- mvgam(formula = y ~ s(season, bs = 'cc', k = 6),
#' data = dat$data_train,
#' trend_model = AR(),
#' family = poisson(),
#' noncentred = TRUE,
#' chains = 2)
#'
#' # Extract the model summary
#' summary(mod1)
#'
#' # Plot the estimated historical trend and forecast for one series
#' plot(mod1, type = 'trend', series = 1)
#' plot(mod1, type = 'forecast', series = 1)
#'
#' # Residual diagnostics
#' plot(mod1, type = 'residuals', series = 1)
#' resids <- residuals(mod1)
#' str(resids)
#'
#' # Compute the forecast using covariate information in data_test
#' fc <- forecast(mod1, newdata = dat$data_test)
#' str(fc)
#' plot(fc)
#'
#' # Plot the estimated seasonal smooth function
#' plot(mod1, type = 'smooths')
#'
#' # Plot estimated first derivatives of the smooth
#' plot(mod1, type = 'smooths', derivatives = TRUE)
#'
#' # Plot partial residuals of the smooth
#' plot(mod1, type = 'smooths', residuals = TRUE)
#'
#' # Plot posterior realisations for the smooth
#' plot(mod1, type = 'smooths', realisations = TRUE)
#'
#' # Plot conditional response predictions using marginaleffects
#' conditional_effects(mod1)
#' plot_predictions(mod1, condition = 'season', points = 0.5)
#'
#' # Generate posterior predictive checks using bayesplot
#' pp_check(mod1)
#'
#' # Extract observation model beta coefficient draws as a data.frame
#' beta_draws_df <- as.data.frame(mod1, variable = 'betas')
#' head(beta_draws_df)
#' str(beta_draws_df)
#'
#' # Investigate model fit
#' mc.cores.def <- getOption('mc.cores')
#' options(mc.cores = 1)
#' loo(mod1)
#' options(mc.cores = mc.cores.def)
#'
#'
#' # Example of supplying a trend_map so that some series can share
#' # latent trend processes
#' sim <- sim_mvgam(n_series = 3)
#' mod_data <- sim$data_train
#'
#' # Here, we specify only two latent trends; series 1 and 2 share a trend,
#' # while series 3 has its own unique latent trend
#' trend_map <- data.frame(series = unique(mod_data$series),
#' trend = c(1, 1, 2))
#'
#' # Fit the model using AR1 trends
#' mod <- mvgam(y ~ s(season, bs = 'cc', k = 6),
#' trend_map = trend_map,
#' trend_model = AR(),
#' data = mod_data,
#' return_model_data = TRUE,
#' chains = 2)
#'
#' # The mapping matrix is now supplied as data to the model in the 'Z' element
#' mod$model_data$Z
#' code(mod)
#'
#' # The first two series share an identical latent trend; the third is different
#' plot(mod, type = 'trend', series = 1)
#' plot(mod, type = 'trend', series = 2)
#' plot(mod, type = 'trend', series = 3)
#'
#'
#' # Example of how to use dynamic coefficients
#' # Simulate a time-varying coefficient for the effect of temperature
#' set.seed(123)
#' N <- 200
#' beta_temp <- vector(length = N)
#' beta_temp[1] <- 0.4
#' for(i in 2:N){
#' beta_temp[i] <- rnorm(1, mean = beta_temp[i - 1] - 0.0025, sd = 0.05)
#' }
#' plot(beta_temp)
#'
#' # Simulate a covariate called 'temp'
#' temp <- rnorm(N, sd = 1)
#'
#' # Simulate the Gaussian observation process
#' out <- rnorm(N, mean = 4 + beta_temp * temp,
#' sd = 0.5)
#'
#' # Gather necessary data into a data.frame; split into training / testing
#' data = data.frame(out, temp, time = seq_along(temp))
#' data_train <- data[1:180,]
#' data_test <- data[181:200,]
#'
#' # Fit the model using the dynamic() formula helper
#' mod <- mvgam(out ~
#' dynamic(temp,
#' scale = FALSE,
#' k = 40),
#' family = gaussian(),
#' data = data_train,
#' newdata = data_test,
#' chains = 2)
#'
#' # Inspect the model summary, forecast and time-varying coefficient distribution
#' summary(mod)
#' plot(mod, type = 'smooths')
#' fc <- forecast(mod, newdata = data_test)
#' plot(fc)
#'
#' # Propagating the smooth term shows how the coefficient is expected to evolve
#' plot_mvgam_smooth(mod, smooth = 1, newdata = data)
#' abline(v = 180, lty = 'dashed', lwd = 2)
#' points(beta_temp, pch = 16)
#'
#'
#' # Example showing how to incorporate an offset; simulate some count data
#' # with different means per series
#' set.seed(100)
#' dat <- sim_mvgam(prop_trend = 0, mu = c(0, 2, 2), seasonality = 'hierarchical')
#'
#' # Add offset terms to the training and testing data
#' dat$data_train$offset <- 0.5 * as.numeric(dat$data_train$series)
#' dat$data_test$offset <- 0.5 * as.numeric(dat$data_test$series)
#'
#' # Fit a model that includes the offset in the linear predictor as well as
#' # hierarchical seasonal smooths
#' mod <- mvgam(formula = y ~ offset(offset) +
#' s(series, bs = 're') +
#' s(season, bs = 'cc') +
#' s(season, by = series, m = 1, k = 5),
#' data = dat$data_train,
#' chains = 2)
#'
#' # Inspect the model file to see the modification to the linear predictor
#' # (eta)
#' code(mod)
#'
#' # Forecasts for the first two series will differ in magnitude
#' fc <- forecast(mod, newdata = dat$data_test)
#' layout(matrix(1:2, ncol = 2))
#' plot(fc, series = 1, ylim = c(0, 75))
#' plot(fc, series = 2, ylim = c(0, 75))
#' layout(1)
#'
#' # Changing the offset for the testing data should lead to changes in
#' # the forecast
#' dat$data_test$offset <- dat$data_test$offset - 2
#' fc <- forecast(mod, newdata = dat$data_test)
#' plot(fc)
#'
#' # Relative Risks can be computed by fixing the offset to the same value
#' # for each series
#' dat$data_test$offset <- rep(1, NROW(dat$data_test))
#' preds_rr <- predict(mod, type = 'link', newdata = dat$data_test,
#' summary = FALSE)
#' series1_inds <- which(dat$data_test$series == 'series_1')
#' series2_inds <- which(dat$data_test$series == 'series_2')
#'
#' # Relative Risks are now more comparable among series
#' layout(matrix(1:2, ncol = 2))
#' plot(preds_rr[1, series1_inds], type = 'l', col = 'grey75',
#' ylim = range(preds_rr),
#' ylab = 'Series1 Relative Risk', xlab = 'Time')
#' for(i in 2:50){
#' lines(preds_rr[i, series1_inds], col = 'grey75')
#' }
#'
#' plot(preds_rr[1, series2_inds], type = 'l', col = 'darkred',
#' ylim = range(preds_rr),
#' ylab = 'Series2 Relative Risk', xlab = 'Time')
#' for(i in 2:50){
#' lines(preds_rr[i, series2_inds], col = 'darkred')
#' }
#' layout(1)
#'
#'
#' # Example showcasing how cbind() is needed for Binomial observations
#' # Simulate two time series of Binomial trials
#' trials <- sample(c(20:25), 50, replace = TRUE)
#' x <- rnorm(50)
#' detprob1 <- plogis(-0.5 + 0.9*x)
#' detprob2 <- plogis(-0.1 -0.7*x)
#' dat <- rbind(data.frame(y = rbinom(n = 50, size = trials, prob = detprob1),
#' time = 1:50,
#' series = 'series1',
#' x = x,
#' ntrials = trials),
#' data.frame(y = rbinom(n = 50, size = trials, prob = detprob2),
#' time = 1:50,
#' series = 'series2',
#' x = x,
#' ntrials = trials))
#' dat <- dplyr::mutate(dat, series = as.factor(series))
#' dat <- dplyr::arrange(dat, time, series)
#' plot_mvgam_series(data = dat, series = 'all')
#'
#' # Fit a model using the binomial() family; must specify observations
#' # and number of trials in the cbind() wrapper
#' mod <- mvgam(cbind(y, ntrials) ~ series + s(x, by = series),
#' family = binomial(),
#' data = dat,
#' chains = 2)
#' summary(mod)
#' pp_check(mod, type = "bars_grouped",
#' group = "series", ndraws = 50)
#' pp_check(mod, type = "ecdf_overlay_grouped",
#' group = "series", ndraws = 50)
#' conditional_effects(mod, type = 'link')
#' }
#'@export
mvgam = function(formula,
trend_formula,
knots,
trend_knots,
data,
data_train,
newdata,
data_test,
run_model = TRUE,
prior_simulation = FALSE,
return_model_data = FALSE,
family = 'poisson',
share_obs_params = FALSE,
use_lv = FALSE,
n_lv,
trend_map,
trend_model = 'None',
drift = FALSE,
noncentred = FALSE,
chains = 4,
burnin = 500,
samples = 500,
thin = 1,
parallel = TRUE,
threads = 1,
priors,
refit = FALSE,
lfo = FALSE,
residuals = TRUE,
use_stan = TRUE,
backend = getOption("brms.backend", "cmdstanr"),
algorithm = getOption("brms.algorithm", "sampling"),
autoformat = TRUE,
save_all_pars = FALSE,
max_treedepth = 12,
adapt_delta = 0.85,
silent = 1,
jags_path,
...){
# Check data arguments
if(missing("data") & missing("data_train")){
stop('Argument "data" is missing with no default')
}
if(!missing("data")) data_train <- data
if(!missing("newdata")) data_test <- newdata
orig_data <- data_train
# Validate trend_model
orig_trend_model <- trend_model
trend_model <- validate_trend_model(orig_trend_model,
drift = drift,
noncentred = noncentred)
# Ensure series and time variables are present
data_train <- validate_series_time(data_train, name = 'data',
trend_model = trend_model)
# Validate the formula to convert any dynamic() terms
formula <- interpret_mvgam(formula,
N = max(data_train$index..time..index),
family = family)
# Check sampler arguments
validate_pos_integer(chains)
validate_pos_integer(threads)
validate_pos_integer(burnin)
validate_pos_integer(samples)
validate_pos_integer(thin)
validate_silent(silent)
# Upper bounds no longer supported as they are fairly useless
upper_bounds <- rlang::missing_arg()
# Check for gp terms in the validated formula
list2env(check_gp_terms(formula, data_train, family = family),
envir = environment())
# Check for missing rhs in formula
list2env(check_obs_intercept(formula, orig_formula),
envir = environment())
# Check for brmspriors
if(!missing(priors)){
if(inherits(priors, 'brmsprior') & !lfo & use_stan){
priors <- adapt_brms_priors(priors = priors,
formula = orig_formula,
trend_formula = trend_formula,
data = data_train,
family = family,
use_lv = use_lv,
n_lv = n_lv,
trend_model = trend_model,
trend_map = trend_map,
drift = drift,
warnings = TRUE,
knots = knots)
}
}
# Ensure series and time variables are present
data_train <- validate_series_time(data_train, name = 'data',
trend_model = trend_model)
if(!missing(data_test)){
data_test <- validate_series_time(data_test,
name = 'newdata',
trend_model = trend_model)
if(trend_model == 'CAR1'){
data_test$index..time..index <- data_test$index..time..index +
max(data_train$index..time..index)
}
}
# Lighten the final object if this is an lfo run
if(lfo) return_model_data <- FALSE
# Validate observation formula
formula <- interpret_mvgam(formula, N = max(data_train$index..time..index))
data_train <- validate_obs_formula(formula,
data = data_train,
refit = refit)
if(!missing(data_test)){
data_test <- validate_obs_formula(formula,
data = data_test,
refit = refit)
}
if(is.null(attr(terms(formula(formula)), 'offset'))){
offset <- FALSE
} else {
offset <- TRUE
}
# Ensure fitting software can be located
if(!use_stan & run_model) find_jags(jags_path = jags_path)
if(use_stan & run_model) find_stan()
# Validate the family and threads arguments
family <- validate_family(family, use_stan = use_stan)
family_char <- match.arg(arg = family$family,
choices = family_char_choices())
threads <- validate_threads(family_char, threads)
# Nmixture additions?
list2env(check_nmix(family, family_char,
trend_formula, trend_model,
trend_map, data_train), envir = environment())
# Validate remaining trend arguments
trend_val <- validate_trend_restrictions(trend_model = trend_model,
formula = formula,
trend_formula = trend_formula,
trend_map = trend_map,
drift = drift,
drop_obs_intercept = drop_obs_intercept,
use_lv = use_lv,
n_lv = n_lv,
data_train = data_train,
use_stan = use_stan)
list2env(trend_val, envir = environment())
if(is.null(trend_map)) trend_map <- rlang::missing_arg()
if(is.null(n_lv)) n_lv <- rlang::missing_arg()
# Some general family-level restrictions can now be checked
orig_y <- data_train$y
if(any(!is.na(orig_y))){
validate_family_resrictions(response = orig_y, family = family)
}
# Fill in missing observations in data_train so the size of the dataset is correct when
# building the initial JAGS model
resp_terms <- as.character(terms(formula(formula))[[2]])
if(length(resp_terms) == 1){
out_name <- as.character(terms(formula(formula))[[2]])
} else {
if(any(grepl('cbind', resp_terms))){
resp_terms <- resp_terms[-grepl('cbind', resp_terms)]
out_name <- resp_terms[1]
}
}
data_train[[out_name]] <- replace_nas(data_train[[out_name]])
# Compute default priors
if(use_stan){
def_priors <- adapt_brms_priors(c(make_default_scales(orig_y,
family),
make_default_int(orig_y,
family = if(add_nmix){
nmix()
} else {
family
})),
formula = orig_formula,
trend_formula = trend_formula,
data = orig_data,
family = family,
use_lv = use_lv,
n_lv = n_lv,
trend_model = trend_model,
trend_map = trend_map,
drift = drift,
knots = knots)
}
# Initiate the GAM model using mgcv so that the linear predictor matrix can be easily calculated
# when simulating from the Bayesian model later on;
ss_gam <- try(mvgam_setup(formula = formula,
knots = knots,
family = family_to_mgcvfam(family),
dat = data_train),
silent = TRUE)
if(inherits(ss_gam, 'try-error')){
if(grepl('missing values', ss_gam[1])){
stop(paste('Missing values found in data predictors:\n',
attr(ss_gam, 'condition')),
call. = FALSE)
} else {
stop(paste(ss_gam[1]),
call. = FALSE)
}
}
# Check the test data for NAs as well using predict.gam
testdat_pred <- try(predict(ss_gam,
newdata = data_test,
na.action = na.fail),
silent = TRUE)
if(inherits(testdat_pred, 'try-error')){
if(grepl('missing values', testdat_pred[1])){
stop(paste('Missing values found in newdata predictors:\n',
attr(testdat_pred, 'condition')),
call. = FALSE)
}
}
# Make JAGS file and appropriate data structures
list2env(jagam_setup(ss_gam = ss_gam,
formula = formula,
data_train = data_train,
family = family,
family_char = family_char,
knots = knots), envir = environment())
# Update initial values of lambdas using the full estimates from the
# fitted gam model to speed convergence; remove initial betas so that the
# chains can start in very different regions of the parameter space
ss_jagam$jags.ini$b <- NULL
if(length(ss_gam$sp) == length(ss_jagam$jags.ini$lambda)){
ss_jagam$jags.ini$lambda <- ss_gam$sp
ss_jagam$jags.ini$lambda[log(ss_jagam$jags.ini$lambda) > 10] <- exp(10)
}
if(length(ss_gam$smooth) == 0) ss_jagam$jags.ini$lambda <- NULL
# Fill y with NAs if this is a simulation from the priors;
# otherwise replace with the original supplied values
data_train <- check_priorsim(prior_simulation,
data_train, orig_y,
formula)
# Read in the base (unmodified) jags model file
base_model <- suppressWarnings(readLines(file_name))
# Remove lines from the linear predictor section
lines_remove <- c(1:grep('## response', base_model))
base_model <- base_model[-lines_remove]
if(any(grepl('scale <- 1/tau', base_model, fixed = TRUE))){
base_model <- base_model[-grep('scale <- 1/tau',
base_model, fixed = TRUE)]
}
if(any(grepl('tau ~ dgamma(.05,.005)', base_model, fixed = TRUE))){
base_model <- base_model[-grep('tau ~ dgamma(.05,.005)',
base_model, fixed = TRUE)]
}
# Any parametric effects in the gam (particularly the intercept) need sensible priors to ensure they
# do not directly compete with the latent trends
if(any(grepl('Parametric effect priors', base_model))){
in_parenth <- regmatches(base_model[grep('Parametric effect priors',
base_model) + 1],
gregexpr( "(?<=\\().+?(?=\\))", base_model[grep('Parametric effect priors',
base_model) + 1], perl = T))[[1]][1]
n_terms <- as.numeric(sub(".*:", "", in_parenth))
ss_jagam$jags.data$p_coefs <- coef(ss_gam)[1:n_terms]
# Use the initialised GAM's estimates for parametric effects, but widen them
# substantially to allow for better exploration
beta_sims <- rmvn(100, coef(ss_gam), ss_gam$Vp)
ss_jagam$jags.data$p_taus <- apply(as.matrix(beta_sims[,1:n_terms]),
2, function(x) 1 / (sd(x) ^ 2))
base_model[grep('Parametric effect priors',
base_model) + 1] <- paste0(' for (i in 1:',
n_terms,
') { b[i] ~ dnorm(p_coefs[i], p_taus[i]) }')
base_model[grep('Parametric effect priors',
base_model)] <- c(' ## parametric effect priors (regularised for identifiability)')
}
# For any random effect smooths, use non-centred parameterisation to avoid degeneracies
# For monotonic smooths, need to determine which direction to place
# coefficient constraints
smooth_labs <- do.call(rbind, lapply(seq_along(ss_gam$smooth), function(x){
data.frame(label = ss_gam$smooth[[x]]$label,
class = class(ss_gam$smooth[[x]])[1],
id = ifelse(is.null(ss_gam$smooth[[x]]$id),
NA, ss_gam$smooth[[x]]$id))
}))
# Check for 'id' arguments, which are not yet supported
if(any(!is.na(smooth_labs$id))){
stop('smooth terms with the "id" argument not yet supported by mvgam',
call. = FALSE)
}
if(any(smooth_labs$class == 'random.effect')){
re_smooths <- smooth_labs %>%
dplyr::filter(class == 'random.effect') %>%
dplyr::pull(label)
for(i in 1:length(re_smooths)){
# If there are multiple smooths with this label, find out where the random effect
# smooth sits
smooth_labs %>%
dplyr::filter(label == re_smooths[i]) %>%
dplyr::mutate(smooth_number = dplyr::row_number()) %>%
dplyr::filter(class == 'random.effect') %>%
dplyr::pull(smooth_number) -> smooth_number
in_parenth <- regmatches(base_model[grep(re_smooths[i],
base_model, fixed = T)[smooth_number] + 1],
gregexpr( "(?<=\\().+?(?=\\))",
base_model[grep(re_smooths[i],
base_model, fixed = T)[smooth_number] + 1],
perl = T))[[1]][1]
n_terms <- as.numeric(sub(".*:", "", in_parenth))
n_start <- as.numeric(strsplit(sub(".*\\(", "", in_parenth), ':')[[1]][1])
base_model[grep(re_smooths[i],
base_model, fixed = T)[smooth_number] + 1] <- paste0(' for (i in ', n_start, ':',
n_terms,
') {\n b_raw[i] ~ dnorm(0, 1)\n',
'b[i] <- ',
paste0('mu_raw', i), ' + b_raw[i] * ',
paste0('sigma_raw', i), '\n }\n ',
paste0('sigma_raw', i), ' ~ dexp(0.5)\n',
paste0('mu_raw', i), ' ~ dnorm(0, 1)')
base_model[grep(re_smooths[i],
base_model, fixed = T)[smooth_number]] <- paste0(' ## prior (non-centred) for ', re_smooths[i], '...')
}
}
base_model[grep('smoothing parameter priors',
base_model)] <- c(' ## smoothing parameter priors...')
# Remove the fakery lines if they were added
if(!smooths_included){
base_model <- base_model[-c(grep('## prior for s(fakery)',
trimws(base_model), fixed = TRUE):
(grep('## prior for s(fakery)',
trimws(base_model), fixed = TRUE) + 7))]
}
# Add replacement lines for priors, trends and the linear predictor
fil <- tempfile(fileext = ".xt")
modification <- add_base_dgam_lines(use_lv)
cat(c(readLines(textConnection(modification)), base_model), file = fil,
sep = "\n")
model_file <- trimws(readLines(fil, n = -1))
# Modify observation distribution lines
if(family_char == 'tweedie'){
model_file <- add_tweedie_lines(model_file, upper_bounds = upper_bounds)
} else if(family_char == 'poisson'){
model_file <- add_poisson_lines(model_file, upper_bounds = upper_bounds)
} else {
if(missing(upper_bounds)){
model_file[grep('y\\[i, s\\] ~', model_file)] <- ' y[i, s] ~ dnegbin(rate[i, s], phi[s])'
model_file[grep('ypred\\[i, s\\] ~', model_file)] <- ' ypred[i, s] ~ dnegbin(rate[i, s], phi[s])'
}
}
# Modify lines needed for the specified trend model
model_file <- add_trend_lines(model_file, stan = FALSE,
use_lv = use_lv,
trend_model = if(trend_model %in%
c('RW', 'VAR1',
'PWlinear', 'PWlogistic')){'RW'} else {trend_model},
drift = drift)
# Use informative priors based on the fitted mgcv model to speed convergence
# and eliminate searching over strange parameter spaces
if(length(ss_gam$sp) == length(ss_jagam$jags.ini$lambda)){
model_file[grep('lambda\\[i\\] ~', model_file)] <- ' lambda[i] ~ dexp(1/sp[i])'
} else {
model_file[grep('lambda\\[i\\] ~', model_file)] <- ' lambda[i] ~ dexp(0.05)'
}
# Final tidying of the JAGS model for readability
clean_up <- vector()
for(x in 2:length(model_file)){
clean_up[x] <- model_file[x-1] == "" & model_file[x] == ""
}
clean_up[is.na(clean_up)] <- FALSE
model_file <- model_file[!clean_up]
# Add in the offset if needed
if(offset){
model_file[grep('eta <- X %*% b', model_file, fixed = TRUE)] <-
"eta <- X %*% b + offset"
model_file[grep('eta <- X * b', model_file, fixed = TRUE)] <-
"eta <- X * b + offset"
if(!missing(data_test) & !prior_simulation){
ss_jagam$jags.data$offset <- c(ss_jagam$jags.data$offset,
data_test[[get_offset(ss_gam)]])
}
}
model_file_jags <- textConnection(model_file)
# Covariate dataframe including training and testing observations
if(!missing(data_test) & !prior_simulation){
suppressWarnings(lp_test <- try(predict(ss_gam,
newdata = data_test,
type = 'lpmatrix'),