diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 534a2a6..6b2d851 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -21,6 +21,11 @@ jobs: arch: [x64] os: [ubuntu-18.04] # macos-10.15, windows-2019 steps: + - name: Cancel Previous Runs + uses: styfle/cancel-workflow-action@0.9.1 + with: + all_but_latest: true + access_token: ${{ github.token }} - name: Checkout uses: actions/checkout@v2 - name: Julia Setup diff --git a/.github/workflows/documenter.yml b/.github/workflows/documenter.yml index e18d69c..b188c60 100644 --- a/.github/workflows/documenter.yml +++ b/.github/workflows/documenter.yml @@ -11,6 +11,11 @@ jobs: name: Documentation runs-on: ubuntu-latest steps: + - name: Cancel Previous Runs + uses: styfle/cancel-workflow-action@0.9.1 + with: + all_but_latest: true + access_token: ${{ github.token }} - uses: actions/checkout@v2 - uses: julia-actions/julia-buildpkg@latest - uses: julia-actions/julia-docdeploy@latest diff --git a/Project.toml b/Project.toml index 937ab34..67cb9f9 100644 --- a/Project.toml +++ b/Project.toml @@ -1,16 +1,19 @@ name = "MixedModelsExtras" uuid = "781a26e1-49f4-409a-8f4c-c3159d78c17e" authors = ["Phillip Alday and contributors"] -version = "0.1.2" +version = "0.1.3" [deps] +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" MixedModels = "ff71e718-51f3-5ec2-a782-8ffcbfa3c316" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" +StatsModels = "3eaba693-59b7-5ba5-a881-562e759f1c8d" [compat] MixedModels = "3, 4" StatsBase = "0.33" +StatsModels = "0.6.28" julia = "1.6" [extras] diff --git a/docs/src/api.md b/docs/src/api.md index fd25a9f..bb8eb6e 100644 --- a/docs/src/api.md +++ b/docs/src/api.md @@ -24,3 +24,17 @@ adjr2 ```@docs icc ``` + +## Variance Inflation Factor + +```@docs +vif +``` + +```@docs +termnames +``` + +```@docs +gvif +``` diff --git a/src/MixedModelsExtras.jl b/src/MixedModelsExtras.jl index 66ab07c..a42ccdd 100644 --- a/src/MixedModelsExtras.jl +++ b/src/MixedModelsExtras.jl @@ -2,12 +2,16 @@ module MixedModelsExtras export icc export r², r2, adjr², adjr2 +export termnames, gvif, vif +using LinearAlgebra using MixedModels using Statistics using StatsBase +using StatsModels include("icc.jl") include("r2.jl") +include("vif.jl") end diff --git a/src/vif.jl b/src/vif.jl new file mode 100644 index 0000000..3304e3f --- /dev/null +++ b/src/vif.jl @@ -0,0 +1,124 @@ +_rename_intercept(s) = strip(s) == "1" ? "(Intercept)" : s + +""" + termnames(model) + +Return the names associated with (fixed effects) terms in a model. + +For models with only continuous predictors, this is the same as +`coefnames(model)`. +For models with categorical predictors, the returned names reflect +the categorical predictor and not the coefficients resulting from +the choice of contrast coding. +""" +termnames(model) = _rename_intercept.(string.(_terms(model))) + +_terms(model) = collect(formula(model).rhs.terms) +_terms(model::MixedModel) = collect(formula(model).rhs[1].terms) + +""" + vif(m::RegressionModel) + +Compute the variance inflation factor (VIF). + +Returns a vector of inflation factors computed for each coefficient except +for the intercept. +In other words, the corresponding coefficients are `coefnames(m)[2:end]`. + +The [variance inflation factor (VIF)](https://en.wikipedia.org/wiki/Variance_inflation_factor) measures +the increase in the variance of a parameter's estimate in a model with multiple parameters relative to +the variance of a parameter's estimate in a model containing only that parameter. + +See also [`coefnames`](@ref), [`gvif`](@ref). + +!!! warning + This method will fail if there is (numerically) perfect multicollinearity, + i.e. rank deficiency (in the fixed effects). In that case though, the VIF + isn't particularly informative anyway. +""" +function vif(m::RegressionModel) + mm = Statistics.cov2cor!(vcov(m), stderror(m)) + all(==(1), view(modelmatrix(m), :, 1)) || + throw(ArgumentError("VIF only defined for models with an intercept term")) + mm = @view mm[2:end, 2:end] + size(mm, 2) > 1 || + throw(ArgumentError("VIF not meaningful for models with only one non-intercept term")) + # NB: The correlation matrix is positive definite and hence invertible + # unless there is perfect rank deficiency, hence the warning. + # NB: The determinate technique for GVIF could also be applied + # columnwise (instead of Term-wise) here, but it wouldn't really + # be any more efficient because this is a Symmetric matrix and computing + # all those determinants has its cost. The determinants also hint at + # how you could show equivalency, if you remember that inversion is solving + # a linear system and that Cramer's rule -- which uses determinants -- + # can also a linear system + # so we want diag(inv(mm)) but directly computing inverses is bad. + # that said, these are typically small-ish matrices and this is Simple. + return diag(inv(mm)) +end + +""" + gvif(m::RegressionModel; scale=false) + +Compute the generalized variance inflation factor (GVIF). + +If `scale=true`, then each GVIF is scaled by the degrees of freedom +for (number of coefficients associated with) the predictor: ``GVIF^(1 / (2*df))`` + +Returns a vector of inflation factors computed for each term except +for the intercept. +In other words, the corresponding coefficients are `termnames(m)[2:end]`. + +The [generalized variance inflation factor (VIF)](https://doi.org/10.2307/2290467) +measures the increase in the variance of a (group of) parameter's estimate in a model +with multiple parameters relative to the variance of a parameter's estimate in a +model containing only that parameter. For continuous, numerical predictors, the GVIF +is the same as the VIF, but for categorical predictors, the GVIF provides a single +number for the entire group of contrast-coded coefficients associated with a categorical +predictor. + +See also [`termnames`](@ref), [`vif`](@ref). + +!!! warning + This method will fail if there is (numerically) perfect multicollinearity, + i.e. rank deficiency (in the fixed effects). In that case though, the VIF + isn't particularly informative anyway. + +## References + +Fox, J., & Monette, G. (1992). Generalized Collinearity Diagnostics. +Journal of the American Statistical Association, 87(417), 178. doi:10.2307/2290467 +""" +function gvif(m::RegressionModel; scale=false) + mm = StatsBase.cov2cor!(vcov(m), stderror(m)) + + StatsModels.hasintercept(formula(m)) || + throw(ArgumentError("GVIF only defined for models with an intercept term")) + mm = @view mm[2:end, 2:end] + size(mm, 2) > 1 || + throw(ArgumentError("GVIF not meaningful for models with only one non-intercept term")) + + tn = @view termnames(m)[2:end] + trms = @view _terms(m)[2:end] + + df = width.(trms) + vals = zeros(eltype(mm), length(tn)) + # benchmarking shows thad adding in Symmetric() here before det() + # actually slows things down a bit + detmm = det(mm) + acc = 0 + for idx in axes(vals, 1) + wt = df[idx] + trm_only = [acc < i <= (acc + wt) for i in axes(mm, 2)] + trm_excl = .!trm_only + vals[idx] = det(view(mm, trm_only, trm_only)) * + det(view(mm, trm_excl, trm_excl)) / + detmm + acc += wt + end + + if scale + vals .= vals .^ (1 ./ (2 .* df)) + end + return vals +end diff --git a/test/Project.toml b/test/Project.toml index e4f8436..6fe61e0 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -1,5 +1,9 @@ [deps] DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" +GLM = "38e38edf-8417-5370-95a0-9cbb8c7f171a" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" MixedModels = "ff71e718-51f3-5ec2-a782-8ffcbfa3c316" +RDatasets = "ce6b1742-4840-55fa-b093-852dadbb1d8b" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/test/runtests.jl b/test/runtests.jl index 1abbffe..b4fa67b 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -8,3 +8,7 @@ end @testset "r2" begin include("r2.jl") end + +@testset "VIF" begin + include("vif.jl") +end diff --git a/test/vif.jl b/test/vif.jl new file mode 100644 index 0000000..1df3f65 --- /dev/null +++ b/test/vif.jl @@ -0,0 +1,61 @@ +using GLM +using LinearAlgebra +using MixedModels +using MixedModelsExtras +using StatsBase +using Test + +using MixedModels: dataset +using RDatasets: dataset as rdataset + +progress = false + +@testset "LMM" begin + fm0 = fit(MixedModel, @formula(reaction ~ 0 + days + (1 | subj)), dataset(:sleepstudy); + progress) + fm1 = fit(MixedModel, @formula(reaction ~ 1 + days + (1 | subj)), dataset(:sleepstudy); + progress) + fm2 = fit(MixedModel, @formula(reaction ~ 1 + days * days^2 + (1 | subj)), + dataset(:sleepstudy); progress) + + ae_int = ArgumentError("VIF only defined for models with an intercept term") + ae_nterm = ArgumentError("VIF not meaningful for models with only one non-intercept term") + @test_throws ae_int vif(fm0) + @test_throws ae_nterm vif(fm1) + + mm = @view cor(modelmatrix(fm2))[2:end, 2:end] + # this is a slightly different way to do the same + # computation. I've verified that doing it this way + # in R gives the same answers as car::vif + # note that computing the same model from scratch in R + # gives different VIFs ([70.41934, 439.10096, 184.00107]), + # but that model is a slightly different fit than the Julia + # one and that has knock-on effects + @test isapprox(vif(fm2), diag(inv(mm))) + + # since these are all continuous preds, should be the same + # but uses a very different computational method! + @test isapprox(vif(fm2), gvif(fm2)) + # the scale factor is gvif^1/(2df) + # so just sqrt.(vif) when everything is continuous + @test isapprox(gvif(fm2; scale=true), + sqrt.(gvif(fm2))) +end + +@testset "GVIF and RegrssionModel" begin + duncan = rdataset("car", "Duncan") + + lm1 = lm(@formula(Prestige ~ 1 + Income + Education), duncan) + @test termnames(lm1) == coefnames(lm1) + vif_lm1 = vif(lm1) + + # values here taken from car + @test isapprox(vif_lm1, [2.1049, 2.1049]; atol=1e-5) + @test isapprox(vif_lm1, gvif(lm1)) + + lm2 = lm(@formula(Prestige ~ 1 + Income + Education + Type), duncan) + @test termnames(lm2) == ["(Intercept)", "Income", "Education", "Type"] + @test isapprox(gvif(lm2), [2.209178, 5.297584, 5.098592]; atol=1e-5) + @test isapprox(gvif(lm2; scale=true), + [1.486330, 2.301648, 1.502666]; atol=1e-5) +end