In [1]:
using Pkg
Pkg.activate(joinpath(pwd(),".."))
using Random, Distributions, PrettyTables, SCS, Plots, LinearAlgebra, Convex
using Base.Threads: @threads, @spawn
Random.seed!(1234)

[32m[1m  Activating[22m[39m project at `~/Desktop/MachineLearning`


TaskLocalRNG()

# (a)

In [16]:
function data(n, k, ρ, σ, β₁, β₂)
    #Set covariance matrix
    Σ = ρ .* ones(k,k)
    Σ[diagind(Σ)] .= 1
    μ = zeros(k)
    mvnormal = MvNormal(μ,Σ)
    X = rand(mvnormal, n)'

    normal = Normal(0, σ^2)
    U = rand(normal, n)
    x1 = X[:,1]
    x2 = X[:,2]
    Y = 1 .+ β₁ .* x1 + β₂ .* x2 .+ U

    return( Y = Y, X = X)
end


data (generic function with 1 method)

# (b)

In [17]:
MOI = Convex.MOI
opt = MOI.OptimizerWithAttributes(SCS.Optimizer, "eps_abs"=>1.0e-08, "eps_rel"=>1.0e-08, MOI.Silent() => true)

MathOptInterface.OptimizerWithAttributes(SCS.Optimizer, Pair{MathOptInterface.AbstractOptimizerAttribute, Any}[MathOptInterface.RawOptimizerAttribute("eps_abs") => 1.0e-8, MathOptInterface.RawOptimizerAttribute("eps_rel") => 1.0e-8, MathOptInterface.Silent() => true])

In [21]:
function lasso(Y, X, λ)
    n = length(Y)
    X = [X ones(n,1)]
    k = size(X, 2)

    b = Variable(k)
    Q = X'X / 2n
    c = X'Y / 2n                    #c'b = Y'X*b

    L1 = quadform(b, Q)            #b'Q*b
    L2 = dot(c, b)                 #c'b
    L3 = norm(b[1:k-1], 1)                #sum(|b|)
    

    Q = minimize(L1 - 2 * L2 + λ * L3) 
    solve!(Q, opt,verbose=false)

    return b = round.(vec(evaluate(b)),digits=5)
end


lasso (generic function with 1 method)

# (c)

In [25]:
function MC(ρ, σ, β₁, β₂, R, n, k)
    λ = 1.1 * σ * sqrt(2 * log(n*k)/n)
    p1 = 0
    p2 = 0
    p3 = 0
    @threads for i in 1:R
        (Y, X) = data(n, k, ρ, σ, β₁, β₂)
        b = lasso(Y, X, λ)
        p1 += (b[1] != 0)/R
        p2 += (b[2] != 0)/R
        p3 += (norm(b[3:k]) != 0)/R
    end
    return (p1 = round(p1, digits = 5), p2 = round(p2, digits = 5), p3 = round(p3, digits = 5))
end

MC (generic function with 1 method)

# (e) - (g)

In [22]:
(p11, p12, p13) = MC(0.2, 2, 2, 2, 1000, 300, 50)
(p21, p22, p23) = MC(0.2, 0.1, 0.1, 0.1, 1000, 300, 50)
(p31, p32, p33) = MC(0.2, 2, 2, 0.2, 1000, 300, 50)
(p41, p42, p43) = MC(0.9, 2, 2, 2, 1000, 300, 50)


result = ["Group 1" p11 p12 p13; "Group 2" p21 p22 p23; "Group 3" p31 p32 p33; "Group 4" p41 p42 p43]
header = [" ", "Select Xi,1", "Select Xi,2", "Select Irrelevant Regressors"]
pretty_table(result; header = header)

┌─────────┬─────────────┬─────────────┬──────────────────────────────┐
│[1m         [0m│[1m Select Xi,1 [0m│[1m Select Xi,2 [0m│[1m Select Irrelevant Regressors [0m│
├─────────┼─────────────┼─────────────┼──────────────────────────────┤
│ Group 1 │         1.0 │         1.0 │                        0.797 │
│ Group 2 │         1.0 │         1.0 │                          0.0 │
│ Group 3 │         1.0 │       0.117 │                        0.672 │
│ Group 4 │       0.927 │       0.933 │                        0.991 │
└─────────┴─────────────┴─────────────┴──────────────────────────────┘


In part (c), Lasso doesn't correctly select true model. It frequently include irrelevant regressors.

Compare with result from part (d), both procedure selected $X_{i,1}$ and $X_{i,2}$ correctly, but in part (d), no irrelevant regressors were selected. I guess it's because $\sigma^2 = EU_i^2$ is large in part (c), so the model overfitted to reduce the creterion function, since a part of it can be interpreted as $\frac{1}{2n} \sum \hat{U}_i^2$

From part (c) to part (e), we make the effect of $X_{i,2}$ very small, therefore, it can be expected that the probability of selecting $X_{i,2}$ will be significantly lowered since its effect can be too small to be identified. Moreover, the fact that the probability of selecting $X_{i,2}$ or irrelevant regressors in part (e) is roughly the same as probability of selecting irrelevant regressors in part (c) is also an evidence that the effect of $X_{i,2}$ is not identified.

From part (c) to part (f), we increased the covariance between X's. As the X's are more correlated, part of $X_{i,1}$ and $X_{i,2}$'s effects can be misallocated to other X's. We expect the probability of including irrelevant regressors will be higher, and it of selecting $X_{i,1}$ and $X_{i,2}$ correctly will be lower. The result is the same as our expection.