# Lecture 15

We want to estimate a low-rank affinity matrix. To do so, we run proximal gradient descent with nuclear-norm regularization, which finds the low-rank affinity matrix that best fits the matching observed in the data.

In [27]:
#' Estimate an affinity matrix by proximal gradient descent.
#'
#' Each outer iteration (1) solves for the matching `pi_A` implied by the
#' current `A` with an IPFP loop under uniform margins, (2) takes one gradient
#' step of size `t_k` on `A`, and (3) when `lambda > 0`, soft-thresholds the
#' singular values of `A` by `lambda * t_k`. Iteration stops once the reported
#' objective value changes by less than 1e-06 between consecutive iterations.
#'
#' @param Xvals Numeric matrix (or object coercible with `as.matrix`) with one
#'   row per observation and `dX` columns.
#' @param Yvals Numeric matrix with the same number of rows as `Xvals` and
#'   `dY` columns. Row `i` of `Yvals` is treated as the observed partner of
#'   row `i` of `Xvals` (the target matching is `diag(n) / n`).
#' @param sigma Scale dividing the surplus inside the IPFP exponentials and
#'   multiplying the `pi * log(pi)` term of the objective.
#' @param lambda Singular-value shrinkage weight; with `lambda <= 0` no
#'   thresholding is applied and each iteration is a plain gradient step.
#' @param tolIpfp Convergence tolerance of the inner IPFP loop. Defaults to a
#'   global variable of the same name when one exists (the function used to
#'   read it from the global environment), otherwise 1e-12.
#' @param maxiterIpfp Maximum number of inner IPFP iterations; reaching it
#'   raises an error. Defaults to a global variable of the same name when one
#'   exists, otherwise 1000.
#' @return A list with `A` (the `dX` x `dY` estimate) and `val` (the objective
#'   value at the last iteration).
affinity <- function(Xvals, Yvals, sigma = 1, lambda = 1,
                     tolIpfp = get0("tolIpfp", envir = globalenv(),
                                    mode = "numeric", ifnotfound = 1e-12),
                     maxiterIpfp = get0("maxiterIpfp", envir = globalenv(),
                                        mode = "numeric", ifnotfound = 1000)) {
    Xvals <- as.matrix(Xvals)
    Yvals <- as.matrix(Yvals)

    # Validate before doing any real work.
    n <- nrow(Xvals)
    if (n != nrow(Yvals)) {
        stop("Dimensions of Xvals and Yvals do not match.")
    }
    dX <- ncol(Xvals)
    dY <- ncol(Yvals)

    # Uniform margins on both sides.
    p <- rep(1 / n, n)
    q <- rep(1 / n, n)
    IX <- rep(1, n)
    tIY <- matrix(rep(1, n), nrow = 1)
    f <- p %*% tIY      # n x n, f[i, j] = p[i]
    g <- IX %*% t(q)    # n x n, g[i, j] = q[j]
    pihat <- diag(n) / n
    v <- rep(0, n)      # carried across outer iterations as a warm start

    A <- rep(0, dX * dY)  # column-major vectorisation of the dX x dY matrix
    t_k <- 0.3  # step size for the prox grad algorithm (or grad descent when lambda=0)

    iterCount <- 0
    theval <- NA_real_
    theval_old <- NA_real_

    repeat {
        # Compute pi_A
        Phi <- Xvals %*% matrix(A, nrow = dX) %*% t(Yvals)
        iterIpfp <- 0
        repeat {
            iterIpfp <- iterIpfp + 1
            u <- sigma * log(rowSums(g * exp((Phi - IX %*% t(v)) / sigma)))
            vnext <- sigma * log(colSums(f * exp((Phi - u %*% tIY) / sigma)))
            error <- max(abs(rowSums(
                g * exp((Phi - IX %*% t(vnext) - u %*% tIY) / sigma)
            ) - 1))
            v <- vnext
            if (error < tolIpfp || iterIpfp >= maxiterIpfp) {
                break
            }
        }

        # Named pi_A rather than `pi` so the built-in constant is not shadowed.
        pi_A <- f * g * exp((Phi - IX %*% t(v) - u %*% tIY) / sigma)

        if (iterIpfp >= maxiterIpfp) {
            stop("maximum number of iterations reached")
        }

        # do prox grad descent
        # X' (pi_A - pihat) Y, vectorised column-major. This equals
        # kronecker(t(Yvals), t(Xvals)) %*% c(pi_A - pihat) without
        # materialising the (dX * dY) x n^2 Kronecker matrix.
        thegrad <- c(crossprod(Xvals, (pi_A - pihat) %*% Yvals))

        # take one gradient step
        A <- A - t_k * thegrad

        penalty <- 0
        if (lambda > 0) {
            # compute the proximal operator: soft-threshold the singular values
            SVD <- svd(matrix(A, nrow = dX))
            D <- pmax(SVD$d - lambda * t_k, 0)
            # diag(D, nrow = length(D)) keeps this correct when there is a
            # single singular value; diag(<scalar>) would not build a 1 x 1
            # matrix holding that value.
            A <- c(SVD$u %*% diag(D, nrow = length(D)) %*% t(SVD$v))
            penalty <- lambda * sum(D)
        }  # if lambda = 0 then we are just taking one step of gradient descent

        ### testing optimality
        # Note: thegrad here is the gradient evaluated before this iteration's
        # update of A.
        if (iterCount %% 10 == 0) {
            alpha <- 1
            tmp <- svd(matrix(A - alpha * thegrad, nrow = dX))
            shrunk <- pmax(tmp$d - alpha * lambda, 0)
            tmp_second <- sum(
                (A - c(tmp$u %*% diag(shrunk, nrow = length(shrunk)) %*%
                    t(tmp$v)))^2
            )
            cat("testing optimality ", tmp_second, "\n")
        }

        # Treat 0 * log(0) as 0 so an underflowed cell cannot turn the
        # objective (and hence the stopping test) into NaN.
        positive <- pi_A > 0
        entropy_term <- sum(pi_A[positive] * log(pi_A[positive]))
        theval <- sum(thegrad * A) - sigma * entropy_term + penalty

        iterCount <- iterCount + 1

        if (iterCount > 1 && abs(theval - theval_old) < 1e-06) {
            break
        }
        theval_old <- theval
    }
    list(A = matrix(A, nrow = dX), val = theval)
}

We compute the estimate for a fixed value of $\lambda$. In practice, $\lambda$ could be chosen by cross-validation to obtain the desired degree of rank reduction.

In [37]:
# Load the data set.
mydata <- read.csv("DGS_low_rank_april16.csv")

# Alternative, larger column selection (kept for reference):
# Xvals <- mydata[, c(1:22, 45:48)]
# Yvals <- mydata[, c(23:44, 56:59)]
Xvals <- mydata[, 45:48]
Yvals <- mydata[, 56:59]

# IPFP settings; affinity() looks these two names up in the global environment.
tolIpfp <- 1e-12
maxiterIpfp <- 1000

seed <- 777
set.seed(seed)

# Standardize: per-column mean and standard deviation of each side.
meanX <- apply(Xvals, 2, mean)
meanY <- apply(Yvals, 2, mean)
sdX <- apply(Xvals, 2, sd)
sdY <- apply(Yvals, 2, sd)

# Center each column, then scale it; the results are numeric matrices.
Xvals <- sweep(sweep(as.matrix(Xvals), 2, meanX, "-"), 2, sdX, "/")
Yvals <- sweep(sweep(as.matrix(Yvals), 2, meanY, "-"), 2, sdY, "/")

# Fit the affinity matrix for a fixed shrinkage weight.
res <- affinity(Xvals, Yvals, sigma = 1, lambda = 0.2)

testing optimality  0.09042313 
testing optimality  0.0001098681 


In [40]:
# Pull the estimate and the final objective value out of the fit, then
# display the estimated matrix.
A <- res[["A"]]
val <- res[["val"]]
A

0,1,2,3
0.259367257,0.019745466,-0.06198573,0.002713854
0.029096742,0.002923766,-0.0116027,0.003663925
-0.054458621,-0.007945686,0.03794256,-0.018583367
-0.009374241,0.003116353,-0.02288552,0.018058736


In [39]:
# Rank of the estimated matrix as reported by its QR decomposition.
qr(A)[["rank"]]