In [1]:
from typing import Tuple, List, Union, Any, Optional, Dict, Literal, Callable
import time

from tqdm import tqdm
import numpy as np
import torch
import torch.nn as nn
from torch import Tensor, tensor
from scipy.optimize import minimize

from models.sandwiched_least_squares import sandwiched_LS_scalar, sandwiched_LS_diag, sandwiched_LS_dense

# Scalar case

Let $R \in R^{n \times d}, W\in R^{D\times d}, \Delta \in R,$ and $X \in R^{n \times D}$. Let $\lambda > 0$. Then the minimum of 
\begin{align*}
    J(\Delta) = \frac{1}{n} \sum_{i=1}^n \big\| R_i - W^\top\Delta X_i \big\|^2 + \lambda \Delta^2
\end{align*}
is uniquely attained by
\begin{align*}
    \Delta_{\textnormal{scalar}} 
    &= \frac{\langle R, XW^\top\rangle_F}{\|X W\|_F^2 + n\lambda} 
    = \frac{\frac{1}{n}\sum_{i=1}^n \langle W^\top X_i,  R_i\rangle}{\frac{1}{n}\sum_{i=1}^n \|W^\top X_i\|^2 + \lambda }.
\end{align*}

In [2]:
# Parameters
d = 200
D = 300
N = 100
l2_reg = 0.01

# Create dummy data
np.random.seed(0)
r = np.random.randn(N, d)+2
W = np.random.randn(D, d)/100
x = np.random.randn(N, D)-1

def J(Delta):
    Wx = W.T @ x.T
    residual = r - Wx.T * Delta
    return np.mean(np.linalg.norm(residual, axis=1)**2) + l2_reg * Delta**2

def J_byhand(Delta):
    res = 0
    for i in range(N):
        res += np.linalg.norm(r[i] - W.T @ x[i] * Delta)**2 / N
    return res + l2_reg * Delta**2

In [3]:
# Closed form solution
Delta_closed_form = sandwiched_LS_scalar(tensor(r), tensor(W), tensor(x), l2_reg).numpy()

print(f"Closed form solution for Delta: {Delta_closed_form}")
print(f"Objective value for closed form solution: {J(Delta_closed_form)}")
print(f"Objective value for closed form solution (by hand): {J_byhand(Delta_closed_form)}")

Closed form solution for Delta: 0.17964255108434818
Objective value for closed form solution: 992.8392404324366
Objective value for closed form solution (by hand): 992.8392404324366


In [4]:
# Gradient descent solution using scipy.optimiz
result = minimize(J, np.random.randn(), method='BFGS')
Delta = result.x[0]
print(f"Gradient descent solution for Delta using scipy.optimize: {Delta}")
print(f"Objective value for gradient descent solution: {J(Delta)}")
print(f"Objective value for closed form solution (by hand): {J_byhand(Delta)}")

Gradient descent solution for Delta using scipy.optimize: 0.17964211768408475
Objective value for gradient descent solution: 992.8392404324388
Objective value for closed form solution (by hand): 992.8392404324385


# Diagonal Case

Let $R \in R^{n \times d}, W\in R^{D\times d}, \Delta = \textnormal{diag}(\delta_1, ..., \delta_D) \in R^{D \times D},$ and $X \in R^{n \times D}$. Let $\lambda > 0$. Then the minimum of 
\begin{align*}
    J(\Delta) = \frac{1}{n} \sum_{i=1}^n \big\| R_i - W^\top\Delta X_i \big\|^2 + \lambda \Delta^2
\end{align*}
is uniquely attained by the solution to the system of linear equations
\begin{align*}
    b = (A+ \lambda I)\Delta
\end{align*}
where
\begin{align*}
    A = W W^\top \odot X^\top X,  \qquad \qquad b = \textnormal{diag}(W^\top R^\top X).
\end{align*}


In [5]:
# Parameters
d = 30
D = 20
N = 1000
l2_reg = 10

# Create dummy data
np.random.seed(0)
r = np.random.randn(N, d)+2
W = np.random.randn(D, d)/100
x = np.random.randn(N, D)-1

def A_byhand():
    A = np.zeros((D, D))
    for k in range(D):
        for j in range(D):
            A[k, j] = np.mean([ x[i, k] * x[i, j] * np.dot(W[k], W[j]) for i in range(N)])
    return A


def A():
    return (W @ W.T) * (x.T @ x) / N

def b_byhand():
    b = np.zeros(D)
    for k in range(D):
        b[k] = np.mean([ x[i, k] * np.dot(W[k], r[i]) for i in range(N)])
    return b


def b():
    return np.mean( (r @ W.T) * x, axis=0)
    #return np.diag(W @ r.T @ x) / N
    #return np.einsum('nd,kd,nk->k', r, W, x) / N

In [6]:
(A() - A_byhand()).mean()

1.5242357885841135e-20

In [7]:
(b() - b_byhand()).mean()

1.5959455978986624e-17

In [8]:
def J(Delta):
    return np.mean(np.linalg.norm(r - x @ np.diag(Delta) @ W, axis=1)**2) + l2_reg * np.linalg.norm(Delta)**2

def J_byhand(Delta):
    res = 0
    for i in range(N):
        res += np.linalg.norm(r[i] - W.T @ (Delta*x[i]))**2 / N
    return res + l2_reg * np.sum(Delta**2)

In [9]:
Delta_closed_form = sandwiched_LS_diag(tensor(r), tensor(W), tensor(x), l2_reg).numpy()
print(f"Closed form solution for Delta: {Delta_closed_form}")
print(f"Objective value for closed form solution: {J(Delta_closed_form)}")
print(f"Objective value for closed form solution (by hand): {J_byhand(Delta_closed_form)}")

Closed form solution for Delta: [-0.00118117  0.0088751  -0.00139431 -0.02298253 -0.01659739  0.00393831
  0.01807629  0.00768456  0.00326887 -0.00505116 -0.00754621  0.00641875
  0.00796784  0.00098712 -0.00994351  0.0028343   0.01790257 -0.00202378
  0.00719184  0.01089629]
Objective value for closed form solution: 149.01900449101015
Objective value for closed form solution (by hand): 149.01900449101026


In [10]:
# Gradient descent solution using scipy.optimiz
result = minimize(J, np.random.randn(D), method='BFGS')
Delta = result.x
print(f"Gradient descent using scipy.optimize: {Delta}")
print(f"Objective value for gradient descent solution: {J(Delta)}")
print(f"Objective value for closed form solution (by hand): {J_byhand(Delta)}")

Gradient descent using scipy.optimize: [-0.00118107  0.00887488 -0.00139438 -0.02298261 -0.01659761  0.00393813
  0.01807629  0.00768457  0.00326876 -0.00505104 -0.00754647  0.00641864
  0.00796751  0.00098706 -0.00994363  0.00283416  0.0179027  -0.00202393
  0.00719158  0.01089627]
Objective value for gradient descent solution: 149.01900449101524
Objective value for closed form solution (by hand): 149.0190044910153


# Dense Case

Let $R \in R^{n \times d}, W\in R^{D\times d}, \Delta = \in R^{D \times D},$ and $X \in R^{n \times D}$. Let $\lambda > 0$. Then the minimum of 
\begin{align*}
    J(\Delta) 
        &= \frac{1}{n} \sum_{i=1}^n \big\| r_i - W^\top \Delta x_i \big\|^2 + \sum_{k=1}^D\sum_{j=1}^p \lambda \Delta_{k,j}^2 \\
        &= \frac{1}{n}\| W^\top \Delta X^\top - R^\top\|^2_F + \lambda \|\Delta\|^2_F
\end{align*}
is uniquely obtained by solving the system of linear equations given by
\begin{align*} 
    W R^\top X    =  W W^\top \Delta X^\top X + \lambda n \Delta
\end{align*}
which can be solved by spectral decomposition $W W^\top = U \Lambda^W U^\top$  and $X^\top X = V \Lambda^X V^\top$
\begin{align*}
    \Delta_{\textnormal{dense}} = U \bigg[ U^\top W R^\top X V \oslash \bigg(\lambda N 1 + \textnormal{diag}(\Lambda^W) \otimes \textnormal{diag}(\Lambda^X)\bigg) \bigg] V^\top
\end{align*}
where $\oslash$ denotes element-wise division, $\otimes$ is the outer product, and $1$ is a matrix of ones.


In [11]:
# Parameters
d = 10
D = 50
p = 20 
N = 100
l2_reg = 0.1

# Create dummy data
np.random.seed(0)
r = np.random.randn(N, d)+2
W = np.random.randn(D, d)/100
x = np.random.randn(N, p)-1

def J(Delta):
    Delta = Delta.reshape(D, p)
    return 1/N * np.linalg.norm(W.T @ Delta @ x.T - r.T)**2 + l2_reg * np.linalg.norm(Delta)**2

def J_byhand(Delta):
    Delta = Delta.reshape(D, p)
    res = 0
    for i in range(N):
        res += 1/N * np.linalg.norm(r[i] - W.T @ Delta @ x[i])**2
    return res + l2_reg * np.linalg.norm(Delta)**2

In [17]:
Delta_closed_form = sandwiched_LS_dense(tensor(r), tensor(W), tensor(x), l2_reg).numpy()
print(f"Closed form solution for Delta: {Delta_closed_form}")
print(f"Objective value for closed form solution: {J(Delta_closed_form)}")
print(f"Objective value for closed form solution (by hand): {J_byhand(Delta_closed_form)}")

Closed form solution for Delta: [[-7.33169147e-02 -8.33842047e-02 -5.97394795e-02 -3.66594917e-02
  -5.17126707e-02 -2.70211400e-02 -7.24434676e-02 -5.56257519e-02
  -5.13040410e-02 -6.04864949e-03 -3.12478652e-02 -7.44127907e-02
  -3.43815253e-02 -8.27462312e-02 -1.41137318e-01 -6.69247797e-02
  -4.02906258e-02 -3.39438981e-02 -4.84381350e-02 -1.02109134e-01]
 [-1.96006795e-02  4.11938165e-02  7.05829857e-02  4.38026326e-02
   4.76512563e-02  2.01511283e-02  1.29624411e-02  2.75973615e-02
   4.75728515e-02  9.10428015e-02  6.15285294e-02 -2.40551025e-02
   7.50161859e-02 -1.02956899e-02 -6.52938179e-02  2.30374332e-02
   3.65827516e-02  8.15681114e-03 -1.05987205e-02 -2.49662135e-02]
 [ 2.11697927e-01  1.11459416e-01  1.47573336e-01  1.93804869e-01
   1.79079940e-01  2.33155193e-01  1.93890409e-01  1.07463453e-01
   1.69383160e-01  1.46881923e-01  1.91276255e-01  1.42398917e-01
   2.10849670e-01  1.16469985e-01  1.20832289e-01  1.60584050e-01
   1.89172966e-01  2.24425214e-01  1.61770

In [18]:
# Gradient descent solution using scipy.optimiz
result = minimize(J, np.random.randn(p*D), method='L-BFGS-B')
Delta = result.x.reshape(p, D)
print(f"Gradient descent using scipy.optimize: {Delta}")
print(f"Objective value for gradient descent solution: {J(Delta)}")
print(f"Objective value for closed form solution (by hand): {J_byhand(Delta)}")

Gradient descent using scipy.optimize: [[-7.33266660e-02 -8.33894981e-02 -5.97389374e-02 -3.66640117e-02
  -5.17180998e-02 -2.70321063e-02 -7.24451549e-02 -5.56272239e-02
  -5.13080735e-02 -6.04292808e-03 -3.12474644e-02 -7.44105695e-02
  -3.43836589e-02 -8.27518225e-02 -1.41141041e-01 -6.69274486e-02
  -4.02921597e-02 -3.39452908e-02 -4.84413428e-02 -1.02113623e-01
  -1.96035004e-02  4.11926253e-02  7.05765185e-02  4.38042549e-02
   4.76488810e-02  2.01342443e-02  1.29563170e-02  2.75940338e-02
   4.75658869e-02  9.10382753e-02  6.15349316e-02 -2.40465414e-02
   7.50156336e-02 -1.02991879e-02 -6.52989452e-02  2.30410577e-02
   3.65795718e-02  8.15614916e-03 -1.06073527e-02 -2.49616459e-02
   2.11690498e-01  1.11459095e-01  1.47578536e-01  1.93803821e-01
   1.79083484e-01  2.33156430e-01  1.93893306e-01  1.07476012e-01
   1.69391898e-01  1.46891958e-01]
 [ 1.91276205e-01  1.42401474e-01  2.10855907e-01  1.16484090e-01
   1.20836365e-01  1.60587383e-01  1.89171864e-01  2.24418742e-01
  