# Google Colab, PyTorch, KeOps

by Flavien Léger.

Math+econ+code, Feb 5 2021

References and resources:
* PyTorch: https://pytorch.org/docs/stable/index.html
* KeOps: https://www.kernel-operations.io/keops/index.html
* Jean Feydy's PhD thesis: https://www.jeanfeydy.com/geometric_data_analysis.pdf

# 0. Imports (run once)

In [None]:
from time import time
import random

import numpy as np
import torch
import matplotlib.pyplot as plt

!pip install pykeops[full] > install.log
from pykeops.torch import LazyTensor


# 1. PyTorch and tensors

## a) Basic operations

PyTorch works with **tensors**, i.e. multidimensional arrays (a generalization of matrices).

One-dimensional tensor: $x_i$ for $i=1,\dots,n$.

Two-dimensional tensor: $x_{ij}$ for $i=1,\dots,n, j=1,\dots,m$.

Three-dimensional tensor: $x_{ijk}$ for $i=1,\dots,n$, $j=1,\dots,m$, $k=1,\dots,\ell$.

etc

In [None]:
# Some basic commands
# Each assignment below overwrites x; only the last tensor (from torch.randn)
# is the one that gets printed.

x = torch.zeros(4, 5)   # or: x = torch.zeros( (4, 5) )
x = torch.ones(3, 2)
x = torch.linspace(0, 1, 10)
x = torch.rand(3, 1, 5)
x = torch.randn(2, 2, 3)
print(f'x = {x}')
print(f'x size: {x.size()}\n')    # or x.shape like in NumPy

# Construct a tensor from data
# NOTE: torch.Tensor(...) builds a tensor of the default floating-point dtype,
# so the integers in the list are stored as floats.
y = torch.Tensor([-4,20,1])
print(f'y = {y}')
print(f'y size: {y.size()}\n')

## b) Reduction operations
Given a 2d tensor $A_{ij}$ we might want to
 - take a sum $x_j = \sum_i A_{ij}$
 - take a maximum $y_i = \max_jA_{ij}$ 
 

In [None]:
# Fix the global PyTorch seed so the random matrix below is the same on every run.
torch.manual_seed(20210204)

# Sum
A = torch.rand(7, 2)           # (7, 2) tensor
x = A.sum(dim=0)                # one-dimensional tensor of size 2 

print(f'A = {A}\n')
print(f'x = {x}\n')

# Min or max
# max(dim=1) reduces over the second index and returns a pair:
# y holds the row-wise maxima and T the column index where each maximum is
# attained (both one-dimensional, one entry per row of A).
y, T = A.max(dim=1)
print(f'y = {y}')
print(f'T = {T}')

## c) Broadcasting

Imagine you have two 1d tensors (i.e. vectors) $x_i$ and $y_j$ ($i=1,\dots,n$, $j=1,\dots,m$), and you want to compute the 2d tensors
$$
S_{ij}=x_i+y_j
$$
or
$$
P_{ij} = x_i y_j.
$$

We can add a dummy dimension to $x$ and $y$ and then *broadcast* the sum or multiplication.

See: http://scipy.github.io/old-wiki/pages/EricsBroadcastingDoc

In [None]:
x = torch.arange(0, 3)  # 1d tensor of length 3
y = torch.arange(0, 5)  # 1d tensor of length 5

# Indexing with None inserts a dummy dimension of size 1.
x = x[:, None]  # x is now a 2d tensor of shape (3,1)
y = y[None, :]  # y is now a 2d tensor of shape (1,5)
# The size-1 dimensions are broadcast against each other, so S has shape (3,5)
# with S[i, j] = x[i] + y[j].
S = x + y

print(f'x = {x}\n')
print(f'y = {y}\n')
print(f'S = {S}')

## d) Example 1: compute a $\Phi$-transform



In [None]:
def plot_pquv(x, y, u, v, max_points=1000):
  """Scatter-plot the two point clouds side by side, coloured by u and v.

  Args:
    x: tensor whose rows are points; columns 0 and 1 are plotted.
    y: tensor whose rows are points; columns 0 and 1 are plotted.
    u: per-point values used as colours for the rows of x.
    v: per-point values used as colours for the rows of y.
    max_points: if a cloud has more rows than this, a random subsample of
      this many rows is drawn instead of the whole cloud.
  """
  n_x, n_y = x.size(0), y.size(0)

  # CUDA tensors are copied to the CPU before being handed to Matplotlib.
  x, y, u, v = (t.cpu() if t.device.type == 'cuda' else t for t in (x, y, u, v))

  # Subsample large clouds (x first, then y).
  rows_x = range(n_x) if n_x <= max_points else random.sample(range(n_x), max_points)
  rows_y = range(n_y) if n_y <= max_points else random.sample(range(n_y), max_points)

  plt.rcParams['figure.figsize'] = [12, 8]
  _, axes = plt.subplots(1, 2, sharex=True, sharey=True)
  plt.setp(axes.flat, aspect=1.0, adjustable='box')

  panels = (
    (axes[0], x, rows_x, u, 'p and u'),
    (axes[1], y, rows_y, v, 'q and v'),
  )
  for panel, pts, rows, colours, title in panels:
    panel.scatter(pts[rows, 0], pts[rows, 1], c=colours[rows])
    panel.set_title(title)



def plot_pqT(x, y, T, max_points=100, u=None, v=None):
  """Plot a random sample of the matching T between the clouds x and y.

  Four panels are drawn: the sampled x points, their matched y points,
  arrows from each sampled x_i to y_{T_i} (targets coloured by v), and the
  sampled x points coloured by u.

  Args:
    x: tensor whose rows are points; columns 0 and 1 are plotted.
    y: tensor whose rows are points; columns 0 and 1 are plotted.
    T: integer tensor; T[i] is the row of y matched to row i of x.
    max_points: if x has more rows than this, a random subsample of this
      many rows is drawn.
    u: per-point values used to colour the rows of x in the last panel.
      If omitted, the module-level variable named `u` is used.
    v: per-point values used to colour the rows of y in the third panel.
      If omitted, the module-level variable named `v` is used.
  """
  # Earlier versions read u and v implicitly from the enclosing (notebook)
  # namespace; keep that as a fallback so existing calls still work.
  if u is None:
    u = globals()['u']
  if v is None:
    v = globals()['v']

  # Matplotlib needs host memory: bring any CUDA tensors back to the CPU.
  x, y, T, u, v = (
    t.cpu() if torch.is_tensor(t) and t.is_cuda else t
    for t in (x, y, T, u, v)
  )

  N = x.size(0)

  idx1 = random.sample(range(N),max_points) if N > max_points else range(N)
  idx2 = T[idx1]   # rows of y matched to the sampled rows of x

  plt.rcParams['figure.figsize'] = [24, 8]
  fig, ax = plt.subplots(1, 4, sharex=True, sharey=True)
  plt.setp(ax.flat, aspect=1.0, adjustable='box')

  ax[0].scatter(x[idx1,0], x[idx1,1], c='C0')
  ax[0].set_title('p')

  ax[1].scatter(y[idx2,0], y[idx2,1], c='C1')
  ax[1].set_title('q')

  # Arrows in data coordinates, from x_i to its match y_{T_i}.
  ax[2].quiver(x[idx1,0], x[idx1,1], y[idx2, 0]-x[idx1,0], y[idx2,1]-x[idx1,1], angles='xy', scale_units='xy', scale=1, alpha=0.2)
  ax[2].scatter(x[idx1,0], x[idx1,1], alpha=0.2)
  ax[2].scatter(y[idx2,0], y[idx2,1], c=v[idx2])
  ax[2].set_title('Matching T with prices $v_j$')

  ax[3].scatter(x[idx1,0], x[idx1,1], c=u[idx1])
  ax[3].set_title('Indirect utility $u_i$')



def init_twonormals(N, M, seed=20210204):
  """Draw two 2D Gaussian point clouds, reproducibly.

  Args:
    N: number of points in the first cloud.
    M: number of points in the second cloud.
    seed: seed for the random draw. The same seed always yields the same
      clouds. Pass None for a non-reproducible draw.

  Returns:
    (x, y): x is an (N, 2) tensor of standard normal samples; y is an
    (M, 2) tensor of standard normal samples divided by 3.
  """
  # A private generator honours `seed` without disturbing the global RNG state.
  gen = torch.Generator()
  if seed is None:
    gen.seed()
  else:
    gen.manual_seed(seed)

  x = torch.randn((N, 2), generator=gen)    # (N,2)
  y = torch.randn((M, 2), generator=gen)    # (M,2)
  y /= 3
  return x, y



Assume $(x_i)\in \mathbb{R}^{n\times 2}$ and $(y_j)\in\mathbb{R}^{m\times 2}$ are two families of points in 2D. Compute the surplus matrix
$$
\Phi_{ij} = - \lVert x_i-y_j\rVert^2. 
$$


In [None]:
N, M = 10, 3
# N, M = 1000, 3000

x, y = init_twonormals(N, M)


# Dummy dimensions so that x_i - y_j broadcasts to shape (N, M, 2).
x_i = x[:, None, :]   # (N, 1, 2) torch tensor
y_j = y[None, :, :]   # (1, M, 2) torch tensor
tic = time()
### BEGINNING OF YOUR CODE

# Summing over the last (coordinate) axis leaves one entry per pair (i, j).
Phi_ij = (-(x_i - y_j) ** 2).sum(dim=-1)   # (N, M) tensor of NEGATIVE squared distances -|x_i-y_j|^2

### END OF YOUR CODE
toc = time()
print(f'Surplus matrix CPU time: {toc-tic:.5f}s')


$$
\DeclareMathOperator*{\argmin}{argmin}
\DeclareMathOperator*{\argmax}{argmax}
$$
Let $v_j$ be the price at $y_j$. Compute the indirect utility
$$
u_i := \max_j \Phi_{ij}-v_j.
$$
Also retrieve the associated map
$$
T_i := \argmax_j \Phi_{ij}-v_j.
$$

In [None]:

# Prices: three times the first coordinate of each y_j.
v = 3*y[:,0]
v_j = v[None, :]      # (1, M) 



tic = time()

# Broadcasting and max reduction
### BEGINNING OF YOUR CODE

# Reducing over dim=1 (the j index) leaves one value per i:
# u holds the maxima and T the maximizing index j.
u, T = (Phi_ij - v_j).max(dim=1)   # each of shape (N, )

### END OF YOUR CODE
toc = time()
print(f'Φ-transform CPU time: {toc-tic:.5f}s\n')

plot_pqT(x, y, T)

## e) Example 2: compute a log-sum-exp

When doing regularized OT, the $\Phi$-transform is replaced by **log-sum-exp** operations

$$
u_i = \sigma\ln\Big(\sum_j\exp\Big(\frac{\Phi_{ij}-v_j}{\sigma}\Big)\Big) - \sigma\ln p_i
$$

In [None]:
# Same as before ---------------------------------------------------------------
N, M = 10, 3
# N, M = 10_000, 10_000

x, y = init_twonormals(N, M)
x_i = x[:, None, :]   # (N, 1, 2) torch tensor
y_j = y[None, :, :]   # (1, M, 2) torch tensor
tic = time()

Phi_ij = (-(x_i - y_j) ** 2).sum(dim=-1)   # (N, M) tensor of NEGATIVE squared distances -|x_i-y_j|^2
toc = time()
print(f'Surplus matrix CPU time: {toc-tic:.5f}s')
# ------------------------------------------------------------------------------

v = 3*y[:,0]
v_j = v[None, :]      # (1, M) 

sigma = 0.1
# sigma = 0.001

tic = time()
# Numerically stabilized
### BEGINNING OF YOUR CODE

# logsumexp over dim=1 (the j index) leaves one value per i.
# The last term is -sigma*ln(p_i) with uniform weights p_i = 1/N.
u = sigma *((Phi_ij - v_j)/sigma).logsumexp(dim=1) - sigma*np.log(1/N)   # (N, )

### END OF YOUR CODE
toc = time()
print(f'Φ-transform CPU time: {toc-tic:.5f}s\n')

plot_pquv(x,y,u,v)

#### Comparing max and softmax

For a price vector $v$ denote
$$
u^{(0)}_i := \max_j \Phi_{ij}-v_j.
$$
and for $\sigma>0$
$$
u^{(\sigma)}_i = \sigma\ln\Big(\sum_j\exp\Big(\frac{\Phi_{ij}-v_j}{\sigma}\Big)\Big) - \sigma\ln p_i
$$


In [None]:

# Compare max and softmax
# Reuses Phi_ij, y and N from the previous cell.

v = 3*y[:,0]
v_j = v[None, :]      # (1, M) 

# Hard max over j: one value per i.
u_0, _ = (Phi_ij - v_j).max(dim=1)   # (N, )

# Soft max (log-sum-exp) with a small temperature and uniform weights p_i = 1/N.
sigma = 0.001
u_sigma = sigma *((Phi_ij - v_j)/sigma).logsumexp(dim=1) - sigma*np.log(1/N)   # (N, )

plt.plot(u_sigma-u_0)
# Raw string: '\s' is not a valid Python escape sequence, and the backslash
# must reach Matplotlib's math renderer unchanged.
plt.title(r'Difference $u^{(\sigma)}-u^{(0)}$');

# 2. PyTorch and GPUs

In [None]:

N, M = 1_000, 1_000
# N, M = 100_000, 10_000  # comment CPU code

x, y = init_twonormals(N, M)
v = 3*y[:,0]

# CPU code ---------------------------------------------------------------------
x_i = x[:, None, :]   # (N, 1, 2) torch tensor
y_j = y[None, :, :]   # (1, M, 2) torch tensor
v_j = v[None, :]     # (1, M) 

tic = time()
Phi_ij = (-(x_i - y_j) ** 2).sum(dim=-1)   # (N, M) tensor of NEGATIVE squared distances -|x_i-y_j|^2
u, T = (Phi_ij - v_j).max(dim=1)   # each of shape (N, )
toc = time()
print(f'CPU time: {toc-tic:.5f}s')


# GPU code ---------------------------------------------------------------------
x_i = x[:, None, :].cuda()   # (N, 1, 2) torch tensor
y_j = y[None, :, :].cuda()   # (1, M, 2) torch tensor
v_j = v[None, :].cuda()      # (1, M) 

# CUDA kernels are launched asynchronously: without synchronizing, the timer
# would mostly measure kernel launch rather than the computation itself.
# Synchronize once before starting the clock (so transfers and CUDA start-up
# are excluded) and once before stopping it (so the work has finished).
torch.cuda.synchronize()
tic = time()
Phi_ij = (-(x_i - y_j) ** 2).sum(dim=-1)   # (N, M) tensor of NEGATIVE squared distances -|x_i-y_j|^2
u, T = (Phi_ij - v_j).max(dim=1)   # each of shape (N, )
torch.cuda.synchronize()
toc = time()
print(f'GPU time: {toc-tic:.5f}s')

## Example: IPFP


Recall the regularized OT problem
$$
\min_{u,v} \sum_j v_jq_j+\sum_iu_ip_i+\sigma\ln\Big(\sum_{ij}\exp\Big(\frac{\Phi_{ij}-u_i-v_j}{\sigma}\Big)\Big)=:F(u,v)
$$

In [None]:
def compute_ipfp(Phi_ij, sigma, numIters):
  """Run IPFP (alternating log-sum-exp updates) on a dense surplus matrix.

  Both marginals are taken to be uniform (p_i = 1/N, q_j = 1/M). Each
  iteration recomputes u from the current v, then v from the new u.

  Args:
    Phi_ij: (N, M) tensor of surpluses. The potentials are created on the
      same device as this tensor, so it may live on the CPU or on a GPU.
    sigma: regularization temperature (positive float).
    numIters: number of iterations; numIters + 1 updates are performed.

  Returns:
    (v, u, dual_values, KL_errors): the final potentials v (shape (M,)) and
    u (shape (N,)), and two NumPy arrays of length numIters + 1 holding,
    for each iteration, the printed value F and the error term
    (1/sigma) * mean(v - new_v).
  """
  N, M = Phi_ij.size()
  # Follow the device/dtype of the input instead of assuming CUDA.
  v = torch.zeros(M, device=Phi_ij.device, dtype=Phi_ij.dtype)
  u = torch.zeros(N, device=Phi_ij.device, dtype=Phi_ij.dtype)
  dual_values = np.empty(numIters+1)
  KL_errors = np.empty(numIters+1)

  # Loop invariants: sigma*ln(p_i) and sigma*ln(q_j) for uniform marginals.
  sigma_log_p = sigma*np.log(1/N)
  sigma_log_q = sigma*np.log(1/M)

  printBlock = numIters//20 if numIters>20 else 1

  for k in range(numIters+1):

    # u-update: soft maximum over j (dim=1).
    v_j = v[None, :]
    u = sigma * ( (Phi_ij - v_j) / sigma ).logsumexp(dim=1) - sigma_log_p

    # v-update: soft maximum over i (dim=0).
    u_i = u[:, None]
    new_v = sigma * ( (Phi_ij -u_i) / sigma ).logsumexp(dim=0) - sigma_log_q

    # .item() yields plain Python floats, so storing them in the NumPy
    # arrays does not depend on implicit tensor conversion (which is not
    # available for tensors that live on a GPU).
    dual_value = (v.sum()/M + u.sum()/N).item()
    kl_err = (1/sigma * (v-new_v).sum()/M).item()

    v = new_v

    dual_values[k] = dual_value
    KL_errors[k] = kl_err

    if k % printBlock == 0:
      print(f'Iter {k:4d}/{numIters},   F = {dual_value:6f},   KL err = {kl_err:6f}')

  return v, u, dual_values, KL_errors



In [None]:
N, M = 1_000, 1_000
# N, M = 100_000, 10_000

x, y = init_twonormals(N, M)
x_i = x[:, None, :].cuda()   # (N, 1, 2) torch tensor
y_j = y[None, :, :].cuda()   # (1, M, 2) torch tensor
Phi_ij = (-(x_i - y_j) ** 2).sum(dim=-1)   # (N, M) tensor of NEGATIVE squared distances -|x_i-y_j|^2
# ------------------------------------------------------------------------------


sigma = 1e-1

# Iteration budget scales like 1/sigma.
numIters = int(3/sigma)

tic = time()
v, u, dual_values, KL_errors = compute_ipfp(Phi_ij, sigma, numIters)
toc = time()
print(f'\nIPFP GPU time: {toc-tic:.5f}s')
plt.semilogy(KL_errors)
plt.xlabel('iteration count')
# Raw string: '\p' is not a valid Python escape sequence, and the backslash
# must reach Matplotlib's math renderer unchanged.
plt.title(r'Errors KL(q|$\pi_2$)');

# 3. KeOps

For a large scale computation

$$
u_i := \max_j \Phi_{ij}-v_j.
$$

the GPU runs out of memory when $NM \gtrsim 10^8$ since it cannot store $\Phi_{ij}$. But recall that 
$$\Phi_{ij}=-\|x_i-y_j\|^2,$$
so we shouldn't *have to* store it.

KeOps uses *lazy tensors* that can keep $\Phi$ as a *formula*. 

In [None]:

# PyTorch + KeOps

N, M = 100_000, 10_000
# N, M = 1_000_000, 1_000_000

x, y = init_twonormals(N, M)
# NOTE(review): this value of v is overwritten below before it is ever used.
v = 3*y[:,0]

# Wrapping the (N, 1, 2) and (1, M, 2) CUDA tensors as LazyTensors makes the
# expressions below symbolic: nothing of size N*M is stored.
x_i = LazyTensor(x[:, None, :].cuda())   # (N, 1, 2) lazy tensor
y_j = LazyTensor(y[None, :, :].cuda())   # (1, M, 2) lazy tensor

# # # # # # # # # 
Phi_ij = (-(x_i - y_j) ** 2).sum(dim=-1)   # (N, M) LAZY tensor of NEGATIVE squared distances -|x_i-y_j|^2
# # # # # # # # # 


# Prices actually used in this cell: minus the first coordinate of each y_j.
v = -y[:,0]
v_j = LazyTensor(v[:, None].cuda(), axis=1)      # (M, 1) data indexed by j, i.e. a lazy (1, M) row

# Make sure host-to-device transfers are finished before starting the clock.
torch.cuda.synchronize()
tic = time()

# Broadcasting and max reduction
# Reducing over dim=1 (the j index) leaves one max / argmax per x_i
# (presumably returned with shape (N, 1) - confirm against the KeOps docs).
u, T = (Phi_ij - v_j).max_argmax(dim=1)

# Wait for the GPU before stopping the clock, in case the reduction returns
# before the device work has completed.
torch.cuda.synchronize()
toc = time()
print(f'N = {N}, M = {M}')
print(f'Φ-transform GPU time: {toc-tic:.5f}s')


## Example: large-scale IPFP

In [None]:
def compute_lazy_ipfp(Phi_ij, sigma, numIters, device='cuda'):
  """Run IPFP (alternating log-sum-exp updates) on a KeOps LazyTensor surplus.

  Same iteration as the dense version, but Phi_ij is a symbolic (N, M)
  LazyTensor, so the full matrix is never stored. Both marginals are taken
  to be uniform (p_i = 1/N, q_j = 1/M).

  Args:
    Phi_ij: KeOps LazyTensor of surpluses; its sizes are read from the
      `ni` and `nj` attributes.
    sigma: regularization temperature (positive float).
    numIters: number of iterations; numIters + 1 updates are performed.
    device: device on which the potentials are allocated. It should match
      the device of the tensors Phi_ij was built from (default: 'cuda').

  Returns:
    (v, u, dual_values, KL_errors): the final potentials v (shape (M, 1))
    and u (as returned by the lazy reduction; presumably (N, 1)), and two
    NumPy arrays of length numIters + 1 holding, for each iteration, the
    printed value F and the error term (1/sigma) * mean(v - new_v).
  """
  N, M = Phi_ij.ni, Phi_ij.nj
  # Column vectors, so they can be wrapped as i- or j-indexed LazyTensors.
  v = torch.zeros(M, 1, device=device)
  u = torch.zeros(N, 1, device=device)
  dual_values = np.empty(numIters+1)
  KL_errors = np.empty(numIters+1)

  # Loop invariants: sigma*ln(p_i) and sigma*ln(q_j) for uniform marginals.
  sigma_log_p = sigma*np.log(1/N)
  sigma_log_q = sigma*np.log(1/M)

  printBlock = numIters//20 if numIters>20 else 1


  for k in range(numIters+1):

    # u-update: soft maximum over j.
    v_j = LazyTensor(v, axis=1)   # shape = (1, M)
    u = sigma * ( (Phi_ij - v_j) / sigma ).logsumexp(axis=1) - sigma_log_p

    # v-update: soft maximum over i.
    u_i = LazyTensor(u, axis=0)   # shape = (N, 1)
    new_v = sigma * ( (Phi_ij -u_i) / sigma ).logsumexp(axis=0) - sigma_log_q

    # .item() yields plain Python floats, so storing them in the NumPy
    # arrays does not depend on implicit conversion of GPU tensors.
    dual_value = (v.sum()/M + u.sum()/N).item()
    kl_err = (1/sigma * (v-new_v).sum()/M).item()

    v = new_v

    dual_values[k] = dual_value
    KL_errors[k] = kl_err

    if k % printBlock == 0:
      print(f'Iter {k:4d}/{numIters},   F = {dual_value:6f},   KL err = {kl_err:6f}')


  return v, u, dual_values, KL_errors

In [None]:
# N, M = 100_000, 10_000
N, M = 1_000_000, 100_000

x, y = init_twonormals(N, M)
x_i = LazyTensor(x[:, None, :].cuda())   # (N, 1, 2) lazy tensor
y_j = LazyTensor(y[None, :, :].cuda())   # (1, M, 2) lazy tensor
Phi_ij = (-(x_i - y_j) ** 2).sum(dim=-1)   # (N, M) LAZY tensor of NEGATIVE squared distances -|x_i-y_j|^2
# ------------------------------------------------------------------------------


sigma = 1e-1

# Iteration budget scales like 1/sigma.
numIters = int(3/sigma)

print(f'N = {N}, M = {M}\n')
tic = time()
v, u, dual_values, KL_errors = compute_lazy_ipfp(Phi_ij, sigma, numIters)
toc = time()

print(f'\nIPFP GPU time: {toc-tic:.5f}s')
plt.semilogy(KL_errors)
plt.xlabel('iteration count')
# Raw string: '\p' is not a valid Python escape sequence, and the backslash
# must reach Matplotlib's math renderer unchanged.
plt.title(r'Errors KL(q|$\pi_2$)');