## Autoencoder
Refs: [DL book](https://www.deeplearningbook.org/contents/autoencoders.html); [inspired by](https://gist.github.com/AFAgarap/4f8a8d8edf352271fa06d85ba0361f26).


In [1]:
import torch       
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F

from sklearn.decomposition import PCA
from scipy.stats import special_ortho_group
import numpy as np

# Print three decimals and no scientific notation in both PyTorch and NumPy.
torch.set_printoptions(precision=3, sci_mode=False)
np.set_printoptions(precision=3, suppress=True)

# Use the GPU when CUDA is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [4]:
# architecture
class shallow_AE(nn.Module):
    """Autoencoder with one linear encoder layer and one linear decoder layer.

    Keyword arguments:
        input_width: size of the input and of the reconstruction.
        hidden_width: size of the hidden code.
        bias: whether the two linear layers carry a bias term.
        linear: optional; when true the hidden code is passed on unchanged,
            otherwise (the default) a ReLU is applied to it.
    """

    def __init__(self, **kwargs):
        super().__init__()
        n_in = kwargs["input_width"]
        n_hidden = kwargs["hidden_width"]
        use_bias = kwargs["bias"]

        # Encoder is created first so the weight initialisation order is fixed.
        self.encoder_layer = nn.Linear(in_features=n_in, out_features=n_hidden, bias=use_bias)
        self.decoder_layer = nn.Linear(in_features=n_hidden, out_features=n_in, bias=use_bias)

        self.linear = kwargs.get("linear", False)

    def forward(self, features):
        """Encode ``features``, apply ReLU unless ``self.linear``, then decode."""
        code = self.encoder_layer(features)
        return self.decoder_layer(code if self.linear else F.relu(code))
    
def train(model, epochs, train_loader, optimizer, criterion, verbose=True):
    """Train ``model`` to reconstruct its own input.

    Args:
        model: callable invoked as ``model(batch)``.
        epochs: number of passes over ``train_loader``.
        train_loader: iterable yielding batches of features.
        optimizer: optimizer whose ``zero_grad``/``step`` run once per batch.
        criterion: loss invoked as ``criterion(outputs, batch)``.
        verbose: print the loss after every epoch and a separator at the end.

    Returns:
        ``(model, loss)`` where ``loss`` is the mean per-batch loss of the
        last epoch, or ``nan`` when no batch was processed (``epochs == 0``
        or an empty loader).
    """
    # Send batches to wherever the model's weights live, so the function does
    # not silently depend on the notebook-level ``device``; that global is only
    # the fallback for models that expose no parameters.
    first_param = (
        next(iter(model.parameters()), None) if hasattr(model, "parameters") else None
    )
    target_device = first_param.device if first_param is not None else device

    # Defined up front so ``epochs == 0`` returns cleanly instead of raising.
    loss = float("nan")
    for epoch in range(epochs):
        running_loss = 0.0
        num_batches = 0
        for batch_features in train_loader:
            batch_features = batch_features.to(target_device)
            optimizer.zero_grad()

            outputs = model(batch_features)
            train_loss = criterion(outputs, batch_features)
            train_loss.backward()
            optimizer.step()

            running_loss += train_loss.item()
            num_batches += 1

        # Counting batches ourselves avoids a division by zero on an empty loader.
        loss = running_loss / num_batches if num_batches else float("nan")
        if verbose:
            print(f"epoch : {epoch + 1}/{epochs}, loss = {loss:.6f}")
    if verbose:
        print("===================\n")
    return model, loss

In [5]:
def gen_data(**kwargs):
    """Draw Gaussian samples, optionally confined to a random low-dimensional subspace.

    Keyword arguments:
        samples: number of rows to draw.
        input_width: number of columns.
        batch_size: batch size of the returned loader.
        true_dim: optional; when smaller than ``input_width`` only the first
            ``true_dim`` coordinates are kept non-zero and the data is then
            rotated by a random special-orthogonal matrix.

    Returns:
        ``(data0, train_loader)``: a tensor of shape ``(samples, input_width)``
        and a ``DataLoader`` iterating over its rows.
    """
    width = kwargs['input_width']
    data0 = torch.randn(kwargs['samples'], width)

    true_dim = kwargs.get('true_dim')
    if true_dim is not None and true_dim < width:
        data0[:, true_dim:] = 0
        # Convert the SciPy rotation to a tensor explicitly: multiplying a
        # tensor by a NumPy array relies on operator fallbacks whose result
        # type differs between library versions.
        rotation = torch.as_tensor(special_ortho_group.rvs(width), dtype=torch.float64)
        # Rotate in double precision, then return to single precision.
        data0 = (data0.double() @ rotation).float()

    train_loader = torch.utils.data.DataLoader(
        data0, batch_size=kwargs['batch_size']
    )

    return data0, train_loader

def PCA_compare(model, model_loss, data0, **kwargs):
    """Print the model's loss and reconstruction next to the PCA optimum.

    Args:
        model: trained autoencoder, invoked as ``model(data0)``.
        model_loss: loss value to report for the model.
        data0: data tensor whose rows are samples.
        **kwargs: must contain ``hidden_width`` (rank of the PCA
            approximation), ``batch_size`` and ``samples`` (used to rescale
            the PCA error).

    The PCA error is the sum of squared discarded singular values, rescaled
    by ``batch_size / samples`` -- presumably so it is comparable with the
    mean per-batch loss that ``train`` reports for a sum-reduced criterion.
    """
    U, s, Vt = np.linalg.svd(data0.detach().cpu().numpy(), full_matrices=False)

    # Clamp the rank: a hidden layer wider than the number of singular values
    # simply keeps all of them instead of failing on a negative padding length.
    rank = min(kwargs['hidden_width'], len(s))

    # Sum the discarded singular values directly (in double precision) rather
    # than subtracting two nearly equal sums, which leaks rounding noise into
    # the result.
    discarded = s[rank:].astype(np.float64)
    L_opt = float(np.sum(discarded ** 2)) * kwargs['batch_size'] / kwargs['samples']

    print("Model loss = ", model_loss)
    print("PCA loss = ", L_opt)

    # Evaluate on the device holding the model's weights; the notebook-level
    # ``device`` is only the fallback for models that expose no parameters.
    first_param = (
        next(iter(model.parameters()), None) if hasattr(model, "parameters") else None
    )
    target_device = first_param.device if first_param is not None else device

    with torch.no_grad():
        reconstruction = model(data0.to(target_device)).detach().cpu().numpy()
    print("model estimate:\n", reconstruction)
    # Rank-limited reconstruction from the leading singular triplets.
    print("\nPCA estimate:\n", (U[:, :rank] * s[:rank]) @ Vt[:rank])

## With a linear activation, the autoencoder is equivalent to PCA

### Underparametrization: hidden_width < true_dim

In [6]:
# hyperparams
params = dict(
    # data
    input_width=10,
    samples=10000,
    true_dim=5,

    # model
    hidden_width=3,
    linear=True,
    bias=False,

    # training
    lr=1e-3,
    batch_size=100,
    epochs=100,
)

# Build the synthetic dataset and its batch loader, then show the raw data.
data0, train_loader = gen_data(**params)
print(data0)

tensor([[ 0.063, -0.075, -1.177,  ..., -0.836, -0.472, -0.921],
        [-1.006,  0.459, -0.354,  ..., -0.596, -0.040,  0.089],
        [-0.846,  0.366, -1.969,  ...,  0.410,  0.420, -1.039],
        ...,
        [-0.295,  0.608, -0.234,  ..., -0.728, -1.632,  0.567],
        [-0.998,  0.617, -1.223,  ...,  1.108,  0.195, -0.486],
        [-0.446,  0.098,  0.699,  ..., -0.715,  0.286,  0.696]])


In [7]:
# Summed squared error, minimised with plain SGD.
crit = nn.MSELoss(reduction='sum')
model = shallow_AE(**params).to(device)
opt = optim.SGD(model.parameters(), lr=params['lr'])

model_trained, model_loss = train(
    model,
    params["epochs"],
    train_loader,
    opt,
    crit,
    verbose=False,
)

# Report the trained model next to the PCA reference.
PCA_compare(model_trained, model_loss, data0, **params)

Model loss =  194.5485447692871
PCA loss =  194.2754940210744
model estimate:
 [[-0.401  0.006 -1.003 ... -0.194  0.411 -0.803]
 [-0.468  0.349 -0.613 ... -0.172 -0.322 -0.025]
 [-1.057  0.418 -1.838 ... -0.335  0.163 -1.005]
 ...
 [-0.566  0.651 -0.149 ... -0.01  -0.897  0.643]
 [-1.227  0.674 -1.076 ...  0.195 -0.151 -0.451]
 [ 0.449 -0.082  0.283 ... -0.327 -0.384  0.499]]

PCA estimate:
 [[-0.426  0.012 -1.006 ... -0.22   0.411 -0.819]
 [-0.482  0.354 -0.619 ... -0.191 -0.324 -0.027]
 [-1.063  0.418 -1.81  ... -0.397  0.141 -0.988]
 ...
 [-0.588  0.656 -0.16  ...  0.004 -0.878  0.627]
 [-1.219  0.671 -1.05  ...  0.151 -0.173 -0.434]
 [ 0.446 -0.079  0.27  ... -0.314 -0.377  0.501]]


#### Does a ReLU network beat the linear network (i.e. PCA)?

In [8]:
# Switch the hidden layer to ReLU and retrain, this time with Adam.
params.update(linear=False)

crit = nn.MSELoss(reduction='sum')
model = shallow_AE(**params).to(device)
opt = optim.Adam(model.parameters(), lr=params['lr'])

model_trained, model_loss = train(
    model,
    params["epochs"],
    train_loader,
    opt,
    crit,
    verbose=True,
)

# Report the trained model next to the PCA reference.
PCA_compare(model_trained, model_loss, data0, **params)

epoch : 1/100, loss = 495.942300
epoch : 2/100, loss = 428.648785
epoch : 3/100, loss = 376.046432
epoch : 4/100, loss = 343.476146
epoch : 5/100, loss = 329.040612
epoch : 6/100, loss = 323.652824
epoch : 7/100, loss = 321.606869
epoch : 8/100, loss = 320.774074
epoch : 9/100, loss = 320.367750
epoch : 10/100, loss = 320.165468
epoch : 11/100, loss = 320.046449
epoch : 12/100, loss = 319.963443
epoch : 13/100, loss = 319.896577
epoch : 14/100, loss = 319.837841
epoch : 15/100, loss = 319.786994
epoch : 16/100, loss = 319.736912
epoch : 17/100, loss = 319.688715
epoch : 18/100, loss = 319.640509
epoch : 19/100, loss = 319.595795
epoch : 20/100, loss = 319.552417
epoch : 21/100, loss = 319.513413
epoch : 22/100, loss = 319.479889
epoch : 23/100, loss = 319.448478
epoch : 24/100, loss = 319.420873
epoch : 25/100, loss = 319.393067
epoch : 26/100, loss = 319.361754
epoch : 27/100, loss = 319.331929
epoch : 28/100, loss = 319.300603
epoch : 29/100, loss = 319.268758
epoch : 30/100, loss = 

### Overparametrization: hidden_width > true_dim

In [9]:
# Make the data one dimension narrower than the hidden layer and go back to
# the linear model.
params.update(true_dim=params['hidden_width'] - 1, linear=True)

# Regenerate the dataset and loader for the new intrinsic dimension.
data0, train_loader = gen_data(**params)
print(data0)

tensor([[ 0.086, -0.022,  0.008,  ...,  0.061,  0.096, -0.016],
        [-0.564,  0.570,  0.309,  ...,  1.265,  0.166,  0.295],
        [-0.062, -0.014, -0.031,  ..., -0.161, -0.125, -0.002],
        ...,
        [-0.023,  0.216,  0.176,  ...,  0.802,  0.367,  0.098],
        [ 0.838, -0.224,  0.069,  ...,  0.552,  0.919, -0.162],
        [ 0.456, -0.407, -0.204,  ..., -0.811, -0.033, -0.215]])


In [10]:
# Linear autoencoder on data whose intrinsic dimension is below the hidden
# width, trained with SGD on a summed squared error.
model = shallow_AE(**params).to(device)
opt = optim.SGD(model.parameters(), lr=params['lr'])
crit = nn.MSELoss(reduction='sum')

model_trained, model_loss = train(model, params["epochs"], train_loader, opt, crit, verbose=False)

# The stray bare ``model_loss`` expression was dropped: it was not the last
# statement of the cell, so it displayed nothing. PCA_compare prints the loss.
PCA_compare(model_trained, model_loss, data0, **params)

Model loss =  2.149715013204795e-12
PCA loss =  6.664752785130093e-06
model estimate:
 [[ 0.086 -0.022  0.008 ...  0.061  0.096 -0.016]
 [-0.564  0.57   0.309 ...  1.265  0.166  0.295]
 [-0.062 -0.014 -0.031 ... -0.161 -0.125 -0.002]
 ...
 [-0.023  0.216  0.176 ...  0.802  0.367  0.098]
 [ 0.838 -0.224  0.069 ...  0.552  0.919 -0.162]
 [ 0.456 -0.407 -0.204 ... -0.811 -0.033 -0.215]]

PCA estimate:
 [[ 0.086 -0.022  0.008 ...  0.061  0.096 -0.016]
 [-0.564  0.57   0.309 ...  1.265  0.166  0.295]
 [-0.062 -0.014 -0.031 ... -0.161 -0.125 -0.002]
 ...
 [-0.023  0.216  0.176 ...  0.802  0.367  0.098]
 [ 0.838 -0.224  0.069 ...  0.552  0.919 -0.162]
 [ 0.456 -0.407 -0.204 ... -0.811 -0.033 -0.215]]


In [14]:
# Same overparametrized data, now with a ReLU hidden layer.
params['linear'] = False

model = shallow_AE(**params).to(device)
opt = optim.SGD(model.parameters(), lr=params['lr'])
crit = nn.MSELoss(reduction='sum')

model_trained, model_loss = train(model, params["epochs"], train_loader, opt, crit, verbose=True)

# The stray bare ``model_loss`` expression was dropped: it was not the last
# statement of the cell, so it displayed nothing. PCA_compare prints the loss.
PCA_compare(model_trained, model_loss, data0, **params)

epoch : 1/100, loss = 74.141880
epoch : 2/100, loss = 32.560665
epoch : 3/100, loss = 32.535238
epoch : 4/100, loss = 32.538647
epoch : 5/100, loss = 32.503089
epoch : 6/100, loss = 32.470490
epoch : 7/100, loss = 32.461829
epoch : 8/100, loss = 32.394977
epoch : 9/100, loss = 32.324835
epoch : 10/100, loss = 32.283395
epoch : 11/100, loss = 32.229084
epoch : 12/100, loss = 32.185357
epoch : 13/100, loss = 32.153639
epoch : 14/100, loss = 32.119113
epoch : 15/100, loss = 32.088415
epoch : 16/100, loss = 32.054099
epoch : 17/100, loss = 32.000260
epoch : 18/100, loss = 31.951956
epoch : 19/100, loss = 31.910305
epoch : 20/100, loss = 31.866338
epoch : 21/100, loss = 31.858989
epoch : 22/100, loss = 31.866579
epoch : 23/100, loss = 31.857804
epoch : 24/100, loss = 31.840992
epoch : 25/100, loss = 31.832151
epoch : 26/100, loss = 31.830606
epoch : 27/100, loss = 31.834229
epoch : 28/100, loss = 31.834116
epoch : 29/100, loss = 31.834182
epoch : 30/100, loss = 31.834183
epoch : 31/100, los