# Program to solve the MNIST classification problem using CPD

In [1]:
import torch
from torch.autograd import Variable
from torch.nn.parameter import Parameter
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.nn.init as init
from torch.utils import data
import torchvision
from torchvision import transforms

import math
import matplotlib
import numpy as np
import matplotlib.pyplot as plt

import torch.multiprocessing as mp

# Use GPU if available
use_cuda = torch.cuda.is_available()
#use_cuda = False # Run on cpu even if cuda is available
device = torch.device("cuda:0" if use_cuda else "cpu")
print("Cuda is available: ",torch.cuda.is_available(),"\nUse cuda: ",use_cuda)

# PCA of the training set
perform_pca = True

Cuda is available:  True 
Use cuda:  True


# Loading MNIST

In [2]:
#To speed up training we'll only work on a subset of the data

from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.utils import check_random_state

# Load data from https://www.openml.org/d/554
X, y = fetch_openml('mnist_784', version=1, return_X_y=True)

# Performing PCA

In [3]:
if perform_pca:
    from scipy.linalg import svd

    X = X.astype('float32')
    N, M = X.shape

    # Std data
    Y = X - np.ones((N,1))*X.mean(axis=0)
    Y /= 255.
    ## Y = Y[:,np.std(Y,0) != 0]
    ## Y = Y*(1/np.std(Y,0))

    # PCA
    U,S,V = svd(Y,full_matrices=False)
    rho = (S*S) / (S*S).sum()
    Z = Y@V # matrix multiplication
    Z = Z[:,:15]

# Create train, val & test

In [4]:
# Specify the size of the training, cross validation and test set
train_samples = 1024 #12800  #1024
cv_samples    = 512 #1024 #512
test_samples  = 512 #1024 #512

y = y.astype('int32')

if perform_pca:
    ### Use output from PCA ###
    random_state = check_random_state(0)
    permutation = random_state.permutation(X.shape[0])
    Z = Z[permutation]
    y = y[permutation]
    Z = Z.reshape((X.shape[0], -1))

    x_train, X_test, targets_train, y_test = train_test_split(
        Z, y, train_size=train_samples, test_size= (cv_samples+test_samples) )

    x_valid, x_test, targets_valid, targets_test = train_test_split(
        X_test, y_test, train_size=cv_samples, test_size=test_samples)
else:
    ### no PCA, use raw data ###
    # Split in training, testing and validation set
    x_train, X_test, targets_train, y_test = train_test_split(
        X, y, train_size=train_samples, test_size=(cv_samples+test_samples))

    x_valid, x_test, targets_valid, targets_test = train_test_split(
        X_test, y_test, train_size=cv_samples, test_size=test_samples)

num_features = x_train.shape[1]

print("Information on dataset")
print("x_train", x_train.shape)
print("targets_train", targets_train.shape)
print("x_valid", x_valid.shape)
print("targets_valid", targets_valid.shape)
print("x_test", x_test.shape)
print("targets_test", targets_test.shape)
print('num_features: ',num_features)

Information on dataset
x_train (1024, 15)
targets_train (1024,)
x_valid (512, 15)
targets_valid (512,)
x_test (512, 15)
targets_test (512,)
num_features:  15


# Hyperparameters

In [5]:
rank = 7 # Rank of the full decomposed tensor
poly_order = 3

# Vandermonde vectors

In [6]:
def vandermonde_vec(dataset, num_instances, num_features):
    u = np.zeros((num_instances,num_features,poly_order))
    # u is a 3-dimensional tensor, which contains the Vandermonde vectors for every feature
    # and every training point
    
    # Get powers
    for i in range(num_instances):
        for j in range(num_features):
            for k in range(poly_order):
                u[i,j,k] = np.power([dataset[i,j]], k)
    
    return u

x_train = vandermonde_vec(x_train, x_train.shape[0], num_features)
x_valid = vandermonde_vec(x_valid, x_valid.shape[0], num_features)
x_test = vandermonde_vec(x_test, x_test.shape[0], num_features)

x_train = torch.FloatTensor(x_train)
x_valid = torch.FloatTensor(x_valid)
x_test = torch.FloatTensor(x_test)

## Data Generator

In [7]:
batch_size = 64 # used by the generators and given as input to net()
# Parameters for the generator
if use_cuda:
    params = {'batch_size': batch_size,
              'shuffle': True,
              'num_workers': 8,
              'drop_last': True,
              'pin_memory': True}
else:
    params = {'batch_size': batch_size,
              'shuffle': True,
              'num_workers': 8,
              'drop_last': True}

# Using Dataset class
class Dataset(data.Dataset):
    def __init__(self, features, targets):
    # Initialise data to be used by the other functions
        self.features = features
        self.targets = targets
    
    def __len__(self):
    # Length of the dataset, i.e. number of samples
        return len(self.targets)
    
    def __getitem__(self, index):
    # Returns sample from the dataset
        x = torch.FloatTensor(self.features[index])
        y = self.targets[index]
        
        return x, y
    
# Apply generators to the data
training_set = Dataset(x_train, targets_train)
training_generator = data.DataLoader(training_set, **params)

validation_set = Dataset(x_valid, targets_valid)
validation_generator = data.DataLoader(validation_set, **params)

test_set = Dataset(x_test, targets_test)
test_generator = data.DataLoader(test_set, **params)

# Model

We have the following parameters:

- $ n $: number of outputs
- $ p $: dimension of vandermonde vectors
- $ D $: rank of decomposed tensor

We want to do the function contraction

$ f = \sum_i^D \textbf{U}_i  \, \in \mathbb{R}^n $

Where:

$ \textbf{U} = \textbf{M}_1 \odot \textbf{M}_2 \odot ... \odot \textbf{M}_d  \, \in \mathbb{R}^{D \times n} $

$ \odot $ is the elementwise vector multiplication and

$ \textbf{M}_i = (\textbf{v}^{(i)})^T \mathcal{A}_i^{p \times D \times n}
    \, \in \mathbb{R}^{D \times n} $

In [8]:
# Number of output classes
num_classes = 10

# define network
class Net(nn.Module):

    def __init__(self, num_features, poly_order, num_output, rank):
        super(Net, self).__init__()  
        
        # weight tensors collected in one tensor
        tn_size = tuple([num_features] + [poly_order] + [rank] + [num_output]) # size of all tensors A_i
        self.A = Parameter(init.normal_(torch.empty(tn_size, requires_grad=True), std=0.575))

    def forward(self, vec_input, batch_size, print_expr=False):
        
        m = torch.einsum('abcd,eab->aced',self.A ,vec_input)
        f = torch.prod(m,0)
        
        return torch.sum(f,0)

In [9]:
net = Net(num_features, poly_order, num_classes, rank).to(device)

optimizer = optim.Adam(net.parameters())
criterion = nn.CrossEntropyLoss()

# Build the training loop

In [10]:
# Training the model
# we could have done this ourselves,
# but we should be aware of sklearn and it's tools
from sklearn.metrics import accuracy_score

# setting hyperparameters and gettings epoch sizes
#batch_size = 64 this is set by the generators
num_epochs = 1000

#if run_as_dist:
#    dist.init_process_group("gloo", rank=rank, world_size=size)

# setting up lists for handling loss/accuracy
train_acc, train_loss = [], []
valid_acc, valid_loss = [], []
test_acc, test_loss = [], []
cur_loss = 0
losses = []

for epoch in range(num_epochs):
    # Forward -> Backprob -> Update params
    ## Train
    net.train()
    cur_loss = 0
    count = 0
    for data, labels in training_generator:
        count += 1
        # Transfer training data and targets to device
        data = data.to(device)
        target_batch = Variable(labels.long()).to(device)
        
        # Send it through the model
        output = net(data, batch_size)
          
        # compute gradients given loss
        batch_loss = criterion(output, target_batch)
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()
        
        #if count % 25 == 0:
        #    print(next(net.parameters()))
        
        cur_loss += batch_loss   
        
        # Manually release memory
        try:
            del data, labels, target_batch, output, batch_loss 
            torch.cuda.empty_cache()
        except:
            print("Memory not released in training.")
       
    losses.append(cur_loss / batch_size)

    net.eval()
    ### Evaluate training
    train_preds, train_targs = [], []
    for data, labels in training_generator:
        data = data.to(device)
        
        output = net(data, batch_size)
        preds = torch.max(output, 1)[1]
        
        train_targs += list(labels)
        train_preds += list(preds.data.cpu().numpy())
        
        # Manually release memory
        try:
            del data, labels, output, preds 
            torch.cuda.empty_cache()
        except:
            print("Memory not released in training evaluation.")
    
    ### Evaluate validation
    val_preds, val_targs = [], []
    for data, labels in validation_generator:
        data = data.to(device)
        
        output = net(data, batch_size)
        preds = torch.max(output, 1)[1]
        val_preds += list(preds.data.cpu().numpy())
        val_targs += list(labels)
        
        # Manually release memory
        try:
            del data, labels, output, preds
            torch.cuda.empty_cache()
        except:
            print("Memory not released in validation.")
        
    train_acc_cur = accuracy_score(train_targs, train_preds)
    valid_acc_cur = accuracy_score(val_targs, val_preds)
    
    train_acc.append(train_acc_cur)
    valid_acc.append(valid_acc_cur)
    
    if epoch % 10 == 0:
        print("Epoch %2i : Train Loss %f , Train acc %f, Valid acc %f" % (
                epoch+1, losses[-1], train_acc_cur, valid_acc_cur))
        print("CUDA memory usage in Gb: ",torch.cuda.memory_allocated()*1000**(-3))
        print('Norm G\'s',torch.norm(next(net.parameters()),p='fro'))
        #print(next(net.parameters()))
        
#if run_as_dist:
#    dist.reduce(x, dst=0)
        
epoch = np.arange(len(train_acc))
plt.figure()
plt.plot(epoch, train_acc, 'r', epoch, valid_acc, 'b')
plt.legend(['Train Accucary','Validation Accuracy'])
plt.xlabel('Updates'), plt.ylabel('Acc')

Epoch  1 : Train Loss 0.575645 , Train acc 0.138672, Valid acc 0.119141
CUDA memory usage in Gb:  3.584e-05
Norm G's tensor(26.6752, device='cuda:0', grad_fn=<NormBackward0>)
Epoch 11 : Train Loss 0.575169 , Train acc 0.184570, Valid acc 0.187500
CUDA memory usage in Gb:  4.096e-05
Norm G's tensor(28.1780, device='cuda:0', grad_fn=<NormBackward0>)
Epoch 21 : Train Loss 0.542227 , Train acc 0.207031, Valid acc 0.214844
CUDA memory usage in Gb:  4.6080000000000006e-05
Norm G's tensor(32.3354, device='cuda:0', grad_fn=<NormBackward0>)
Epoch 31 : Train Loss 0.526144 , Train acc 0.217773, Valid acc 0.232422
CUDA memory usage in Gb:  5.1200000000000004e-05
Norm G's tensor(34.5144, device='cuda:0', grad_fn=<NormBackward0>)
Epoch 41 : Train Loss 0.518911 , Train acc 0.232422, Valid acc 0.236328
CUDA memory usage in Gb:  5.632e-05
Norm G's tensor(35.8878, device='cuda:0', grad_fn=<NormBackward0>)
Epoch 51 : Train Loss 0.512537 , Train acc 0.239258, Valid acc 0.246094
CUDA memory usage in Gb:  6

KeyboardInterrupt: 

In [11]:
for G in net.parameters():
    print(G)

Parameter containing:
tensor([[[[ 3.3556e-02, -8.2192e-01, -9.1985e-01,  ..., -1.6071e-01,
            4.7410e-01,  7.9224e-01],
          [-8.8718e-01,  1.2570e+00,  1.7975e-01,  ...,  9.0794e-01,
           -1.0174e+00, -2.3983e+00],
          [ 6.2381e-01,  3.9675e-02, -9.3445e-01,  ...,  1.3172e+00,
            6.4101e-01,  1.0568e+00],
          ...,
          [ 1.6745e+00, -1.4628e+00, -8.0604e-01,  ..., -1.4626e+00,
           -5.5019e-01,  1.5910e+00],
          [ 2.4761e+00,  1.6407e-01, -1.3961e-01,  ..., -6.8486e-01,
            1.7981e-01,  1.2158e+00],
          [ 2.6932e-01,  1.2528e+00,  1.0200e+00,  ...,  2.3375e-02,
           -7.0846e-01, -3.7548e-01]],

         [[ 9.3180e-01,  6.3440e-01, -1.8776e-01,  ..., -9.3470e-01,
           -3.4082e-01, -3.5804e-01],
          [-1.8652e+00, -6.1778e-01, -9.5931e-01,  ...,  2.9813e-01,
            8.2676e-01, -5.9309e-01],
          [ 1.9158e+00, -3.5770e-01, -1.3111e+00,  ...,  1.1865e-01,
            1.7118e-02,  1.8258e-03]