In [1]:
import torch
from torch.autograd import Variable
from torch.nn.parameter import Parameter
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.nn.init as init
import torch.utils as tutils
import torchvision
from torchvision import transforms

from sklearn.metrics import accuracy_score # used in training

import math
import matplotlib
import numpy as np
import matplotlib.pyplot as plt

import torch.multiprocessing as mp

# Device selection: first GPU when CUDA is present, otherwise the CPU
use_cuda = torch.cuda.is_available()
#use_cuda = False # Uncomment to force CPU execution even if CUDA is available
device = torch.device("cuda:0") if use_cuda else torch.device("cpu")
print("Cuda is available: ", torch.cuda.is_available(), "\nUse cuda: ", use_cuda)

# Dimensionality reduction of the inputs with PCA
perform_pca = True
n_PCA = 5  # number of principal components kept

# Sizes of the training, cross-validation and test sets
# (larger alternatives previously noted here: 12800 / 1024 / 1024)
train_samples, cv_samples, test_samples = 1024, 512, 512

# Number of passes over the training set
num_epochs = 1000

# Mini-batch size; used by the data generators and passed to net()
batch_size = 64

# Length of the Vandermonde vector of each feature
poly_order = 2  # an integer gives the same length for all features
#poly_order = [3,2,3,2,3,2,3,2,3,2] # a list gives one length per feature
#poly_order_is_list = True # set to True if poly_order is given as a list of ints

# Size of the outer dimensions (mode-1 and mode-3) of the cores in the tensor ring
core_dim = 3

# Number of output classes, i.e. the ten digits in MNIST
num_classes = 10

Cuda is available:  True 
Use cuda:  True


## Tensor ring / tensor chain

Classification of the MNIST dataset using the tensor ring decomposition as the model structure. It is based on MNIST.ipynb.

The tensor ring (or tensor chain) is the same as the tensor train, except that all the core tensors have the same dimensions and the 1st and Nth core tensors are also connected, as illustrated in the figure.

![title](ji_2019_fig26.png)

Equation (11) may therefore still be used to compute f, except that now all G's have the same dimensions.

The result is then obtained by 

\begin{equation}
    f(x) = [\mathcal{G_1} \times_2 \nu(x_1)] \times_{3,1} [\mathcal{G_2} \times_2 \nu(x_2)] \times_{3,1} [\mathcal{G_3} \times_2 \nu(x_3)] \times_{3,1} \mathcal{G_4} \times_{3,1} \mathcal{G_1}
\end{equation}

In the case of three-dimensional input (num_features) $x = (x_1,x_2,x_3)^T$ 

In [2]:
## Load MNIST
# To speed up training we'll only work on a subset of the data

from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.utils import check_random_state

# Load data from https://www.openml.org/d/554
X, y = fetch_openml('mnist_784', version=1, return_X_y=True)

# Depending on the scikit-learn version, fetch_openml may hand back pandas
# objects instead of NumPy arrays. The following cells index X and y by
# position (e.g. dataset[i, j], targets[index]), so coerce both to ndarrays.
# This is a no-op when they already are arrays.
X, y = np.asarray(X), np.asarray(y)

# PCA

In [3]:
if perform_pca:
    from scipy.linalg import svd

    X = X.astype('float32')
    N, M = X.shape

    # Centre every column on its mean and rescale by 255.
    # Broadcasting replaces the dense np.ones((N,1))*mean helper matrix; the
    # float64 cast of the mean keeps Y in float64 exactly as before.
    Y = (X - X.mean(axis=0).astype('float64')) / 255.
    ## Y = X - np.ones((N,1))*X.mean(axis=0)
    ## Y = Y[:,np.std(Y,0) != 0]
    ## Y = Y*(1/np.std(Y,0))

    # PCA via the singular value decomposition Y = U diag(S) Vh.
    # scipy returns Vh, whose ROWS are the right singular vectors, so it must
    # be transposed before projecting; Y @ Vh would project onto the wrong basis.
    U, S, Vh = svd(Y, full_matrices=False)
    V = Vh.T

    # Fraction of the total variance explained by each component
    rho = (S*S) / (S*S).sum()

    # Project onto the first n_PCA principal directions only
    Z = Y @ V[:, :n_PCA]

# Create train, val & test

In [4]:
# Targets as a plain integer ndarray so that they can be indexed by position
y = np.asarray(y).astype('int32')

# One seeded generator drives the shuffle AND both splits, which makes the
# train/validation/test partition reproducible after a kernel restart.
random_state = check_random_state(0)

if perform_pca:
    ### Use output from PCA ###
    permutation = random_state.permutation(X.shape[0])
    Z = Z[permutation]
    y = y[permutation]
    Z = Z.reshape((X.shape[0], -1))
    features = Z
else:
    ### no PCA, use raw data ###
    features = np.asarray(X)

# First split: training set versus a hold-out pool (X_test / y_test) that is
# large enough for both the validation and the test set.
x_train, X_test, targets_train, y_test = train_test_split(
    features, y, train_size=train_samples, test_size=(cv_samples+test_samples),
    random_state=random_state)

# Second split: divide the hold-out pool into validation and test sets.
x_valid, x_test, targets_valid, targets_test = train_test_split(
    X_test, y_test, train_size=cv_samples, test_size=test_samples,
    random_state=random_state)

num_features = x_train.shape[1]

print("Information on dataset")
print("x_train", x_train.shape)
print("targets_train", targets_train.shape)
print("x_valid", x_valid.shape)
print("targets_valid", targets_valid.shape)
print("x_test", x_test.shape)
print("targets_test", targets_test.shape)
print('num_features: ',num_features)

Information on dataset
x_train (1024, 5)
targets_train (1024,)
x_valid (512, 5)
targets_valid (512,)
x_test (512, 5)
targets_test (512,)
num_features:  5


In [5]:
def vandermonde_vec(dataset, num_instances, num_features, poly_order, poly_order_is_list=False):
    """Build the Vandermonde vectors of a data matrix as a three-way array.

    Entry ``u[i, j, k]`` is ``dataset[i, j] ** k``. When a separate order is
    given per feature, entries with ``k >= poly_order[j]`` are set to 0 so that
    all features share the length of the largest order.

    Parameters
    ----------
    dataset : array-like, at least (num_instances, num_features)
        Input values; only the leading ``num_instances`` rows and
        ``num_features`` columns are used.
    num_instances : int
        Number of rows (samples) to transform.
    num_features : int
        Number of columns (features) to transform.
    poly_order : int or sequence of int
        Length of the Vandermonde vector, either shared by all features (int)
        or given per feature (sequence).
    poly_order_is_list : bool, optional
        Force the per-feature interpretation of ``poly_order``. A list, tuple
        or ndarray is treated as per-feature even when this flag is False.

    Returns
    -------
    numpy.ndarray
        float64 array of shape (num_instances, num_features, max order).

    Raises
    ------
    IndexError
        If ``dataset`` is not two-dimensional or is smaller than the requested
        size, or if fewer than ``num_features`` orders are supplied.
    """
    data = np.asarray(dataset)
    if data.ndim != 2 or data.shape[0] < num_instances or data.shape[1] < num_features:
        raise IndexError(
            "dataset of shape %s cannot provide %d instances with %d features"
            % (data.shape, num_instances, num_features))
    data = data[:num_instances, :num_features].astype(np.float64)

    per_feature = poly_order_is_list or isinstance(poly_order, (list, tuple, np.ndarray))
    if per_feature:
        # The size of the last axis depends on the largest poly_order
        max_order = int(max(poly_order))
        orders = np.atleast_1d(np.asarray(poly_order))[:num_features]
        if orders.shape[0] < num_features:
            raise IndexError("poly_order must provide one order per feature")
    else:
        max_order = int(poly_order)

    # All powers at once: broadcast (instances, features, 1) against (orders,)
    exponents = np.arange(max_order)
    u = np.power(data[:, :, np.newaxis], exponents)

    if per_feature:
        # Keep x**k only where k is below the order of that feature, else insert 0
        keep = exponents[np.newaxis, :] < orders[:, np.newaxis]
        u = np.where(keep, u, 0.0)

    return u

## Data Generator

In [6]:
# Keyword arguments handed to every DataLoader
params = dict(batch_size=batch_size,
              shuffle=True,
              num_workers=8,
              drop_last=True)
if use_cuda:
    # pin_memory is only requested when running on the GPU
    params['pin_memory'] = True

# Map-style dataset wrapping pre-computed features and their targets
class Dataset(tutils.data.Dataset):
    """Pairs each row of ``features`` with the entry of ``targets`` at the same index."""

    def __init__(self, features, targets):
        """Keep references to the feature and target containers."""
        self.features, self.targets = features, targets

    def __len__(self):
        """Number of samples, taken from the length of the targets."""
        return len(self.targets)

    def __getitem__(self, index):
        """Return ``(x, y)``: the features at ``index`` as a FloatTensor and the raw target."""
        return torch.FloatTensor(self.features[index]), self.targets[index]

## Model

In [7]:
# define network
class Net(nn.Module):
    """Tensor-ring classifier operating on Vandermonde-expanded features.

    The model holds one core of shape (outer_dim, poly_order, outer_dim) per
    feature, stacked along a fourth axis in ``Gstack``, plus a final core
    ``GN`` of shape (outer_dim, num_output, outer_dim) that carries the class
    dimension and closes the ring.

    Parameters
    ----------
    num_features : int
        Number of input features (one core per feature).
    poly_order : int or sequence of int
        Length of the Vandermonde vector; a sequence gives one length per
        feature and requires ``poly_order_is_list=True``.
    num_output : int
        Number of outputs (classes).
    outer_dim : int
        Size of the first and third mode of every core.
    poly_order_is_list : bool, optional
        Whether ``poly_order`` is a per-feature sequence.
    """

    def __init__(self, num_features, poly_order, num_output, outer_dim, poly_order_is_list=False):
        super(Net, self).__init__()
        poly_list = None
        if poly_order_is_list:
            poly_list = poly_order
            poly_order = max(poly_order)

        self.num_features = num_features
        self.poly_order = poly_order

        # Dimensions of the core tensors: all feature cores share one size,
        # only the last core differs (its middle mode is the output dimension).
        gn_size = (outer_dim, num_output, outer_dim)
        gstack_size = (outer_dim, poly_order, outer_dim, num_features)

        # Elements are drawn from a uniform distribution on [lb, ub]
        bound_i = 1/math.sqrt(outer_dim)
        lb = 0.1*bound_i
        ub = 1.0*bound_i

        gstack = init.uniform_(torch.empty(gstack_size), a=lb, b=ub)
        gn = init.uniform_(torch.empty(gn_size), a=lb, b=ub)

        # Features with a shorter Vandermonde vector get zeros in the unused
        # slices. This is done on the plain tensor BEFORE wrapping it in a
        # Parameter: an in-place write into a leaf tensor that requires grad
        # raises a RuntimeError.
        if poly_list is not None:
            for j in range(num_features):
                if poly_list[j] < poly_order:
                    gstack[:, poly_list[j]:poly_order, :, j] = 0.0

        # The feature cores are stacked along the last axis
        self.Gstack = Parameter(gstack)
        # The last core has the number of classes as its inner dimension
        self.GN = Parameter(gn)

        print('Stacked G\'s: ', self.Gstack.shape,'\nGN: ',self.GN.shape)

    def forward(self, tensor_input, batch_size=None, print_expr=False):
        """Evaluate the tensor ring for a batch of Vandermonde vectors.

        Parameters
        ----------
        tensor_input : torch.Tensor
            Shape (batch, num_features, poly_order), as contracted by the
            first einsum below.
        batch_size, print_expr
            Unused; kept so that existing calls ``net(data, batch_size)``
            continue to work.

        Returns
        -------
        torch.Tensor
            Shape (batch, num_output).
        """
        # Contract every core with its Vandermonde vector:
        # result is (outer_dim, batch, outer_dim, num_features)
        Gv_stack = torch.einsum('abcd, edb -> aecd', self.Gstack, tensor_input)

        # Chain the per-feature matrices: G_{i-1} x_{3,1} G_i, batched over b
        f_stack = Gv_stack[:, :, :, 0]
        for i in range(1, self.num_features):
            f_stack = torch.einsum('abc, cbe -> abe', f_stack, Gv_stack[:, :, :, i])

        # Multiply by the last core and close the ring (indices a and c are
        # both summed), leaving (batch, num_output)
        return torch.einsum('abc, cda -> bd', f_stack, self.GN)

In [8]:
# Transform input data
num_features = x_train.shape[1]

# A list/tuple poly_order means one Vandermonde length per feature
poly_order_is_list = isinstance(poly_order, (list, tuple))

print("train before vandermonde",x_train.shape)

x_train = vandermonde_vec(x_train, x_train.shape[0], num_features, poly_order, poly_order_is_list)
x_valid = vandermonde_vec(x_valid, x_valid.shape[0], num_features, poly_order, poly_order_is_list)
x_test = vandermonde_vec(x_test, x_test.shape[0], num_features, poly_order, poly_order_is_list)

print("train after vandermonde",x_train.shape)

# Apply generators to the data
training_set = Dataset(x_train, targets_train)
training_generator = tutils.data.DataLoader(training_set, **params)

# Evaluation must see every sample exactly once: reuse the loader settings
# but switch off shuffling and the dropping of an incomplete last batch.
eval_params = dict(params, shuffle=False, drop_last=False)
train_eval_generator = tutils.data.DataLoader(training_set, **eval_params)

validation_set = Dataset(x_valid, targets_valid)
validation_generator = tutils.data.DataLoader(validation_set, **eval_params)

test_set = Dataset(x_test, targets_test)
test_generator = tutils.data.DataLoader(test_set, **eval_params)

# Initialise model
net = Net(num_features, poly_order, num_classes, core_dim, poly_order_is_list).to(device)

#optimizer = optim.Adam(net.parameters())
# Optimizer with l2 regularisation
#optimizer = optim.Adam(net.parameters(), lr=1e-3, weight_decay=1e-5)
# The optimizer must be built from the parameters of `net` (no `model` exists)
optimizer = optim.LBFGS(net.parameters())
criterion = nn.CrossEntropyLoss()

print('Norm G\'s',torch.norm(next(net.parameters()),p='fro'))


def predict_all(generator):
    """Run `net` over `generator` and return (targets, predictions) as lists of ints."""
    targs, preds = [], []
    # No gradients are needed for evaluation, so no autograd graph is built
    with torch.no_grad():
        for data, labels in generator:
            output = net(data.to(device), batch_size)
            preds += torch.max(output, 1)[1].cpu().tolist()
            targs += labels.tolist()
    return targs, preds


### Training the model ###
# setting up lists for handling loss/accuracy
train_acc, train_loss = [], []
valid_acc, valid_loss = [], []
test_acc, test_loss = [], []
cur_loss = 0
losses = []

for epoch in range(num_epochs):
    # Forward -> Backprop -> Update params
    ## Train
    net.train()
    cur_loss = 0.0
    count = 0
    for data, labels in training_generator:
        count += 1
        # Transfer training data and targets to device
        data = data.to(device)
        target_batch = labels.long().to(device)

        def closure():
            # LBFGS re-evaluates the loss several times per step, so the
            # forward/backward pass is handed to it as a closure. First-order
            # optimizers such as Adam accept the same call.
            optimizer.zero_grad()
            loss = criterion(net(data, batch_size), target_batch)
            loss.backward()
            return loss

        batch_loss = optimizer.step(closure)

        # Accumulate a Python float, not a tensor
        cur_loss += batch_loss.item()

    # Mean loss per batch for this epoch (the criterion already averages
    # within a batch, so the sum is divided by the number of batches)
    losses.append(cur_loss / max(count, 1))

    net.eval()
    ### Evaluate training
    train_targs, train_preds = predict_all(train_eval_generator)

    ### Evaluate validation
    val_targs, val_preds = predict_all(validation_generator)

    train_acc_cur = accuracy_score(train_targs, train_preds)
    valid_acc_cur = accuracy_score(val_targs, val_preds)

    train_acc.append(train_acc_cur)
    valid_acc.append(valid_acc_cur)

    if epoch % 10 == 0:
        print("Epoch %2i : Train Loss %f , Train acc %f, Valid acc %f" % (
                epoch+1, losses[-1], train_acc_cur, valid_acc_cur))
        if use_cuda:
            print("CUDA memory usage in Gb: ",torch.cuda.memory_allocated()*1000**(-3))
        print('Norm G\'s',torch.norm(next(net.parameters()),p='fro'))

train before vandermonde (1024, 5)
train after vandermonde (1024, 5, 2)
Stacked G's:  torch.Size([3, 2, 3, 5]) 
GN:  torch.Size([3, 10, 3])


RuntimeError: CUDA error: out of memory

In [None]:
# Print the parameters of the model
# The first two parameters of the network are the stacked cores and the last core
G_net, GN_net = list(net.parameters())[:2]

# Chain the cores feature by feature, starting from the first one
f_net = G_net[..., 0]
for core_idx in range(1, num_features):
    f_net = torch.einsum('abc, cbe -> abe', f_net, G_net[..., core_idx])

# Contract with the last core, summing over both outer indices
f_net = torch.einsum('abc, cda -> bd', f_net, GN_net)

print('Norm G\'s', torch.norm(G_net, p='fro'))
print('f shape: ', f_net.shape, '\nf: ', f_net)