In [1]:
# Import libraries the application needs
from __future__ import print_function
from itertools import count
import torch
import torch.autograd
import torch.nn.functional as F

In [2]:
# Import libraries needed to use the accelerator
import pynq.lib.dma # For using the DMA
from pynq import Xlnk # Used for allocating contiguous arrays
import numpy as np # Xlnk uses numpy arrays
from pynq import Overlay # Used to download the bitstream
import struct
from pynq import DefaultIP # Used for AXI-Lite class

In [3]:
class ReturnValuesDriver(DefaultIP):
    # The following class is used to retrieve data from the AXI-Lite interface
    
    def __init__(self, description):
        super().__init__(description=description)

    bindto = ['xilinx.com:hls:backward_lite:1.0'] # The class will be associated to the backward_lite IP in Vivado

    """The following functions represent each variable that will be returned. Simply calling these will allow us to obtain the 
     output. We use the unpack function from the struct package to be able to read floating-point numbers as by default they
     are read as integers. The addresses from where we read (i.e. 0x10, 0x18) are taken from Vivado HLS."""
    @property
    def bias(self):
        return struct.unpack("f", struct.pack("I", self.read(0x10)))[0]
    @property
    def w1(self):
        return struct.unpack("f", struct.pack("I", self.read(0x18)))[0]
    @property
    def w2(self):
        return struct.unpack("f", struct.pack("I", self.read(0x20)))[0]
    @property
    def w3(self):
        return struct.unpack("f", struct.pack("I", self.read(0x28)))[0]
    @property
    def w4(self):
        return struct.unpack("f", struct.pack("I", self.read(0x30)))[0]
    @property
    def w5(self):
        return struct.unpack("f", struct.pack("I", self.read(0x38)))[0]

In [4]:
# The network will learn a polynomial, for which the following are taken from the example itself unless stated
POLY_DEGREE = 5
W_target = torch.randn(POLY_DEGREE, 1) * 5
b_target = torch.randn(1) * 5

def make_features(x):
    """Builds features i.e. a matrix with columns [x, x^2, x^3, x^4]."""
    # This function is one of the kernels which are executed in hardware.
    
    # We first perform some pre-processing
    x = x.unsqueeze(1)
    in_buffer[:] = x.data.numpy()[:]
    
    # Then, we must provide the data to the hardware through the DMA
    dma2.sendchannel.transfer(in_buffer) # providing the inputs
    dma2.recvchannel.transfer(out_buffer) # providing the contiguous array to where the output should be placed
    dma2.sendchannel.wait() # Wait until the last flag is set high.
    dma2.recvchannel.wait()
    return torch.tensor(out_buffer) # Some post-processing to convert the numpy array to a tensor


def f(x):
    """Approximated function."""
    return x.mm(W_target) + b_target.item()

def poly_desc(W, b):
    """Creates a string description of a polynomial."""
    result = 'y = '
    for i, w in enumerate(W):
        result += '{:+.2f} x^{} '.format(w, len(W) - i)
    result += '{:+.2f}'.format(b[0])
    return result

def get_batch(batch_size=32):
    """Builds a batch i.e. (x, f(x)) pair."""
    random = torch.randn(batch_size)
    x = make_features(random)
    y = f(x)
    return x, y

In [5]:
overlay = Overlay('/home/xilinx/Regression/backward_lite_features.bit') # Download the bitstream onto the FPGA

# In this accelerator, we are accelerating two kernels and each has its own DMA which are assigned here:
dma1 = overlay.axi_dma_0 # Backward
dma2 = overlay.axi_dma_1 # Equation Matrix

# Since this IP uses AXI-Lite for the output, we can associate that to a variable and then use our class defined above through 
# this
backward_ip = overlay.backward_lite_0

xlnk = Xlnk() # Used for allocation
# Allocating the contiguous arrays of a fixed size:
in_stream = xlnk.cma_array(shape=(32+32+(32*5),1), dtype=np.float32)
in_buffer = xlnk.cma_array(shape=(32,1), dtype=np.float32)
out_buffer = xlnk.cma_array(shape=(32,5), dtype=np.float32)

In [7]:
fc = torch.nn.Linear(W_target.size(0), 1)

for batch_idx in count(1):
    # Get data
    batch_x, batch_y = get_batch()

    myVar = fc(batch_x)
    # Forward pass
    output = F.smooth_l1_loss(myVar, batch_y)
    loss = output.item()

    # Pre-processing for the other accelerator
    batch_x_stream = batch_x.t()
    batch_x_stream = batch_x_stream.reshape(32*5, 1)
    in_stream[:] = torch.cat((myVar.data, batch_y.data, batch_x_stream.data), 0).numpy()[:]
    
    # Transfer data to the DMA
    dma1.sendchannel.transfer(in_stream)
    dma1.sendchannel.wait()
        
    # Obtaining the output from the AXI-Lite interface, as well as post-processing
    bias_grad = torch.tensor([backward_ip.bias])
    weight_grad = torch.tensor([[backward_ip.w1, backward_ip.w2, backward_ip.w3, backward_ip.w4, backward_ip.w5]])
        
    #print(weight_grad, bias_grad)
    flag = True
    for param in fc.parameters():
        if (flag):
            param.data.add_(-0.1 * weight_grad)
            flag = False
        else:
            param.data.add_(-0.1 * bias_grad)

    # Stop criterion
    if loss < 1e-3:
        break

print('Loss: {:.6f} after {} batches'.format(loss, batch_idx))
print('==> Learned function:\t' + poly_desc(fc.weight.view(-1), fc.bias))
print('==> Actual function:\t' + poly_desc(W_target.view(-1), b_target))

Loss: 0.000680 after 12876 batches
==> Learned function:	y = -2.41 x^5 +0.11 x^4 -3.11 x^3 -1.80 x^2 -0.66 x^1 -1.70
==> Actual function:	y = -2.39 x^5 +0.19 x^4 -3.10 x^3 -1.84 x^2 -0.66 x^1 -1.70
