## PYNQ test hls4ml NN IP

In [1]:
from pynq import Overlay
import numpy as np

## Load bitfile (overlay)

In [2]:
# Build the Overlay object from the bitstream file next to this notebook;
# every later cell reaches the hardware IP through `overlay`.
overlay = Overlay("./nn_axis.bit")

## Inspect the structure of the overlay
To pretty-print the dictionary output and collapse its elements, copy-paste it into the formatter [here](https://jsonformatter.curiousconcept.com/).

In [3]:
# Dump the overlay's IP dictionary, but only when the overlay reports
# itself as loaded.
overlay_ready = overlay.is_loaded()
if overlay_ready:
    print(overlay.ip_dict)

{'axi_dma_0': {'fullpath': 'axi_dma_0', 'type': 'xilinx.com:ip:axi_dma:7.1', 'state': None, 'addr_range': 65536, 'phys_addr': 1077936128, 'mem_id': 'S_AXI_LITE', 'gpio': {}, 'interrupts': {'mm2s_introut': {'controller': 'axi_intc_0', 'index': 0, 'fullpath': 'axi_dma_0/mm2s_introut'}, 's2mm_introut': {'controller': 'axi_intc_0', 'index': 1, 'fullpath': 'axi_dma_0/s2mm_introut'}}, 'parameters': {'C_S_AXI_LITE_ADDR_WIDTH': '10', 'C_S_AXI_LITE_DATA_WIDTH': '32', 'C_DLYTMR_RESOLUTION': '125', 'C_PRMRY_IS_ACLK_ASYNC': '0', 'C_ENABLE_MULTI_CHANNEL': '0', 'C_NUM_MM2S_CHANNELS': '1', 'C_NUM_S2MM_CHANNELS': '1', 'C_INCLUDE_SG': '1', 'C_SG_INCLUDE_STSCNTRL_STRM': '0', 'C_SG_USE_STSAPP_LENGTH': '0', 'C_SG_LENGTH_WIDTH': '8', 'C_M_AXI_SG_ADDR_WIDTH': '32', 'C_M_AXI_SG_DATA_WIDTH': '32', 'C_M_AXIS_MM2S_CNTRL_TDATA_WIDTH': '32', 'C_S_AXIS_S2MM_STS_TDATA_WIDTH': '32', 'C_MICRO_DMA': '0', 'C_INCLUDE_MM2S': '1', 'C_INCLUDE_MM2S_SF': '1', 'C_MM2S_BURST_SIZE': '16', 'C_M_AXI_MM2S_ADDR_WIDTH': '32', 'C_M_A

## Create objects for each element in the overlay dictionary
NOTE: the `processing_system7_0` element cannot be assigned. Uncomment the last line of the cell to see the error.

In [4]:
# Explicit import of the PYNQ DMA driver module.
# NOTE(review): presumably required so that `axi_dma_0` is bound to the DMA
# driver class when it is looked up on the overlay — confirm.
import pynq.lib.dma

# Handle to the AXI DMA block; its send/receive channels are used to move
# the input and output buffers.
dma0 = overlay.axi_dma_0
# Handle to the NN accelerator IP; it is started through a register write.
accel = overlay.myproject_axi_0
# Handle to the interrupt controller that ip_dict names as the controller
# for the DMA interrupts.
intc = overlay.axi_intc_0
# Left disabled on purpose: this lookup fails; uncomment to reproduce the error.
# cpu = overlay.processing_system7_0

## Prepare input/output data
- `y_hls.npy` contains the output generated by the C simulation of the NN (that in principle should be equal to the predictions generated by the IP running on the FPGA).
- `X_test.npy` contains the NN input data.

Note: unaligned DMA transfer has been enabled from Block Design (no need to set the `start` parameter for `allocate()`).

In [5]:
from pynq import allocate

# Number of leading samples kept for this hardware run.
N_SAMPLES = 10

# Reference predictions from the C simulation, cast to float32 and truncated.
y_hls = np.load('./y_hls.npy').astype(np.float32)[:N_SAMPLES]
# NN input samples, cast to float32 and truncated the same way.
X = np.load('./X_test.npy').astype(np.float32)[:N_SAMPLES]

# Buffers for the DMA transfers, shaped like the host-side arrays.
input_buffer = allocate(dtype=np.float32, shape=X.shape)
output_buffer = allocate(dtype=np.float32, shape=y_hls.shape)


## Transfer data 
The cell below should transfer the allocated vector by using DMA and start the accelerator (is `accel.write(0x00,0x81)` the right way to start the accelerator? The `start()` function cannot be used).
After the completion of this cell, the output data should be present in the `output_buffer` array.

In [6]:
# Run one inference batch on the accelerator: fill the input buffer, start
# both DMA channels, start the accelerator, then wait for both channels.
#
# NOTE(review): the recorded output of the check cell contains values such as
# 1.5e-44 where values around 0.03 are expected. That pattern would be
# consistent with the IP streaming raw fixed-point words that are then viewed
# as float32 — confirm the stream data type of the IP.
# NOTE(review): the printed ip_dict lists C_INCLUDE_SG = '1' for axi_dma_0 —
# confirm that the installed PYNQ DMA driver supports scatter-gather mode.

# Copy the input samples into the allocated input buffer.
input_buffer[:] = X
# Start the send-channel transfer of the input buffer and the receive-channel
# transfer into the output buffer.
dma0.sendchannel.transfer(input_buffer)
dma0.recvchannel.transfer(output_buffer)
# Write 0x81 to the accelerator register at offset 0x00.
# NOTE(review): presumably this sets ap_start (bit 0) and auto_restart (bit 7)
# of the HLS block-level control register — confirm against the generated
# driver header of the IP.
accel.write(0x00,0x81)
# accel.start()
# Wait on the send channel first, then on the receive channel.
dma0.sendchannel.wait()
dma0.recvchannel.wait()

## Output check
Check whether the data generated by the NN IP are the same as those obtained via C simulation.

In [7]:
# Print the first five rows produced by the NN IP, followed by the first five
# reference rows from the C simulation.
for rows in (output_buffer, y_hls):
    for idx in range(5):
        print(rows[idx])

# Exact, element-wise comparison of the reference against the hardware output.
arrays_match = np.array_equal(y_hls, output_buffer)
print("Array are equal: {}".format(arrays_match))

[  5.25309576e-38   8.27154254e-40   1.54142831e-44   0.00000000e+00
   0.00000000e+00]
[ 0.  0.  0.  0.  0.]
[ 0.  0.  0.  0.  0.]
[ 0.  0.  0.  0.  0.]
[  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   8.72535725e-39]
[ 0.45214844  0.31054688  0.03027344  0.14648438  0.15625   ]
[ 0.07128906  0.68066406  0.01367188  0.15136719  0.09765625]
[ 0.11425781  0.05664062  0.84570312  0.          0.00585938]
[ 0.01464844  0.015625    0.98925781  0.          0.00390625]
[ 0.01953125  0.015625    0.96289062  0.          0.00292969]
Array are equal: False


In [8]:
# Clean up at the end of the run.
# The buffers returned by `allocate()` hold physically contiguous memory that
# stays reserved for as long as the buffer objects are alive in the kernel.
# Free them explicitly so that re-running the notebook does not exhaust that
# memory, then drop every name that refers to a hardware handle or to a
# now-released buffer.
input_buffer.freebuffer()
output_buffer.freebuffer()
del dma0, accel, input_buffer, output_buffer