In [3]:
# Use a PYNQ overlay to compute a 1-D convolution.  The overlay convolves any filter of length 1 to 255
# with substrate (input) data of any length.  This is beyond what the hw question asked for; I pushed the
# problem scope just so I could show a maximum speed advantage over CPU execution.

import numpy as np
import pynq
import time


# --- Hardware setup ---------------------------------------------------------
# Load the bitstream and grab handles to the two AXI DMA engines it exposes:
# one streams from the processing system (PS) into programmable logic (PL),
# the other streams results from the PL back to the PS.
overlay = pynq.Overlay("conv1D.bit")
dma_toPL = overlay.axi_dma_from_ps_to_pl
dma_fromPL = overlay.axi_dma_from_pl_to_ps

# --- Test data --------------------------------------------------------------
# Random 12-bit values (0 .. 2**12 - 1) stored as uint16.
n1 = 5000000  # substrate length: process 5 million random numbers
data1 = np.random.randint(low=0, high=2**12, size=n1, dtype=np.uint16)  # conv substrate
n2 = 255      # filter length
# The overlay is documented above as handling filters of length 1 to 255 only;
# fail early instead of streaming an unsupported filter into the PL.
if not 1 <= n2 <= 255:
    raise ValueError("filter length n2 must be between 1 and 255, got %i" % n2)
data2 = np.random.randint(low=0, high=2**12, size=n2, dtype=np.uint16)  # conv filter

# --- DMA buffers ------------------------------------------------------------
# PYNQ buffers can be used as numpy arrays, but also carry the physical memory
# addresses the DMA engines need.  A full convolution yields n1 + n2 - 1 samples.
pynq_data1 = pynq.allocate(shape=(n1,), dtype=np.uint16)
pynq_data2 = pynq.allocate(shape=(n2,), dtype=np.uint16)
pynq_res = pynq.allocate(shape=(n1 + n2 - 1,), dtype=np.uint16)
np.copyto(pynq_data1, data1)  # copy numpy-generated data to pynq arrays
np.copyto(pynq_data2, data2)

# The overlay convolution does not flip the filter, whereas np.convolve does.
# Flip the numpy copy (after the un-flipped filter was copied to the PL buffer)
# so both paths compute the same thing.
data2 = np.flip(data2, axis=0)

# --- CPU reference ----------------------------------------------------------
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump if the wall clock is adjusted.
# NOTE(review): np.convolve on uint16 inputs should return uint16, so the CPU
# result wraps modulo 2**16; presumably the overlay truncates to 16 bits the
# same way (the recorded run reports zero difference) - confirm against the HW.
start_time = time.perf_counter()
cpu_res = np.convolve(data1, data2)
cputime = time.perf_counter() - start_time
print("cpu does convolution in %.3f seconds"%cputime)

# --- Programmable-logic run -------------------------------------------------
start_time = time.perf_counter()

# Send filter: the overlay is designed to first receive the relatively short
# filter, then receive the relatively long data.
dma_toPL.sendchannel.transfer(pynq_data2)
dma_toPL.sendchannel.wait()

# Send data and start waiting for results.  Both transfers are started before
# either wait so the send and receive streams can run concurrently.
dma_toPL.sendchannel.transfer(pynq_data1)
dma_fromPL.recvchannel.transfer(pynq_res)
dma_toPL.sendchannel.wait()
dma_fromPL.recvchannel.wait()
pltime = time.perf_counter() - start_time
print("programmable logic does convolution in in %.3f seconds"%pltime)
print("PL is up to %.2f times faster than snickerdoodle cpu execution"%(cputime/pltime))

# --- Verification -----------------------------------------------------------
# Count mismatching elements instead of summing (pynq_res - cpu_res): with
# uint16 operands every difference wraps modulo 2**16 and the sum itself can
# wrap in the accumulator, so a summed "difference" is not a reliable check.
# The count is exactly zero if and only if the two results are identical.
mismatches = np.count_nonzero(pynq_res != cpu_res)
print("Difference between cpu and pl results should be zero: %i" % mismatches)

# print("\n\nvisual inspection of numpy filter:")
# print(data2)
# print("visual inspection of pynq filter:")
# print(pynq_data2)
# print("visual inspection of source data:")
# print(data1[0:25])
# print("visual inspection of numpy result:")
# print(cpu_res[0:15])
# print("visual inspection of pl result:")
# print(pynq_res[0:15])



cpu does convolution in 7.585 seconds
programmable logic does convolution in in 0.035 seconds
PL is up to 213.72 times faster than snickerdoodle cpu execution
Difference between cpu and pl results should be zero: 0
