In [9]:
#use a pynq overlay to compute a 1-D convolution.  The overlay convolves any filter of length 1 to 255
#to substrate data of any length.  This is beyond what the hw question asked for; I pushed the problem
#scope just so I could show a maximum speed advantage over CPU execution. 

import numpy as np
import pynq
import time

# Load the bitstream and grab its two DMA engines. Going by their names, one
# streams PS -> PL (filter and data in) and the other PL -> PS (results out).
overlay = pynq.Overlay("test.bit")
dma_toPL = overlay.axi_dma_from_ps_to_pl
dma_fromPL = overlay.axi_dma_from_pl_to_ps

#generate random data
n1 = 10 #number of substrate samples (tiny while debugging; raise it for a meaningful benchmark)
data1 = np.random.randint(low=0, high=3, size=[n1], dtype=np.uint16) #conv substrate
n2 = 3 #filter length
data2 = np.random.randint(low=0, high=3, size=[n2], dtype=np.uint16) #conv filter

#set up PYNQ data arrays (can be used as numpy arrays, but include physical memory addresses for DMA access)
pynq_data1 = pynq.allocate(shape=(n1,), dtype=np.uint16)
pynq_data2 = pynq.allocate(shape=(n2,), dtype=np.uint16)
pynq_res = pynq.allocate(shape=(n1+n2-1,), dtype=np.uint16) #length of a full convolution
np.copyto(pynq_data1, data1)#copy numpy-generated data to pynq arrays
np.copyto(pynq_data2, data2)
print(pynq_data1)
print(pynq_data2)
data2 = np.flip(data2, axis=0)#overlay convolution does not flip filter - instead, flip numpy filter for cpu convolve

#see how long it takes CPU processing to do operation
#perf_counter is a monotonic, high-resolution clock; time.time() is too coarse for intervals this short
start_time = time.perf_counter()
cpu_res = np.convolve(data1, data2)
cputime = time.perf_counter() - start_time
print("cpu does convolution in %.3f seconds"%cputime)

#execute and time operation in programmable logic
start_time = time.perf_counter()

#send filter: overlay is designed to first receive the relatively short filter, then receive relatively long data
dma_toPL.sendchannel.transfer(pynq_data2)
dma_toPL.sendchannel.wait()

#send data and start waiting for results:
dma_toPL.sendchannel.transfer(pynq_data1)
dma_fromPL.recvchannel.transfer(pynq_res)
dma_toPL.sendchannel.wait()
dma_fromPL.recvchannel.wait()
pltime = time.perf_counter()-start_time
print("programmable logic does convolution in in %.3f seconds"%pltime)
print("PL is up to %.2f times faster than snickerdoodle cpu execution"%(cputime/pltime))

#compare in a wide signed type: subtracting the uint16 arrays directly wraps around on any
#mismatch, and summing absolute differences means errors of opposite sign cannot cancel
res_diff = np.abs(pynq_res.astype(np.int64) - cpu_res.astype(np.int64)).sum()
print("Difference between cpu and pl results should be zero: %i \n" % res_diff)

print("\n\nvisual inspection of numpy filter:")
print(data2)
print("visual inspection of pynq filter:")
print(pynq_data2)
print("visual inspection of source data:")
print(data1[0:25])
print("visual inspection of pl result:")
print(pynq_res)


[0 0 2 0 1 2 1 2 1 0]
[0 1 0]
cpu does convolution in 0.001 seconds


KeyboardInterrupt: 

In [None]:
from scipy import signal
# Reference experiment: 2-D 'same'-mode convolution of a random binary
# n1 x n1 substrate with a random binary n2 x n2 filter, computed by scipy.
n1 = 5
n2 = 3
in1 = np.random.randint(0, 2, size=(n1, n1), dtype=np.uint16)  # conv substrate
in2 = np.random.randint(0, 2, size=(n2, n2), dtype=np.uint16)  # conv filter
result = signal.convolve2d(in1, in2, mode="same")

# Show substrate, filter and result, one after the other.
print(in1, in2, result, sep="\n")

In [11]:
# Small fixed 5x5 example used to check the transpose/flatten ordering by eye.
data1 = np.array(
    [
        [1, 0, 0, 1, 0],
        [0, 1, 1, 0, 1],
        [0, 0, 1, 1, 0],
        [0, 1, 1, 1, 1],
        [0, 1, 0, 1, 0],
    ],
    dtype=np.int16,
)
data1_transposed = data1.T
print('Original \n', data1)
print('Transpose \n', data1_transposed)
print('Transpose and flatten \n', data1_transposed.flatten())
# Rows 1..3, read out column by column (cell output).
data1[1:4, :].T.flatten()

Original 
 [[1 0 0 1 0]
 [0 1 1 0 1]
 [0 0 1 1 0]
 [0 1 1 1 1]
 [0 1 0 1 0]]
Transpose 
 [[1 0 0 0 0]
 [0 1 0 1 1]
 [0 1 1 1 0]
 [1 0 1 1 1]
 [0 1 0 1 0]]
Transpose and flatten 
 [1 0 0 0 0 0 1 0 1 1 0 1 1 1 0 1 0 1 1 1 0 1 0 1 0]


array([0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1], dtype=int16)

In [8]:
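#use a pynq overlay to compute a 1-D convolution.  The overlay convolves any filter of length 1 to 255
#to substrate data of any length.  This is beyond what the hw question asked for; I pushed the problem
#scope just so I could show a maximum speed advantage over CPU execution.
#In this cell the overlay is fed a flattened 3x3 filter and a flattened 3-row window of a zero-padded
#2-D array, and its output is checked against scipy's 2-D convolution.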

import numpy as np
import pynq
import time

from scipy import signal

def filter_reshape(arr):
    """Return a flattened (1-D) copy of the filter array `arr`.

    Elements are laid out in the default order of `flatten()`, i.e. row by row.
    """
    flat_filter = arr.flatten()
    return flat_filter

def data_preprocessing(data):
    """Return a uint16 copy of 2-D `data` surrounded by a one-element border of zeros.

    The result has one extra row/column on every side, so its shape is
    `data.shape + (2, 2)`; values are cast to uint16 on assignment.
    """
    # TODO: change for 3D array
    padded_shape = tuple(np.add(data.shape, (2, 2)))
    canvas = np.zeros(padded_shape, dtype=np.uint16)
    # Drop the original values into the interior; the border stays zero.
    canvas[1:-1, 1:-1] = data
    return canvas

def data_reshape(data, start_row, n_rows=3):
    """Return a horizontal band of `data`, flattened column by column.

    Parameters
    ----------
    data : 2-D array
        Array to take the band from.
    start_row : int
        Index of the first row of the band.
    n_rows : int, optional
        Height of the band; defaults to 3.

    Returns
    -------
    1-D array containing rows `start_row .. start_row + n_rows - 1` of `data`,
    ordered so that all rows of column 0 come first, then column 1, and so on.
    """
    # Slice the argument itself: the previous body read the global `data1`
    # and silently ignored whatever array the caller passed in.
    band = data[start_row:start_row + n_rows, :]
    return band.T.flatten()

# Load the bitstream and grab its two DMA engines. Going by their names, one
# streams PS -> PL (filter and data in) and the other PL -> PS (results out).
overlay = pynq.Overlay("test.bit")
dma_toPL = overlay.axi_dma_from_ps_to_pl
dma_fromPL = overlay.axi_dma_from_pl_to_ps

n2 = 3 #filter is n2 x n2

data2_orig = np.array([[0,0,1], [1,1,1],[1,0,0]]).astype(np.uint16) #conv filter
data1_orig = np.array([[1,0,0,1,0], [0,1,1,0,1],[0,0,1,1,0],[0,1,1,1,1],[0,1,0,1,0]]).astype(np.int16) #conv substrate

# preprocessing and reshape
start_row = 0 #first padded row of the 3-row window; also the output row that window produces
data2 = filter_reshape(data2_orig)
data1 = data_preprocessing(data1_orig)
data1 = data_reshape(data1, start_row)
# NOTE(review): data_reshape emits the window column by column (.T.flatten()) while
# filter_reshape emits the filter row by row - confirm this is the ordering the overlay expects.

data_cols = data1_orig.shape[1] + 2 # 2 for padding
print(data_cols-2)

#set up PYNQ data arrays (can be used as numpy arrays, but include physical memory addresses for DMA access)
pynq_data1 = pynq.allocate(shape=(data_cols*3,), dtype=np.uint16) #3 padded rows
pynq_data2 = pynq.allocate(shape=(n2*n2,), dtype=np.uint16)
pynq_res = pynq.allocate(shape=(data_cols-2,), dtype=np.uint16) #one output row
np.copyto(pynq_data1, data1)#copy numpy-generated data to pynq arrays
np.copyto(pynq_data2, data2)
print(pynq_data1)
print(pynq_data2)
data2 = np.flip(data2, axis=0)#overlay convolution does not flip filter - instead, flip numpy filter for cpu convolve

#see how long it takes CPU processing to do operation
#perf_counter is a monotonic, high-resolution clock; time.time() is too coarse for intervals this short
start_time = time.perf_counter()
#use the flipped filter (reshaped back to 2-D) so the CPU reference matches the non-flipping overlay;
#reversing the flattened filter is the same as flipping it along both axes
cpu_res = signal.convolve2d(data1_orig, data2.reshape(data2_orig.shape), mode='same')
#the overlay is given a single 3-row window, so only the matching output row is comparable
cpu_res = cpu_res[start_row]
cputime = time.perf_counter() - start_time
print("cpu does convolution in %.3f seconds"%cputime)

#execute and time operation in programmable logic
start_time = time.perf_counter()

#send filter: overlay is designed to first receive the relatively short filter, then receive relatively long data
dma_toPL.sendchannel.transfer(pynq_data2)
dma_toPL.sendchannel.wait()

#send data and start waiting for results:
dma_toPL.sendchannel.transfer(pynq_data1)
dma_fromPL.recvchannel.transfer(pynq_res)
dma_toPL.sendchannel.wait()
dma_fromPL.recvchannel.wait()
pltime = time.perf_counter()-start_time
print("programmable logic does convolution in in %.3f seconds"%pltime)
print("PL is up to %.2f times faster than snickerdoodle cpu execution"%(cputime/pltime))

#compare in a wide signed type and sum absolute differences, so unsigned wrap-around
#or errors of opposite sign cannot hide a mismatch
res_diff = np.abs(pynq_res.astype(np.int64) - cpu_res.astype(np.int64)).sum()
print("Difference between cpu and pl results should be zero: %i \n" % res_diff)

print("\n\nvisual inspection of numpy filter:")
print(data2)
print("visual inspection of pynq filter:")
print(pynq_data2)
print("visual inspection of source data:")
print(data1[0:25])
print("visual inspection of pl result:")
print(pynq_res)


5
[0 0 0 0 1 0 0 0 1 0 0 1 0 1 0 0 0 1 0 0 0]
[0 0 1 1 1 1 1 0 0]
cpu does convolution in 0.002 seconds


RuntimeError: DMA channel not started