In [1]:
import pycuda.autoinit
import pycuda.driver as cuda
import pycuda.gpuarray as gpuarray
import skcuda.fft as cu_fft

import numpy as np
from scipy.signal import fftconvolve

from pulse2percept import electrode2currentmap as e2cm

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
def cufftconvolve_naive(in1, in2):
    """Full linear convolution of two 1-D real signals using cuFFT.

    "Naive" because every call pads the inputs on the host, allocates all
    device buffers and builds both FFT plans from scratch.

    Parameters
    ----------
    in1, in2 : array_like, 1-D
        Real-valued input signals. They are converted to single precision
        before being moved to the GPU.

    Returns
    -------
    numpy.ndarray
        float32 array of length ``len(in1) + len(in2) - 1``. If either input
        is empty, an empty float32 array is returned (same convention as
        ``scipy.signal.fftconvolve``) and the GPU is never touched.

    Raises
    ------
    ValueError
        If either input is not one-dimensional.
    """
    in1 = np.asarray(in1)
    in2 = np.asarray(in2)

    # The padding and the 1-D FFT plans below only make sense for vectors;
    # fail early with a readable message instead of an obscure NumPy error.
    if in1.ndim != 1 or in2.ndim != 1:
        raise ValueError(
            "cufftconvolve_naive only supports 1-D inputs, got shapes "
            "{} and {}".format(in1.shape, in2.shape))

    # Nothing to convolve: without this guard the padding length below
    # would become negative for an empty input.
    if in1.size == 0 or in2.size == 0:
        return np.array([], dtype=np.float32)

    # Length of the full linear convolution
    n_out = in1.shape[-1] + in2.shape[-1] - 1

    # Zero-pad both signals directly in single precision (one host copy
    # instead of a float64 concatenate followed by a float32 cast)
    x1 = np.zeros(n_out, dtype=np.float32)
    x1[:in1.shape[-1]] = in1
    x2 = np.zeros(n_out, dtype=np.float32)
    x2[:in2.shape[-1]] = in2

    # Move both input signals to GPU memory
    x1_gpu = gpuarray.to_gpu(x1)
    x2_gpu = gpuarray.to_gpu(x2)

    # Allocate N//2+1 non-redundant FFT coefficients
    n_coef = n_out // 2 + 1
    f1_gpu = gpuarray.empty(n_coef, np.complex64)
    f2_gpu = gpuarray.empty(n_coef, np.complex64)

    # Calculate N//2+1 non-redundant FFT coefficients
    plan_fft = cu_fft.Plan(x1_gpu.shape, np.float32, np.complex64)
    cu_fft.fft(x1_gpu, f1_gpu, plan_fft)
    cu_fft.fft(x2_gpu, f2_gpu, plan_fft)

    # cu_fft.ifft is called below without its optional scaling, so the 1/N
    # factor of the inverse transform is applied to one spectrum here.
    f1_gpu /= n_out

    # Allocate output signal
    y_gpu = gpuarray.empty(n_out, np.float32)

    # Inverse transform of the spectrum product gives the convolution
    plan_ifft = cu_fft.Plan(n_out, np.complex64, np.float32)
    cu_fft.ifft(f1_gpu * f2_gpu, y_gpu, plan_ifft)

    # get() transfers the result from the device back to the host
    return y_gpu.get()

In [3]:
class CuFFTConvolve:
    """Repeated full linear convolution of two fixed-length 1-D signals on the GPU.

    All device buffers and both cuFFT plans are created once in the
    constructor, so that each call to `cufftconvolve` only has to copy the
    two inputs to the device, run the transforms and copy the result back.

    Parameters
    ----------
    in1shape, in2shape : int
        Number of samples of the first and second signal. Both must be at
        least 1.

    Raises
    ------
    ValueError
        If either length is smaller than 1.
    """

    def __init__(self, in1shape, in2shape):
        # A zero-length signal would make the padded buffers shorter than the
        # other input, and the host-to-device copy would overrun them.
        if in1shape < 1 or in2shape < 1:
            raise ValueError(
                "in1shape and in2shape must both be >= 1, got "
                "{} and {}".format(in1shape, in2shape))

        self.in1shape = in1shape
        self.in2shape = in2shape
        # Length of the full linear convolution
        self.out_shape = in1shape + in2shape - 1

        # Pre-allocate the zero-padding for the time-series of in1, in2
        self.x1_gpu = gpuarray.zeros(self.out_shape, np.float32)
        self.x2_gpu = gpuarray.zeros(self.out_shape, np.float32)

        # Pre-allocate N//2+1 non-redundant FFT coefficients
        self.f1_gpu = gpuarray.empty(self.out_shape // 2 + 1, np.complex64)
        self.f2_gpu = gpuarray.empty(self.out_shape // 2 + 1, np.complex64)

        # Set up a plan for FFT and iFFT (takes time)
        self.plan_fft = cu_fft.Plan(self.out_shape, np.float32, np.complex64)
        self.plan_ifft = cu_fft.Plan(self.out_shape, np.complex64, np.float32)

        # Pre-allocate zero-padded output array
        self.y_gpu = gpuarray.empty(self.out_shape, np.float32)

    def cufftconvolve(self, in1, in2):
        """Convolve `in1` with `in2` using the pre-allocated buffers and plans.

        Parameters
        ----------
        in1, in2 : array_like, 1-D
            Real-valued signals whose lengths equal the `in1shape` and
            `in2shape` given to the constructor.

        Returns
        -------
        numpy.ndarray
            float32 host array of length ``in1shape + in2shape - 1``.

        Raises
        ------
        AssertionError
            If an input is not 1-D or does not have the expected length.
        """
        in1 = np.asarray(in1)
        in2 = np.asarray(in2)

        # Compare the whole shape, not just the last axis: the copies below
        # transfer every element of the input, so anything other than a
        # vector of exactly the expected length would overrun the device
        # buffer. AssertionError is raised explicitly (instead of `assert`)
        # so the guard keeps its exception type but survives `python -O`.
        if in1.shape != (self.in1shape,):
            raise AssertionError(
                "in1 must have shape ({},), got {}".format(
                    self.in1shape, in1.shape))
        if in2.shape != (self.in2shape,):
            raise AssertionError(
                "in2 must have shape ({},), got {}".format(
                    self.in2shape, in2.shape))

        # Trick to minimize copying:
        # Instead of zero-padding in1, in2 and then copying the two large
        # arrays from host to device, merely copy in1, in2 into the front of
        # a much longer array of zeros that was pre-allocated.
        # Since in1shape, in2shape do not change, the tail of each buffer is
        # never written and stays zero across calls.
        # ascontiguousarray only copies when a dtype/layout change is needed.
        cuda.memcpy_htod(self.x1_gpu.gpudata,
                         np.ascontiguousarray(in1, dtype=np.float32))
        cuda.memcpy_htod(self.x2_gpu.gpudata,
                         np.ascontiguousarray(in2, dtype=np.float32))

        # Calculate the N//2+1 non-redundant FFT coefficients
        cu_fft.fft(self.x1_gpu, self.f1_gpu, self.plan_fft)
        cu_fft.fft(self.x2_gpu, self.f2_gpu, self.plan_fft)

        # cu_fft.ifft is called without its optional scaling, so apply the
        # 1/N factor of the inverse transform to one spectrum here.
        self.f1_gpu /= self.out_shape

        # Multiply the spectra in place: f1_gpu is recomputed on every call,
        # so it can double as scratch space and no device temporary has to
        # be allocated for the product.
        self.f1_gpu *= self.f2_gpu
        cu_fft.ifft(self.f1_gpu, self.y_gpu, self.plan_ifft)

        # get() transfers the array from the device back to the host
        return self.y_gpu.get()

In [4]:
# Let's use some signals with lengths that are relevant to pulse2percept:
dur_pt = 0.5                   # typical pulse train duration
tsample = 5e-6                 # typical sampling step
dur_gamma3 = 8 * 26.25 / 1000  # typical gamma3 duration

# Seed the global NumPy RNG so that re-running the notebook from a fresh
# kernel produces the same test signals.
np.random.seed(42)

# Round before converting to int: the quotient of two floats can land just
# below the intended whole number, and plain int() would then drop a sample.
x1 = np.random.rand(int(round(dur_pt / tsample)))
x2 = np.random.rand(int(round(dur_gamma3 / tsample)))

In [5]:
# CPU reference from SciPy vs. the one-shot GPU implementation. Defaults are
# spelled out so the comparison criteria are visible; the last expression is
# what the cell displays.
y_cpu = fftconvolve(x1, x2, mode="full")
y_gpu_naive = cufftconvolve_naive(in1=x1, in2=x2)
np.allclose(y_cpu, y_gpu_naive, rtol=1e-5, atol=1e-2)

True

In [6]:
# Build the plans and device buffers once for these two signal lengths, then
# compare the pre-planned GPU result against the CPU reference `y_cpu`.
cu = CuFFTConvolve(in1shape=x1.shape[-1], in2shape=x2.shape[-1])
y_gpu = cu.cufftconvolve(in1=x1, in2=x2)
np.allclose(y_cpu, y_gpu, rtol=1e-5, atol=1e-2)

True

In [7]:
# Baseline timing: SciPy's fftconvolve on the host
%timeit fftconvolve(x1, x2)

100 loops, best of 3: 7.65 ms per loop


In [8]:
# One-shot GPU version: pads on the host, allocates device buffers and
# builds both FFT plans on every call
%timeit cufftconvolve_naive(x1, x2)

10 loops, best of 3: 22.6 ms per loop


In [9]:
# Pre-planned GPU version: buffers and plans were created once when `cu`
# was constructed, so only the copies and transforms are timed
%timeit cu.cufftconvolve(x1, x2)

1000 loops, best of 3: 983 µs per loop
