In [None]:
%matplotlib notebook
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
import numpy as np
from scipy.misc import ascent
from scipy.fftpack import fftn, ifftn, fftshift, fftfreq
from scipy.ndimage import fourier_shift
from skimage.registration import phase_cross_correlation as register_translation
import pycuda.gpuarray as cua
from pynx.processing_unit.cu_processing_unit import CUProcessingUnit
from pynx.utils.registration import register_translation_cuda, register_translation_cuda_n


In [None]:
# Compare the scikit-image registration (imported as register_translation)
# with the PyNX CUDA one on a test image shifted by a known sub-pixel amount.
pixel_shift = (5.3, 21.2)  # shift applied along (axis 0, axis 1), in pixels
upsample_factor = 10

# Reference image, and a copy translated in Fourier space by pixel_shift
d1 = ascent()[:512, :512]
d2 = abs(ifftn(fourier_shift(fftn(d1), pixel_shift)))

# Show both images side by side, labelled so the figure stands on its own
fig, (ax_ref, ax_shifted) = plt.subplots(1, 2, figsize=(12, 3))
ax_ref.imshow(d1)
ax_ref.set_title("Reference (d1)")
ax_shifted.imshow(d2)
ax_shifted.set_title("Shifted by %s (d2)" % (pixel_shift,))

# Both calls share upsample_factor so that their results stay directly comparable
# when the factor is changed.
print(register_translation(d1, d2, upsample_factor=upsample_factor))
print(register_translation_cuda(d1, d2, upsampling=upsample_factor))


In [None]:
n = 512
upsample_factor=10

d1 = np.random.uniform(0,1, (n,n))
d2 = abs(ifftn(fourier_shift(fftn(d1), pixel_shift)))
# Timing, using on-GPU images
pu = CUProcessingUnit()
pu.init_cuda(test_fft=False, verbose=False)
d1cu = cua.to_gpu(d1.astype(np.complex64))
d2cu = cua.to_gpu(d2.astype(np.complex64))

%timeit -n 3 -r 3 register_translation(d1,d2, upsample_factor=1)
%timeit -n 3 -r 3 register_translation_cuda(d1cu, d2cu, upsampling=1, processing_unit=pu, overwrite=False)

%timeit -n 3 -r 3 register_translation(d1,d2, upsample_factor=upsample_factor)
%timeit -n 3 -r 3 register_translation_cuda(d1cu, d2cu, upsampling=upsample_factor, processing_unit=pu, overwrite=False)
print()
%timeit -n 3 -r 3 register_translation_cuda(d1cu, d2cu, upsampling=upsample_factor, blocksize=8, processing_unit=pu, overwrite=False)
%timeit -n 3 -r 3 register_translation_cuda(d1cu, d2cu, upsampling=upsample_factor, blocksize=16, processing_unit=pu, overwrite=False)
%timeit -n 3 -r 3 register_translation_cuda(d1cu, d2cu, upsampling=upsample_factor, blocksize=32, processing_unit=pu, overwrite=False)
%timeit -n 3 -r 3 register_translation_cuda(d1cu, d2cu, upsampling=upsample_factor, blocksize=64, processing_unit=pu, overwrite=False)


In [None]:
# Aligning a stack of images
n = 512
nz = 100  # Ideally >= number of SM on GPU
upsample_factor=10
dy, dx = [], []

d1 = np.random.uniform(0,1, (n,n))
d2 = np.empty((nz, n,n), dtype=np.float32)
for i in range(nz):
    dy.append(np.random.uniform(-30,30))
    dx.append(np.random.uniform(-30,30))
    d2[i] = abs(ifftn(fourier_shift(fftn(d1), [dy[-1], dx[-1]])))


pu = CUProcessingUnit()
pu.init_cuda(test_fft=False, verbose=False)
d1cu = cua.to_gpu(d1.astype(np.complex64))
d2cu = cua.to_gpu(d2.astype(np.complex64))

tmp = register_translation_cuda(d1cu, d2cu[0], upsampling=upsample_factor, processing_unit=pu,
                                overwrite=False)
tmp = register_translation_cuda(d1cu, d2cu[nz-1], upsampling=upsample_factor, processing_unit=pu, overwrite=False)
vy0, vx0 = register_translation_cuda_n(d1cu, d2cu, upsampling=1, processing_unit=pu, overwrite=False)
vy, vx = register_translation_cuda_n(d1cu, d2cu, upsampling=upsample_factor, processing_unit=pu, overwrite=False)

for i in range(nz):
    print("%6.2f  %6.2f  %6.2f   %6.2f %6.2f  %6.2f" % (-dy[i], vy0[i], vy[i], -dx[i], vx0[i], vx[i]))

s = 10
s2 = 20
t0 = %timeit -n 3 -r 3 -o register_translation(d1,d2[0], upsample_factor=1)
t0s = %timeit -n 3 -r 3 -o register_translation(d1,d2[0], upsample_factor=s)
t1 = %timeit -n 3 -r 3 -o vy, vx = register_translation_cuda(d1cu, d2cu[0], upsampling=1, processing_unit=pu, overwrite=False)
t1s = %timeit -n 3 -r 3 -o vy, vx = register_translation_cuda(d1cu, d2cu[0], upsampling=s, processing_unit=pu, overwrite=False)
t1n = %timeit -n 3 -r 3 -o vy, vx = register_translation_cuda_n(d1cu, d2cu, upsampling=1, processing_unit=pu, overwrite=False)
t1ns = %timeit -n 3 -r 3 -o vy, vx = register_translation_cuda_n(d1cu, d2cu, upsampling=s, processing_unit=pu, overwrite=False)

t0s2 = %timeit -n 3 -r 3 -o register_translation(d1,d2[0], upsample_factor=s2)
t1s2 = %timeit -n 3 -r 3 -o vy, vx = register_translation_cuda(d1cu, d2cu[0], upsampling=s2, processing_unit=pu, overwrite=False)
t1ns2 = %timeit -n 3 -r 3 -o vy, vx = register_translation_cuda_n(d1cu, d2cu, upsampling=s2, processing_unit=pu, overwrite=False)

print("\nRegistration time per image (cuda corresponds to on-GPU data, no transfer)")
print("                        skimage     cuda       cuda")
print("dt [upsampling=  1]    %6.2fms   %6.2fms  %6.2fms"%(t0.average*1000, t1.average*1000, t1n.average*1000/nz))
print("dt [upsampling=%3d]    %6.2fms   %6.2fms  %6.2fms"%(s, t0s.average*1000, t1s.average*1000, t1ns.average*1000/nz))
print("dt [upsampling=%3d]    %6.2fms   %6.2fms  %6.2fms"%(s2, t0s2.average*1000, t1s2.average*1000, t1ns2.average*1000/nz))


In [None]:
# Effect of blocksize for a stack of images
nz2 = 30
%timeit -n 3 -r 3 vy, vx = register_translation_cuda_n(d1cu, d2cu[:nz2], upsampling=10,blocksize=16, processing_unit=pu, overwrite=False)
%timeit -n 3 -r 3 vy, vx = register_translation_cuda_n(d1cu, d2cu[:nz2], upsampling=10,blocksize=32, processing_unit=pu, overwrite=False)
%timeit -n 3 -r 3 vy, vx = register_translation_cuda_n(d1cu, d2cu[:nz2], upsampling=10,blocksize=64, processing_unit=pu, overwrite=False)
%timeit -n 3 -r 3 vy, vx = register_translation_cuda_n(d1cu, d2cu[:nz2], upsampling=10,blocksize=100, processing_unit=pu, overwrite=False)
%timeit -n 3 -r 3 vy, vx = register_translation_cuda_n(d1cu, d2cu[:nz2], upsampling=10,blocksize=200, processing_unit=pu, overwrite=False)
