In [None]:
####################################################################################
#
# Ptychography: compare OpenCL and CUDA (and CPU) operators results
#
# NOTE: this is mostly for debugging and testing. Mixing the use of CUDA and OpenCL
#       operators is NOT RECOMMENDED, as behaviour heavily relies on underlying
#       code, which can evolve quickly.
#
# (c) ESRF 2019-present
# Authors: Vincent Favre-Nicolin <favre@esrf.fr>
#
####################################################################################
import os

%matplotlib ipympl
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from skimage.feature import register_translation
from pynx.ptycho import simulation, shape
import numpy as np
import warnings
warnings.simplefilter('ignore')

# Import Ptycho, PtychoData and operators (automatically imports OpenCL or CUDA operators)
from pynx.ptycho import *
# Load explicitely both OpenCL and CUDA operators
import pynx.ptycho.cl_operator as clop
import pynx.ptycho.cu_operator as cuop
import pynx.ptycho.cpu_operator as cpuop

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
##################
# Build a simulated ptychography dataset
n = 100
wavelength = 1.5e-10
detector_distance = 1
pixel_size_detector = 55e-6

# Geometry entries shared by the simulated dataset and by the starting model below
geometry_info = {'wavelength': wavelength,
                 'detector_distance': detector_distance,
                 'detector_pixel_size': pixel_size_detector}

obj_info = {
    'type': 'phase_ampl',
    'phase_stretch': 1.57,
    'alpha_win': .2,
}
# Alternative (focused) probe:
# probe_info = {'type': 'focus', 'aperture': (30e-6, 30e-6), 'focal_length': .08, 'defocus': 100e-6, 'shape': (n, n)}
probe_info = {
    'type': 'gauss',
    'sigma_pix': (40, 40),
    'defocus': 100e-6,
    'shape': (n, n),
}

# Spiral scan: 50 positions correspond to 4 turns, 78 to 5 turns, 113 to 6 turns
scan_info = {'type': 'spiral', 'scan_step_pix': 30, 'n_scans': 32}
data_info = {'num_phot_max': 1e9, 'bg': 0, **geometry_info, 'noise': 'poisson'}

# The simulation is configured from the *_info dictionaries. A specific object, probe or scan
# can be given instead through the obj=, probe= and scan= arguments of Simulation, leaving out
# the matching *_info dictionary (or passing it as an empty dictionary "{}").
simul = simulation.Simulation(obj_info=obj_info, probe_info=probe_info,
                              scan_info=scan_info, data_info=data_info)

# Generate the data. simul.probe.show(), simul.obj.show(), simul.scan.show() and
# simul.show_illumination_sum() can then be used to inspect the result
# (the last one visualises the integrated total coverage of the beam).
simul.make_data()

# Square root of the measured diffraction pattern intensity
ampl = simul.amplitude.values
posx, posy = simul.scan.values

pixel_size_object = wavelength * detector_distance / pixel_size_detector / n

##################
# Shape of the object to reconstruct, from the scan positions and the frame shape
nyo, nxo = shape.calc_obj_shape(posx, posy, ampl.shape[1:])

# Starting object: random values (a flat start would be {'type':'flat','shape':(nx,ny)})
obj_init_info = {
    'type': 'random',
    'range': (0, 1, 0, 0.5),
    'shape': (nyo, nxo),
}
# Starting probe
probe_init_info = {
    'type': 'focus',
    'aperture': (20e-6, 20e-6),
    'focal_length': .08,
    'defocus': 50e-6,
    'shape': (n, n),
}
# Only the geometry is needed to describe the starting model
data_info = dict(geometry_info)
init = simulation.Simulation(obj_info=obj_init_info, probe_info=probe_init_info, data_info=data_info)

init.make_obj()
init.make_probe()

In [None]:
# Create the PtychoData object and make sure we only use one stack for simplicity.
# The geometry reuses the variables defined with the simulation rather than repeating the
# literal values, so the scan positions (scaled with pixel_size_object, itself derived from
# these variables) and the dataset can never get out of sync.
data = PtychoData(iobs=ampl ** 2, positions=(posx * pixel_size_object, posy * pixel_size_object),
                  detector_distance=detector_distance, mask=None,
                  pixel_size_detector=pixel_size_detector, wavelength=wavelength)
# Stack size = number of frames, for both GPU backends
clop.default_processing_unit.set_stack_size(len(ampl))
cuop.default_processing_unit.set_stack_size(len(ampl))

# Create the Ptycho object and run a few CUDA DM and ML cycles
p = Ptycho(probe=simul.probe.values, obj=init.obj.values, data=data, background=None) # Random object start
# Alternative: start from the simulated object
#p = Ptycho(probe=simul.probe.values, obj=simul.obj.values, data=data, background=None)
# Initial scaling of object and probe (applied with both the CUDA and the OpenCL operator)
p = cuop.ScaleObjProbe(verbose=True) * p
p = clop.ScaleObjProbe(verbose=True) * p

p = cuop.DM(update_object=True, update_probe=True, calc_llk=20)**100 * p
#p = clop.DM(update_object=True, update_probe=True, calc_llk=10)**200 * p
p = cuop.ML(update_object=True, update_probe=True, calc_llk=20)**40 * p

# Keep copies of object and probe: the comparison cells below restore them with
# p.set_obj(obj) / p.set_probe(probe) before running each backend
obj = p.get_obj().copy()
probe = p.get_probe().copy()

## Basic Fourier scaling test
The FFT scaling differs between the CUDA, OpenCL and CPU (numpy) implementations.

Scale factors must be adjusted so that the object and probe are correctly scaled with respect to the observed intensity.

In [None]:
def _l2_norm(psi):
    """Return the sum of |psi|**2 over all elements of psi."""
    return (np.abs(psi) ** 2).sum()


# Start from the stored probe and object
p.set_probe(probe)
p.set_obj(obj)
iobs_total = p.data.iobs.sum()

# For each backend: v0 is the norm of Psi = Obj * Probe, v1 the norm after the forward
# transform and v2 the norm after transforming back.

# OpenCL
p = clop.ObjProbe2Psi() * p
v0 = _l2_norm(p._cl_psi.get())
p = clop.FT(scale=False) * p
v1 = _l2_norm(p._cl_psi.get())
p = clop.IFT(scale=False) * p
v2 = _l2_norm(p._cl_psi.get())
print("OpenCL FT scaling (L2 norm): FT  *%12.4f  iFT  /%12.4f   ObjProbe**2/Iobs=%12.4f"
      % (v1 / v0, v1 / v2, v0 / iobs_total))

# CUDA
p = cuop.ObjProbe2Psi() * p
v0 = _l2_norm(p._cu_psi.get())
p = cuop.FT(scale=False) * p
v1 = _l2_norm(p._cu_psi.get())
p = cuop.IFT(scale=False) * p
v2 = _l2_norm(p._cu_psi.get())
print("CUDA   FT scaling (L2 norm): FT  *%12.4f  iFT  *%12.4f   ObjProbe**2/Iobs=%12.4f"
      % (v1 / v0, v2 / v1, v0 / iobs_total))

# CPU (the only backend used with scale=True here)
p = cpuop.ObjProbe2Psi() * p
v0 = _l2_norm(p._psi)
p = cpuop.FT(scale=True) * p
v1 = _l2_norm(p._psi)
p = cpuop.IFT(scale=True) * p
v2 = _l2_norm(p._psi)
print("CPU    FT scaling (L2 norm): FT  *%12.4f  iFT  /%12.4f  [scale=True] ObjProbe**2/Iobs=%12.4f"
      % (v1 / v0, v1 / v2, v0 / iobs_total))


## ObjProbe2Psi

In [None]:
fig = plt.figure(figsize=(12, 3))
#probe.fill(1)
#obj.fill(1)
p.set_probe(probe)
p.set_obj(obj)

# OpenCL: phase of the first Psi frame
p = clop.ObjProbe2Psi() * p
ax = plt.subplot(141)
im = ax.imshow(fftshift(np.angle(p._cl_psi.get()[0, 0, 0])), cmap='gray')
ax.set_title('OpenCL Psi')
plt.colorbar(im, ax=ax)

# CUDA
p = cuop.ObjProbe2Psi() * p
ax = plt.subplot(142)
im = ax.imshow(fftshift(np.angle(p._cu_psi.get()[0, 0, 0])), cmap='gray')
ax.set_title('CUDA Psi')
plt.colorbar(im, ax=ax)

# CPU
p = cpuop.ObjProbe2Psi() * p
ax = plt.subplot(143)
im = ax.imshow(fftshift(np.angle(p._psi[0, 0, 0])), cmap='gray')
#im = ax.imshow(fftshift(np.angle(p._psi[0,0,0])-np.angle(p._cu_psi.get()[0,0,0])), cmap='gray')
ax.set_title('CPU Psi')
plt.colorbar(im, ax=ax)

# Fetch the three results once for the numerical comparisons
psi_cl = p._cl_psi.get()
psi_cu = p._cu_psi.get()
psi_cpu = p._psi

# Check shift between ObjProbe
print(register_translation(psi_cl[0, 0, 0], psi_cu[0, 0, 0]))
print(register_translation(psi_cl[0, 0, 0], psi_cpu[0, 0, 0]))

# Absolute tolerance relative to the largest OpenCL amplitude
atol = np.abs(psi_cl).max() * 1e-4
print(np.allclose(psi_cl, psi_cu, atol=atol, rtol=1e-3))
print(np.allclose(psi_cpu, psi_cu, atol=atol, rtol=1e-4))


# With a real, positive probe and object, show the phase of the CPU temporary
# quadratic-phase array
p.set_probe(abs(probe))
p.set_obj(abs(obj))
p = cpuop.ObjProbe2Psi() * p
ax = plt.subplot(144)
im = ax.imshow(np.angle(p._cpu_tmp_quad_phase[0]), cmap='hsv')
ax.set_title('CPU Psi quadratic phase')
plt.colorbar(im, ax=ax)

plt.tight_layout()


## FT * ObjProbe2Psi, and scale comparison with Iobs

In [None]:
fig = plt.figure(figsize=(9, 3))

# Scale factors: first value returned by each backend's fft_scale() for 2D transforms
scu = cuop.default_processing_unit.fft_scale(p._psi.shape, ndim=2)[0]
scl = clop.default_processing_unit.fft_scale(p._psi.shape, ndim=2)[0]
iobs_total = p.data.iobs.sum()

p.set_probe(probe)
p.set_obj(obj)

# OpenCL
p = clop.FT(scale=False) * clop.ObjProbe2Psi() * p
psi_cl = p._cl_psi.get()
ax = plt.subplot(131)
im = ax.imshow(fftshift(abs(psi_cl[0, 0, 0])) * scl, cmap='gray', norm=colors.LogNorm())
ax.set_title('OpenCL Psi')
plt.colorbar(im, ax=ax)
print('OpenCL: <Icalc>/<Iobs>= %12.5f' % ((abs(psi_cl) ** 2).sum() / iobs_total))

# CUDA
p = cuop.FT(scale=False) * cuop.ObjProbe2Psi() * p
psi_cu = p._cu_psi.get()
ax = plt.subplot(132)
im = ax.imshow(fftshift(abs(psi_cu[0, 0, 0]) * scu), cmap='gray', norm=colors.LogNorm())
ax.set_title('CUDA Psi')
plt.colorbar(im, ax=ax)
print('CUDA: <Icalc>/<Iobs>= %12.5f' % ((abs(psi_cu) ** 2).sum() / iobs_total))

# CPU
p = cpuop.FT(scale=True) * cpuop.ObjProbe2Psi() * p
ax = plt.subplot(133)
im = ax.imshow(fftshift(abs(p._psi[0, 0, 0])), cmap='gray', norm=colors.LogNorm())
ax.set_title('CPU Psi')
plt.colorbar(im, ax=ax)
print('CPU: <Icalc>/<Iobs>= %12.5f' % ((abs(p._psi) ** 2).sum() / iobs_total))

plt.tight_layout()

# Numerical comparison; the scaled CPU result is multiplied by n to compare it
# with the unscaled CUDA transform
psi_cl = p._cl_psi.get()
psi_cu = p._cu_psi.get()
atol = np.abs(psi_cu).max() * 1e-4
print(np.allclose(psi_cl, psi_cu, atol=atol, rtol=1e-4))
print(np.allclose(p._psi * n, psi_cu, atol=atol, rtol=1e-4))


## Amplitude projection (manual)

In [None]:
plt.figure(figsize=(9,3))

# FFT scales returned by each backend, indexed below as [0] (scale_in) and [1] (scale_out)
scu = cuop.default_processing_unit.fft_scale(p._psi.shape, ndim=2)
scl = clop.default_processing_unit.fft_scale(p._psi.shape, ndim=2)
# Print each backend's scales next to that backend's own VkFFT flag. The OpenCL line used to
# print the CUDA unit's flag; getattr keeps the cell working if the OpenCL processing unit
# does not expose `use_vkfft`.
print(scu, cuop.default_processing_unit.use_vkfft)
print(scl, getattr(clop.default_processing_unit, 'use_vkfft', None))

p.set_probe(probe)
p.set_obj(obj)
# OpenCL: Obj*Probe -> FT -> apply observed amplitude -> inverse FT, with explicit scale factors
p = clop.IFT(scale=False) * clop.ApplyAmplitude(scale_in=scl[0], scale_out=scl[1]) * clop.FT(scale=False) * clop.ObjProbe2Psi() * p
plt.subplot(131)
plt.imshow(fftshift(abs(p._cl_psi.get()[0,0,0])), cmap='gray')
plt.title('OpenCL Psi')
plt.colorbar()

p.set_probe(probe)
p.set_obj(obj)
# CUDA: same operator chain, with the CUDA scale factors
p = cuop.IFT(scale=False) * cuop.ApplyAmplitude(scale_in=scu[0], scale_out=scu[1]) * cuop.FT(scale=False) * cuop.ObjProbe2Psi() * p
plt.subplot(132)
plt.imshow(fftshift(abs(p._cu_psi.get()[0,0,0])), cmap='gray')
plt.title('CUDA Psi')
plt.colorbar()

p.set_probe(probe)
p.set_obj(obj)
# CPU: the combined PropagateApplyAmplitude operator is used as reference
p = cpuop.PropagateApplyAmplitude() * cpuop.ObjProbe2Psi() * p
plt.subplot(133)
plt.imshow(fftshift(abs(p._psi[0,0,0])), cmap='gray')
#plt.imshow(fftshift(np.abs(p._psi[0,0,0] - p._cu_psi.get()[0,0,0])), cmap='gray')
plt.title('CPU Psi')
plt.colorbar()

plt.tight_layout()

# Compare OpenCL and CPU results against CUDA; atol is relative to the largest CUDA amplitude
atol = np.abs(p._cu_psi.get()).max() * 1e-4
print(np.allclose(p._cl_psi.get(), p._cu_psi.get(), atol=atol, rtol=1e-4))
print(np.allclose(p._psi, p._cu_psi.get(), atol=atol, rtol=1e-4))


## Amplitude projection

In [None]:
fig = plt.figure(figsize=(9, 3))

p.set_probe(probe)
p.set_obj(obj)

# OpenCL
p = clop.PropagateApplyAmplitude() * clop.ObjProbe2Psi() * p
ax = plt.subplot(131)
im = ax.imshow(fftshift(abs(p._cl_psi.get()[0, 0, 0])), cmap='gray')
ax.set_title('OpenCL Psi')
plt.colorbar(im, ax=ax)

# CUDA
p = cuop.PropagateApplyAmplitude() * cuop.ObjProbe2Psi() * p
ax = plt.subplot(132)
im = ax.imshow(fftshift(abs(p._cu_psi.get()[0, 0, 0])), cmap='gray')
#im = ax.imshow(fftshift(abs(p._cu_psi.get()[0,0,0]-p._cl_psi.get()[0,0,0])), cmap='gray')
ax.set_title('CUDA Psi')
plt.colorbar(im, ax=ax)

# CPU
p = cpuop.PropagateApplyAmplitude() * cpuop.ObjProbe2Psi() * p
ax = plt.subplot(133)
im = ax.imshow(fftshift(abs(p._psi[0, 0, 0])), cmap='gray')
#im = ax.imshow(fftshift(np.abs(p._psi[0,0,0] - p._cu_psi.get()[0,0,0])), cmap='gray')
ax.set_title('CPU Psi')
plt.colorbar(im, ax=ax)

plt.tight_layout()

# OpenCL and CPU versus CUDA, with the same relative tolerance used for atol and rtol
eps = 1e-5
psi_cl = p._cl_psi.get()
psi_cu = p._cu_psi.get()
atol = np.abs(psi_cu).max() * eps
print(np.allclose(psi_cl, psi_cu, atol=atol, rtol=eps))
print(np.allclose(p._psi, psi_cu, atol=atol, rtol=eps))


## Log-likelihood

In [None]:
# Compute the Poisson log-likelihood with each backend by calling the low-level kernels
# directly, and print the value divided by p.nb_obs so the three numbers can be compared.

# OpenCL
pu = clop.default_processing_unit
# Use the stack currently selected for OpenCL
i = p._cl_stack_i
# Product of the first dimensions of the probe and object arrays
# (presumably number of probe modes * number of object modes)
nb_mode = np.int32(p._probe.shape[0] * p._obj.shape[0])
nb_psi = p._cl_obs_v[i].npsi
# Number of pixels per frame, and per stack of cl_stack_size frames
nxy = np.int32(p._probe.shape[-2] * p._probe.shape[-1])
nxystack = np.int32(pu.cl_stack_size * nxy)
p = clop.FT(scale=False) * clop.ObjProbe2Psi() * p
# The FT above is unscaled: pass the first fft_scale() factor to the llk kernel
s = pu.fft_scale(p._cl_psi.shape, ndim=2)[0]
llk = pu.cl_llk(p._cl_obs_v[i].cl_obs[:nb_psi], p._cl_psi, p._cl_background,
                                  nb_mode, nxy, nxystack, s, wait_for=pu.ev).get()
# The kernel result is read through the fields x, y, z, w
p.llk_poisson = llk['x']
p.llk_gaussian = llk['y']
p.llk_euclidian = llk['z']
p.nb_photons_calc = llk['w']

print("OpenCL LLKp=", p.llk_poisson / p.nb_obs)

# CUDA
p.set_probe(probe)
p.set_obj(obj)
pu = cuop.default_processing_unit
# Explicitly select the first stack for CUDA
i = 0
p = cuop.SelectStack(i) * p
nb_mode = np.int32(p._probe.shape[0] * p._obj.shape[0])
nb_psi = p._cu_obs_v[i].npsi
nxy = np.int32(p._probe.shape[-2] * p._probe.shape[-1])
nxystack = np.int32(pu.cu_stack_size * nxy)
# The scale only depends on the array shape, so it is computed before the FT here
s = pu.fft_scale(p._cu_psi.shape, ndim=2)[0]
p = cuop.FT(scale=False) * cuop.ObjProbe2Psi() * p
llk = pu.cu_llk(p._cu_obs_v[i].cu_obs[:nb_psi], p._cu_psi, p._cu_background,
                                  nb_mode, nxy, nxystack, s).get()
# The CUDA kernel result is read through the fields a, b, c, d
p.llk_poisson = llk['a']
p.llk_gaussian = llk['b']
p.llk_euclidian = llk['c']
p.nb_photons_calc = llk['d']
print("CUDA   LLKp=", p.llk_poisson / p.nb_obs)

# CPU: reference computed with numpy, using the scaled FT
p.set_probe(probe)
p.set_obj(obj)
p = cpuop.FT(scale=True) * cpuop.ObjProbe2Psi() * p
iobs = p.data.iobs.flatten()
# Calculated intensity: |Psi|**2 summed over the first two axes of Psi, then flattened like iobs
icalc = (np.abs(p._psi) ** 2).sum(axis=(0, 1)).flatten()

llk = np.zeros(iobs.shape, dtype=np.float32)
# Pixels with iobs > 0: icalc - iobs + iobs * log(iobs / icalc)
idx = np.where(iobs > 0)
llk[idx] = np.take(icalc - iobs + iobs * np.log(iobs / icalc), idx)
# Pixels with iobs == 0: the term reduces to icalc. Pixels with iobs < 0 keep a zero contribution.
idx = np.where(iobs == 0)
llk[idx] = np.take(icalc, idx)
p.llk_poisson = llk.sum()
p.llk_gaussian = ((iobs - icalc) ** 2 / (iobs + 1)).sum()
p.llk_euclidian = 4 * ((np.sqrt(abs(iobs)) - np.sqrt(icalc)) ** 2).sum()

print("CPU    LLKp=", p.llk_poisson / p.nb_obs)

## Object and probe update

In [None]:
plt.figure(figsize=(12,6))

# NOTE(review): illum is computed here but not used by the plots of this cell
illum = p.get_illumination_obj()
illum /= illum.max()
# Inertia passed to the object merge (OpenCL/CUDA) and to the CPU Psi2ObjProbe operator
obj_inertia = 0.1

# --- Part 1: un-normalised object update and its normalisation array ---
p.set_probe(probe.copy())
p.set_obj(obj.copy())
# OpenCL
p = clop.Psi2Obj() * clop.Psi2Probe() * clop.PropagateApplyAmplitude() * clop.ObjProbe2Psi() * p
onewcl = p._cl_obj_new.get()[0]
onormcl = p._cl_obj_norm.get()
plt.subplot(231)
plt.imshow(abs(p._cl_obj_new.get().sum(axis=(0))), cmap='gray')
plt.title('OpenCL Obj new (un-normalised)')
plt.colorbar()
# plt.subplot(234)
# plt.imshow(p._cl_obj_norm.get(), cmap='gray')
# plt.title('OpenCL Obj norm')
# plt.colorbar()

p.set_probe(probe.copy())
p.set_obj(obj.copy())
# CUDA
p = cuop.Psi2Obj() * cuop.Psi2Probe() * cuop.PropagateApplyAmplitude() * cuop.ObjProbe2Psi() * p
onewcu = p._cu_obj_new.get()[0]
onormcu = p._cu_obj_norm.get()
plt.subplot(232)
plt.imshow(abs(p._cu_obj_new.get()[0]), cmap='gray')
plt.title('CUDA Obj new (un-normalised)')
plt.colorbar()
# plt.subplot(235)
# plt.imshow(abs(p._cu_obj_norm.get()), cmap='gray')
# plt.title('CUDA Obj norm')
# plt.colorbar()

# OpenCL versus CUDA: un-normalised object update, then normalisation array
atol = np.abs(p._cu_obj_new.get()).max() * 1e-4
print(np.allclose(p._cl_obj_new.get(), p._cu_obj_new.get(), atol=atol, rtol=1e-4))
atol = np.abs(p._cu_obj_norm.get()).max() * 1e-4
print(np.allclose(p._cl_obj_norm.get(), p._cu_obj_norm.get(), atol=atol, rtol=1e-4))

p.set_probe(probe.copy())
p.set_obj(obj.copy())
# CPU (debug=True presumably keeps _obj_new and _obj_norm available afterwards - TODO confirm)
p = cpuop.Psi2ObjProbe(True,True, debug=True, obj_inertia=obj_inertia) * cpuop.PropagateApplyAmplitude() * cpuop.ObjProbe2Psi() * p
onewcpu = p._obj_new[0]
onormcpu = p._obj_norm
# Updated CPU object, kept for the final comparison with CUDA
ocpu0 = p._obj[0].copy()
plt.subplot(233)
# Only one image can be shown in this panel. The 'Obj new' difference used to be drawn first
# and was immediately hidden by the 'Obj norm' difference drawn on top of it, so it is kept
# here as a commented alternative instead:
# plt.imshow(abs(p._cu_obj_new.get()[0]-p._obj_new[0])/abs(p._obj_new).max(), cmap='gray')
# plt.title('CUDA-CPU Obj new diff')
plt.imshow(abs(p._cu_obj_norm.get()-p._obj_norm)/p._obj_norm.max(), cmap='gray')
plt.title('CUDA-CPU Obj norm diff/max')
plt.colorbar()


# --- Part 2: full update including the merge step ---
# Redo the update and merge (this deletes the _obj_new and _obj_norm temp arrays)
p.set_probe(probe.copy())
p.set_obj(obj.copy())
# OpenCL
p = clop.Psi2Obj() * clop.Psi2Probe() * clop.PropagateApplyAmplitude() * clop.ObjProbe2Psi() * p
p = clop.Psi2ObjMerge(inertia=obj_inertia) * clop.Psi2ProbeMerge() * p
# NOTE(review): probe/object are reset on p before plotting p._cl_obj; the plot presumably
# still shows the merged OpenCL array - confirm this is intended
p.set_probe(probe)
p.set_obj(obj)
plt.subplot(234)
plt.imshow(abs(p._cl_obj.get()[0]), cmap='gray')
plt.title('OpenCL Obj')
plt.colorbar()

p.set_probe(probe.copy())
p.set_obj(obj.copy())
# CUDA
p = cuop.Psi2Obj() * cuop.Psi2Probe() * cuop.PropagateApplyAmplitude() * cuop.ObjProbe2Psi() * p
p = cuop.Psi2ObjMerge(inertia=obj_inertia) * cuop.Psi2ProbeMerge() * p
plt.subplot(235)
plt.imshow(abs(p._cu_obj.get()[0]), cmap='gray')
plt.title('CUDA Obj')
plt.colorbar()


# plt.subplot(233)
# plt.imshow(abs(p._cu_obj.get()[0]-p._cl_obj.get()[0]), cmap='gray')
# plt.title('CUDA-OpenCl Obj diff')
# plt.colorbar()
# Difference between the merged CUDA object and the CPU object, relative to the CPU maximum
plt.subplot(236)
plt.imshow(abs(p._cu_obj.get()[0]-ocpu0)/abs(ocpu0).max(), cmap='gray')
plt.title('CUDA-CPU Obj diff/max')
plt.colorbar()



# CPU versus CUDA merged object
atol = np.abs(p._cu_obj.get()).max() * 1e-4
print(np.allclose(ocpu0, p._cu_obj.get(), atol=atol, rtol=1e-4))

plt.tight_layout()


## Gradient calculation (step 1: (1-Iobs/Icalc) * Psi)

In [None]:
# Manual Psi * (1-Iobs/Icalc): call each backend's grad_poisson_fourier kernel directly on the
# Fourier-space Psi, then transform back, and compare the OpenCL and CUDA results.
plt.figure(figsize=(12,3))

# Array dimensions converted to the fixed-width types passed to the GPU kernels
ny = np.int32(p._probe.shape[-2])
nx = np.int32(p._probe.shape[-1])
nb_probe = np.int32(p._probe.shape[0])
nb_obj = np.int32(p._obj.shape[0])
nb_mode = np.int32(nb_obj * nb_probe)
nyo = np.int32(p._obj.shape[-2])
nxo = np.int32(p._obj.shape[-1])
first_pass = np.int8(1)
nxy = np.int32(ny * nx)
hann_filter = np.int8(1)
f = np.float32(np.pi / (p.data.wavelength * p.data.detector_distance))

# openCL
p.set_probe(probe)
p.set_obj(obj)
pu = clop.default_processing_unit
i = p._cl_stack_i
i0 = p._cl_obs_v[i].i
# Zero-filled background gradient with the same layout as the background array
p._cl_background_grad = p._cl_background.copy()
p._cl_background_grad.fill(np.float32(0))

nb_psi = np.int32(p._cl_obs_v[i].npsi)
nxystack = np.int32(pu.cl_stack_size * nx * ny)

p = clop.ObjProbe2Psi() * p
p = clop.FT(scale=False) * p
print("OpenCL calc, obs / sum(): %8e / %8e = %8e" % ((np.abs(p._cl_psi.get())**2).sum(), p._cl_obs_v[i].cl_obs.get().sum(), (np.abs(p._cl_psi.get())**2).sum() / p._cl_obs_v[i].cl_obs.get().sum()))
# fft_scale() is given the array *shape*, as in every other call of this notebook
# (the GPU array itself used to be passed here)
s = clop.default_processing_unit.fft_scale(p._cl_psi.shape, ndim=2)
pu.cl_grad_poisson_fourier(p._cl_obs_v[i].cl_obs[0], p._cl_psi, p._cl_background, p._cl_background_grad,
                           nb_mode, nx, ny, nxystack, nb_psi, hann_filter, s[0], s[1], p._cl_scale[i0:i0 + nb_psi])


plt.subplot(141)
plt.imshow(abs(np.fft.fftshift(p._cl_psi.get()[0,0,0])), norm=colors.LogNorm())
plt.title('OpenCL Psi * (1-Iobs/Icalc)')
plt.colorbar()

p = clop.IFT(scale=False) * p

plt.subplot(142)
plt.imshow(abs(np.fft.fftshift(p._cl_psi.get()[0,0,0])))
plt.title('OpenCL iFT(Psi * (1-Iobs/Icalc))')
plt.colorbar()

# CUDA
p.set_probe(probe)
p.set_obj(obj)
i = p._cu_stack_i

pu = cuop.default_processing_unit
nb_psi = p._cu_obs_v[i].npsi
nxystack = np.int32(pu.cu_stack_size * nxy)
p._cu_background_grad = p._cu_background.copy()
p._cu_background_grad.fill(np.float32(0))

p = cuop.ObjProbe2Psi() * p
p = cuop.FT(scale=False) * p
print("  CUDA calc, obs / sum(): %8e / %8e = %8e" % ((np.abs(p._cu_psi.get())**2).sum(), p._cu_obs_v[i].cu_obs.get().sum(), (np.abs(p._cu_psi.get())**2).sum() / p._cu_obs_v[i].cu_obs.get().sum()))
# Same fix as for OpenCL: pass the shape, not the array
s = cuop.default_processing_unit.fft_scale(p._cu_psi.shape, ndim=2)
pu.cu_grad_poisson_fourier(p._cu_obs_v[i].cu_obs[0], p._cu_psi, p._cu_background, p._cu_background_grad,
                           nb_mode, nx, ny, nxystack, nb_psi, hann_filter, s[0], s[1])

plt.subplot(143)
plt.imshow(abs(np.fft.fftshift(p._cu_psi.get()[0,0,0])), norm=colors.LogNorm())
plt.title('CUDA Psi * (1-Iobs/Icalc)')
plt.colorbar()

p = cuop.IFT(scale=False) * p

plt.subplot(144)
plt.imshow(abs(np.fft.fftshift(p._cu_psi.get()[0,0,0])))
# This panel shows the CUDA result (it used to carry the OpenCL title)
plt.title('CUDA iFT(Psi * (1-Iobs/Icalc))')
plt.colorbar()


if False:
    # CPU (disabled)
    # NOTE(review): `s` holds the value returned by fft_scale() and is indexed as s[0]/s[1]
    # above, so the `s*s` expression below probably needs updating before enabling this block.
    plt.subplot(133)
    p.set_probe(probe)
    p.set_obj(obj)
    p = cpuop.Propagate(forward=True) * cpuop.ObjProbe2Psi() * p
    print("CPU calc, obs / sum(): %8e / %8e = %8e" % ((np.abs(p._psi)**2).sum(), p.data.iobs.sum(), (np.abs(p._psi)**2).sum() / p.data.iobs.sum()))
    calc = (abs(p._psi) ** 2).sum(axis=(0, 1))
    p._psi *= (1 - p.data.iobs / calc * s*s) * (p.data.iobs >= 0)  # with broadcasting
    p = cpuop.Propagate(forward=False) * p

    plt.imshow(abs(np.fft.fftshift(p._psi[0,0,0])))
    plt.title('CPU Psi')
    plt.colorbar()

plt.tight_layout()

# OpenCL versus CUDA after the inverse transform
print(np.allclose(p._cl_psi.get(), p._cu_psi.get(), rtol=1e-3, atol=np.abs(p._cl_psi.get()).max()*1e-3))
# print(np.allclose(p._cl_psi.get(), p._psi, rtol=1e-2, atol=np.abs(p._cl_psi.get()).max()*1e-2))  # Not sure why there remains a difference
# print(np.allclose(p._cl_psi.get(), p._psi, rtol=1e-2, atol=np.abs(p._cl_psi.get()).max()*1e-2))  # Not sure why there remains a difference

## Gradient calculation

In [None]:
fig = plt.figure(figsize=(12, 3))

# OpenCL: start from the stored probe and a 10% rescaled object, with zeroed gradient arrays
p.set_probe(probe)
p.set_obj(obj*1.1)
cl_queue = clop.default_processing_unit.cl_queue
p._cl_obj_grad = clop.cla.zeros(cl_queue, p._obj.shape, np.complex64)
p._cl_probe_grad = clop.cla.zeros(cl_queue, p._probe.shape, np.complex64)
p = clop.Grad(update_object=True, update_probe=True) * p
cl_obj_grad = p._cl_obj_grad.get()
cl_probe_grad = p._cl_probe_grad.get()

ax = plt.subplot(141)
im = ax.imshow(abs(cl_obj_grad[0]), cmap='gray')
ax.set_title('OpenCL obj grad')
plt.colorbar(im, ax=ax)

ax = plt.subplot(142)
im = ax.imshow(abs(cl_probe_grad[0]), cmap='gray')
ax.set_title('OpenCL probe grad')
plt.colorbar(im, ax=ax)

# CUDA: same starting point, gradient arrays zeroed with the layout of the CUDA object/probe
p.set_probe(probe)
p.set_obj(obj*1.1)
p._cu_obj_grad = cuop.cua.zeros_like(p._cu_obj)
p._cu_probe_grad = cuop.cua.zeros_like(p._cu_probe)
p = cuop.Grad(update_object=True, update_probe=True) * p
cu_obj_grad = p._cu_obj_grad.get()
cu_probe_grad = p._cu_probe_grad.get()

ax = plt.subplot(143)
im = ax.imshow(abs(cu_obj_grad[0]), cmap='gray')
ax.set_title('CUDA obj grad')
plt.colorbar(im, ax=ax)

ax = plt.subplot(144)
im = ax.imshow(abs(cu_probe_grad[0]), cmap='gray')
ax.set_title('CUDA probe grad')
plt.colorbar(im, ax=ax)

plt.tight_layout()

# The object gradient differs a bit, probably due to a different operation order
# (atomic add for CUDA)
obj_grad_cl, obj_grad_cu = p._cl_obj_grad.get(), p._cu_obj_grad.get()
print(np.allclose(obj_grad_cl, obj_grad_cu, rtol=1e-3, atol=np.abs(obj_grad_cl).max()*1e-3))
probe_grad_cl, probe_grad_cu = p._cl_probe_grad.get(), p._cu_probe_grad.get()
print(np.allclose(probe_grad_cl, probe_grad_cu, rtol=1e-3, atol=np.abs(probe_grad_cl).max()*1e-3))

# Optional check: ratio of the two object gradients
#plt.figure()
#plt.imshow(np.abs(p._cl_obj_grad.get()[0]/p._cu_obj_grad.get()[0]), vmin=0.99,vmax=1.01)

## AP

In [None]:
ncycle = 5
zpr = False
update_obj = update_probe = True
# Same AP settings for the three backends
ap_kwargs = dict(update_object=update_obj, update_probe=update_probe, zero_phase_ramp=zpr)
fig = plt.figure(figsize=(12, 6))

# Get the illumination (this will use one backend operator, so don't mix it) and turn it
# into a mask of the pixels above 10% of the maximum
illum = p.get_illumination_obj()
illum /= illum.max()
illum = illum > 0.1

# OpenCL: object (top row) and probe (bottom row) after ncycle AP cycles
p.set_probe(probe.copy())
p.set_obj(obj.copy())
p = clop.AP(**ap_kwargs) ** ncycle * p

ax = plt.subplot(231)
im = ax.imshow(abs(p._cl_obj.get()[0]), cmap='gray')
ax.set_title('OpenCL Obj')
plt.colorbar(im, ax=ax)

ax = plt.subplot(234)
im = ax.imshow(abs(p._cl_probe.get()[0]), cmap='gray')
ax.set_title('OpenCL Probe')
plt.colorbar(im, ax=ax)

# CUDA: shown as the difference with OpenCL, relative to the maximum amplitude
p.set_probe(probe.copy())
p.set_obj(obj.copy())
p = cuop.AP(**ap_kwargs) ** ncycle * p

ax = plt.subplot(232)
#im = ax.imshow(abs(p._cu_obj.get()[0]), cmap='gray')
obj_diff = abs(p._cu_obj.get()[0]-p._cl_obj.get()[0])/abs(p._obj).max()*illum
im = ax.imshow(obj_diff, cmap='gray')
ax.set_title('OpenCL-CUDA Obj diff / max')
plt.colorbar(im, ax=ax)

ax = plt.subplot(235)
probe_diff = abs(p._cu_probe.get()[0]-p._cl_probe.get()[0])/abs(p._probe).max()
im = ax.imshow(probe_diff, cmap='gray')
ax.set_title('OpenCL-CUDA Probe diff / max')
plt.colorbar(im, ax=ax)

# CPU: shown as the difference with CUDA
p.set_probe(probe.copy())
p.set_obj(obj.copy())
#p = cpuop.Psi2ObjProbe(True,True) * cpuop.PropagateApplyAmplitude() * cpuop.ObjProbe2Psi() * p
p = cpuop.AP(**ap_kwargs) ** ncycle * p

ax = plt.subplot(233)
#im = ax.imshow(abs(p._obj[0]), cmap='gray')
obj_diff = np.abs(p._obj[0] - p._cu_obj.get()[0])/abs(p._obj).max()*illum
im = ax.imshow(obj_diff, cmap='gray')
ax.set_title('CPU-CUDA Obj diff / max')
plt.colorbar(im, ax=ax)

ax = plt.subplot(236)
#im = ax.imshow(abs(p._probe[0]), cmap='gray')
probe_diff = np.abs(p._probe[0] - p._cu_probe.get()[0])/abs(p._probe).max()
im = ax.imshow(probe_diff, cmap='gray')
ax.set_title('CPU-CUDA Probe diff / max')
plt.colorbar(im, ax=ax)

plt.tight_layout()

# Numerical comparison of OpenCL and CPU against CUDA, for the object then the probe
eps = 1e-4
obj_cu = p._cu_obj.get()
atol = np.abs(obj_cu).max() * eps
print(np.allclose(p._cl_obj.get(), obj_cu, atol=atol, rtol=eps))
print(np.allclose(p._obj, obj_cu, atol=atol, rtol=eps))
probe_cu = p._cu_probe.get()
atol = np.abs(probe_cu).max() * eps
print(np.allclose(p._cl_probe.get(), probe_cu, atol=atol, rtol=eps))
print(np.allclose(p._probe, probe_cu, atol=atol, rtol=eps))

## DM**2

In [None]:
fig = plt.figure(figsize=(12, 3))

p.set_probe(probe)
p.set_obj(obj)

# OpenCL: object and first Psi frame after two DM cycles
p = clop.DM(update_object=True, update_probe=True) ** 2 * p

ax = plt.subplot(141)
im = ax.imshow(abs(p._cl_obj.get()[0]), cmap='gray')
ax.set_title('OpenCL Obj')
plt.colorbar(im, ax=ax)

ax = plt.subplot(142)
im = ax.imshow(fftshift(abs(p._cl_psi.get()[0, 0, 0])), cmap='gray')
ax.set_title('OpenCL Psi')
plt.colorbar(im, ax=ax)

# CUDA (probe and object are not reset between the two backends in this cell)
p = cuop.DM(update_object=True, update_probe=True) ** 2 * p

ax = plt.subplot(143)
im = ax.imshow(abs(p._cu_obj.get()[0]), cmap='gray')
ax.set_title('CUDA Obj')
plt.colorbar(im, ax=ax)

ax = plt.subplot(144)
im = ax.imshow(fftshift(abs(p._cu_psi.get()[0, 0, 0])), cmap='gray')
ax.set_title('CUDA Psi')
plt.colorbar(im, ax=ax)

# OpenCL versus CUDA, for the object then for Psi
obj_cu = p._cu_obj.get()
atol = np.abs(obj_cu).max() * 1e-4
print(np.allclose(p._cl_obj.get(), obj_cu, atol=atol, rtol=1e-4))
psi_cu = p._cu_psi.get()
atol = np.abs(psi_cu).max() * 1e-4
print(np.allclose(p._cl_psi.get(), psi_cu, atol=atol, rtol=1e-4))

if False:
    # CPU (disabled)
    p = cpuop.Psi2ObjProbe(True,True) * cpuop.PropagateApplyAmplitude() * cpuop.ObjProbe2Psi() * p
    plt.subplot(133)
    plt.imshow(abs(p._psi[0,0,0]), cmap='gray')
    plt.title('CPU Psi')
    plt.colorbar()

plt.tight_layout()

## ML**2

In [None]:
plt.figure(figsize=(12,3))

# OpenCL: object and probe after two ML cycles, from the stored probe and object
p.set_probe(probe)
p.set_obj(obj)
p = clop.ML(update_object=True, update_probe=True)**2 * p
plt.subplot(141)
plt.imshow(abs(p._cl_obj.get()[0]), cmap='gray')
plt.title('OpenCL Obj')
plt.colorbar()
plt.subplot(142)
plt.imshow(abs(p._cl_probe.get()[0]), cmap='gray')
plt.title('OpenCL Probe')
plt.colorbar()

# CUDA: same starting point
p.set_probe(probe)
p.set_obj(obj)
p = cuop.ML(update_object=True, update_probe=True)**2 * p
plt.subplot(143)
plt.imshow(abs(p._cu_obj.get()[0]), cmap='gray')
plt.title('CUDA Obj')
plt.colorbar()
plt.subplot(144)
plt.imshow(abs(p._cu_probe.get()[0]), cmap='gray')
plt.title('CUDA Probe')
plt.colorbar()

# ML: compare OpenCL with CUDA. Each absolute tolerance is derived from the array being
# compared; the probe tolerance used to be computed from Psi, which is not the compared array.
atol = np.abs(p._cu_obj.get()).max() * 1e-3
print(np.allclose(p._cl_obj.get(), p._cu_obj.get(), atol=atol, rtol=1e-4))
atol = np.abs(p._cu_probe.get()).max() * 1e-3
print(np.allclose(p._cl_probe.get(), p._cu_probe.get(), atol=atol, rtol=1e-4))

if False:
    # CPU (disabled)
    p = cpuop.Psi2ObjProbe(True,True) * cpuop.PropagateApplyAmplitude() * cpuop.ObjProbe2Psi() * p
    plt.subplot(133)
    plt.imshow(abs(p._psi[0,0,0]), cmap='gray')
    plt.title('CPU Psi')
    plt.colorbar()

plt.tight_layout()

In [None]:
# Run 20 ML cycles with each GPU backend from the same stored probe and object.
# calc_llk=1 is passed to both (presumably to report the log-likelihood at every cycle).
ml_kwargs = dict(update_object=True, update_probe=True, calc_llk=1)

# OpenCL
p.set_probe(probe)
p.set_obj(obj)
p = clop.ML(**ml_kwargs) ** 20 * p
print()  # blank line between the output of the two backends

# CUDA
p.set_probe(probe)
p.set_obj(obj)
p = cuop.ML(**ml_kwargs) ** 20 * p


## Background update (AP)

In [None]:
fig = plt.figure(figsize=(14, 3))

# Same AP settings for both GPU backends, with the background update enabled
ap_bkg_kwargs = dict(update_object=True, update_probe=True, update_background=True)

# OpenCL: object and background after two AP cycles
p.set_probe(probe)
p.set_obj(obj)
p = clop.AP(**ap_bkg_kwargs) ** 2 * p

ax = plt.subplot(141)
im = ax.imshow(abs(p._cl_obj.get()[0]), cmap='gray')
ax.set_title('OpenCL Obj')
plt.colorbar(im, ax=ax)

ax = plt.subplot(142)
im = ax.imshow(fftshift(abs(p._cl_background.get())), cmap='gray')
ax.set_title('OpenCL background')
plt.colorbar(im, ax=ax)

# CUDA: same starting point
p.set_probe(probe)
p.set_obj(obj)
p = cuop.AP(**ap_bkg_kwargs) ** 2 * p

ax = plt.subplot(143)
im = ax.imshow(abs(p._cu_obj.get()[0]), cmap='gray')
ax.set_title('CUDA Obj')
plt.colorbar(im, ax=ax)

ax = plt.subplot(144)
im = ax.imshow(fftshift(abs(p._cu_background.get())), cmap='gray')
ax.set_title('CUDA background')
plt.colorbar(im, ax=ax)

# OpenCL versus CUDA, for the object then for the background
obj_cu = p._cu_obj.get()
atol = np.abs(obj_cu).max() * 1e-3
print(np.allclose(p._cl_obj.get(), obj_cu, atol=atol, rtol=1e-4))
background_cu = p._cu_background.get()
atol = np.abs(background_cu).max() * 1e-3
print(np.allclose(p._cl_background.get(), background_cu, atol=atol, rtol=1e-4))

if False:
    # CPU (disabled)
    p = cpuop.Psi2ObjProbe(True,True) * cpuop.PropagateApplyAmplitude() * cpuop.ObjProbe2Psi() * p
    plt.subplot(133)
    plt.imshow(abs(p._psi[0,0,0]), cmap='gray')
    plt.title('CPU Psi')
    plt.colorbar()

plt.tight_layout()

## Background update (ML)

In [None]:
# The whole cell is disabled: it is kept as a template for comparing the background gradient
# computed by the OpenCL and CUDA Grad operators.
if False:
    # Background update during ML is unsupported yet
    plt.figure(figsize=(14,3))

    nb_obj, nyo, nxo = obj.shape
    nb_probe, ny, nx = probe.shape

    # OpenCL
    p.set_probe(probe)
    p.set_obj(obj)
    # Allocate the gradient arrays by hand: object/probe gradients are left uninitialised
    # (empty), the background gradient and direction arrays are zero-filled
    p._cl_obj_grad = clop.cla.empty(clop.default_processing_unit.cl_queue, (nb_obj, nyo, nxo), np.complex64)
    p._cl_obj_grad_last = clop.cla.empty(clop.default_processing_unit.cl_queue, (nb_obj, nyo, nxo), np.complex64)
    p._cl_probe_grad = clop.cla.empty(clop.default_processing_unit.cl_queue, (nb_probe, ny, nx), np.complex64)
    p._cl_probe_grad_last = clop.cla.empty(clop.default_processing_unit.cl_queue, (nb_probe, ny, nx), np.complex64)
    p._cl_background_grad = clop.cla.zeros(clop.default_processing_unit.cl_queue, (ny, nx), np.float32)
    p._cl_background_grad_last = clop.cla.zeros(clop.default_processing_unit.cl_queue, (ny, nx), np.float32)
    p._cl_background_dir = clop.cla.zeros(clop.default_processing_unit.cl_queue, (ny, nx), np.float32)

    # NOTE(review): Grad is applied once here but twice (**2) for CUDA below - confirm which is intended
    p = clop.Grad(update_object=True, update_probe=True, update_background=True) * p
    plt.subplot(141)
    plt.imshow(abs(p._cl_obj.get()[0]), cmap='gray')
    plt.title('OpenCL Obj')
    plt.colorbar()
    plt.subplot(142)
    # The plotted array is the background gradient, although the title reads 'background'
    plt.imshow(fftshift(abs(p._cl_background_grad.get())), cmap='gray')
    plt.title('OpenCL background')
    plt.colorbar()

    # CUDA
    p.set_probe(probe)
    p.set_obj(obj)
    # Same manual allocation, using the layout of the CUDA object/probe arrays and the
    # processing unit's memory pool for the background arrays
    p._cu_obj_grad = cuop.cua.empty_like(p._cu_obj)
    p._cu_obj_grad_last = cuop.cua.empty_like(p._cu_obj)
    p._cu_probe_grad = cuop.cua.empty_like(p._cu_probe)
    p._cu_probe_grad_last = cuop.cua.empty_like(p._cu_probe)
    p._cu_background_grad = cuop.cua.zeros((ny, nx), np.float32, allocator=cuop.default_processing_unit.cu_mem_pool.allocate)
    p._cu_background_grad_last = cuop.cua.zeros((ny, nx), np.float32, allocator=cuop.default_processing_unit.cu_mem_pool.allocate)
    p._cu_background_dir = cuop.cua.zeros((ny, nx), np.float32, allocator=cuop.default_processing_unit.cu_mem_pool.allocate)

    p = cuop.Grad(update_object=True, update_probe=True, update_background=True)**2 * p
    plt.subplot(143)
    plt.imshow(abs(p._cu_obj.get()[0]), cmap='gray')
    plt.title('CUDA Obj')
    plt.colorbar()
    plt.subplot(144)
    # As for OpenCL, this shows the background gradient
    plt.imshow(fftshift(abs(p._cu_background_grad.get())), cmap='gray')
    plt.title('CUDA background')
    plt.colorbar()

    # OpenCL versus CUDA, for the object then for the background (not its gradient)
    atol = np.abs(p._cu_obj.get()).max() * 1e-3
    print(np.allclose(p._cl_obj.get(), p._cu_obj.get(), atol=atol, rtol=1e-4))
    atol = np.abs(p._cu_background.get()).max() * 1e-3
    print(np.allclose(p._cl_background.get(), p._cu_background.get(), atol=atol, rtol=1e-4))

    if False:
        # CPU (disabled)
        p = cpuop.Psi2ObjProbe(True,True) * cpuop.PropagateApplyAmplitude() * cpuop.ObjProbe2Psi() * p
        plt.subplot(133)
        plt.imshow(abs(p._psi[0,0,0]), cmap='gray')
        plt.title('CPU Psi')
        plt.colorbar()

    plt.tight_layout()