In [1]:
include("raytracing/GPURaytracing.jl")
using .GPURaytracing

In [4]:
using CUDA, OrdinaryDiffEq, BenchmarkTools, LinearAlgebra, FourierFlows

# Float32 grid on the GPU with 512 points and domain length 2π.
grid = TwoDGrid(GPU(); nx=512, Lx=2π, T=Float32)

# Spectral mask selecting the modes with 9 <= kr^2 + l^2 < 36.
mask = 9 .<= grid.kr .^ 2 .+ grid.l .^ 2 .< 36

# Random complex streamfunction coefficients of amplitude A; every mode outside
# the mask is zeroed.
A = 1000.0f0
ψh = A * (CUDA.rand(grid.nkr, grid.nl) + 1im * CUDA.rand(grid.nkr, grid.nl))
ψh[.!mask] .= 0.0f0

# Transform the streamfunction and its spectral derivatives to physical space:
#   U = -∂ψ/∂y,  V = ∂ψ/∂x,  Ux = ∂U/∂x,  Vx = ∂V/∂x,  Uy = ∂U/∂y.
ψ, U, V, Ux, Vx, Uy = let kr = grid.kr, l = grid.l
    to_physical(fh) = irfft(fh, grid.nx, (1, 2))
    (
        to_physical(ψh),
        to_physical(-1im * l .* ψh),
        to_physical(1im * kr .* ψh),
        to_physical(l .* kr .* ψh),
        to_physical(-kr .* kr .* ψh),
        to_physical(l .* l .* ψh),
    )
end

# ∂V/∂y = -∂U/∂x, so Vy needs no extra transform.
Vy = -Ux

# Two velocity / velocity-gradient snapshots built from the same fields.
velocity1 = Velocity(U, V)
velocity2 = Velocity(U, V)
gradient1 = VelocityGradient(Ux, Uy, Vx, Vy)
gradient2 = VelocityGradient(Ux, Uy, Vx, Vy)

VelocityGradient(Float32[-0.041082524 -0.08592519 … 0.048200794 0.0036578998; -0.039745167 -0.08618337 … 0.052694872 0.0065803602; … ; -0.04296215 -0.084401205 … 0.03958269 -0.001604557; -0.042154267 -0.08532934 … 0.043828256 0.0009289831], Float32[1.7433773 1.7430874 … 1.7303483 1.7391233; 1.6977682 1.6974195 … 1.6851475 1.6936702; … ; 1.8296431 1.8294444 … 1.8158785 1.8251132; 1.7873586 1.7871187 … 1.7739482 1.7829607], Float32[-1.645809 -1.646198 … -1.6401613 -1.6437926; -1.5592438 -1.5599357 … -1.5532116 -1.5569984; … ; -1.805511 -1.805304 … -1.8006448 -1.8039539; -1.7279725 -1.7280616 … -1.722714 -1.726186], Float32[0.041082524 0.08592519 … -0.048200794 -0.0036578998; 0.039745167 0.08618337 … -0.052694872 -0.0065803602; … ; 0.04296215 0.084401205 … -0.03958269 0.001604557; 0.042154267 0.08532934 … -0.043828256 -0.0009289831])

In [3]:
# Number of wavepackets.
Np = 1000

# One row per wavepacket: columns 1:2 hold the position, columns 3 and 4 the
# wavevector components k and l.
wavepacket_array = CuArray{Float32}(undef, Np, 4)
x = @views wavepacket_array[:, 1:2]
k = @views wavepacket_array[:, 3]
l = @views wavepacket_array[:, 4]

# Random positions spanning a length grid.Lx starting at grid.x[1]; written as a
# single fused broadcast so no scaled temporary is allocated.
x .= grid.Lx .* CUDA.rand(Np, 2) .+ grid.x[1]

# Unit-magnitude wavevectors with random direction. `2.0f0 * π` stays Float32;
# the literal `2π` is a Float64 and would promote the whole phase array (and the
# cos/sin evaluations below) to double precision on the GPU.
phase = (2.0f0 * π) .* CUDA.rand(Np)
@. k = cos(phase)
@. l = sin(phase)

wavepacket_array

1000×4 CuArray{Float32, 2, CUDA.DeviceMemory}:
 -3.0856     2.89212    -0.445822   -0.895122
  3.08647   -0.234708   -0.414097   -0.910233
  1.89742   -2.87474     0.82967    -0.558255
 -1.23314   -2.21199    -0.935151    0.354249
  2.14182   -2.1523      0.629377   -0.7771
  1.28044    1.90146     0.993145   -0.116893
 -2.83758   -1.26733     0.456475    0.889736
 -0.279517   0.449924   -0.450005   -0.893026
  2.96737    1.20589    -0.991747   -0.128213
 -2.90581    0.305089   -0.971898   -0.235402
 -0.511575   1.98351    -0.600151   -0.799887
  1.31448    2.48613     0.866483    0.499207
  1.51256    2.44705    -0.449619   -0.893221
  ⋮                                 
  2.77868    0.0227401  -0.553601    0.832782
  1.17536   -1.22068    -0.0222173  -0.999753
  2.31804   -1.43775     0.920722   -0.390219
  2.85235   -1.78418     0.489781    0.871846
 -2.62725    2.65872    -0.462811    0.886457
 -1.8984     2.22814     0.315449   -0.948943
  1.62526   -0.804607    0.505627   -0.86275

In [62]:
# Run the GPU ray-tracing solver on the wavepacket array.
# NOTE(review): the meaning of the scalar `1.0f0` and of the tuple
# `(0.0f0, 0.1f0)` is not visible here — presumably a step size and a time
# span; confirm against GPURaytracing.solve!.
GPURaytracing.solve!(
    velocity1,
    velocity2,
    gradient1,
    gradient2,
    grid,
    wavepacket_array,
    1.0f0,
    (0.0f0, 0.1f0),
    (f = 1.0f0, Cg = 1.0f0),
)

1000×4 CuArray{Float32, 2, CUDA.DeviceMemory}:
 -11.9967     -1.74882    1.38889     1.02545
  38.8723    -10.0933     0.487031    0.780969
  27.6838    -30.7953    -0.708083    0.824885
   0.417075   45.9193     1.18024     0.325704
  -3.81401     1.55787   -0.817147    0.286775
   1.69387   -14.3405    -0.87909    -0.728571
  -9.39997    46.1369     0.341356   -1.00939
  20.3812    -38.7632    -0.97276    -0.453745
  19.3143     45.0519     0.357962    1.00105
 -52.7491      1.68981   -0.929159   -0.354018
  -0.775043   -5.44458   -0.846333    0.607966
 -18.7364      0.977051   0.531703    0.861323
 -28.4478     20.4632    -0.956113    0.130534
   ⋮                                 
  -6.29716    52.8408     0.0237523   0.89639
 -17.7076     21.8832    -0.0232561   1.02763
 -20.5178    -14.9959    -0.423419    1.05353
  42.0013     20.331     -0.677052   -0.494378
  -8.73408    37.387      0.453991    0.922678
 -15.7568     13.4675    -0.40042     1.04167
  14.3217     -9.23165    0.7

In [7]:
"""
    dispersion_relation(k, l, f, Cg)

Return the frequency `sqrt(f^2 + Cg * (k^2 + l^2))` for wavevector components
`k` and `l` and parameters `f` and `Cg`.

Accepts any `Real` arguments; the result type follows Julia's usual promotion,
so all-`Float32` inputs still yield a `Float32` (which keeps GPU broadcasts in
single precision).

# Throws
- `DomainError` if the quantity under the square root is negative.
"""
@inline function dispersion_relation(k::Real, l::Real, f::Real, Cg::Real)
    return sqrt(f^2 + Cg * (k^2 + l^2))
end

# Parameters passed to the dispersion relation.
p = (f = 1.0f0, Cg = 1.0f0)

# Columns 3 and 4 of the wavepacket array are the wavevector components.
k1 = view(wavepacket_array, :, 3)
k2 = view(wavepacket_array, :, 4)

# Evaluate the frequency of every wavepacket elementwise.
broadcast(dispersion_relation, k1, k2, p.f, p.Cg)

1000-element CuArray{Float32, 1, CUDA.DeviceMemory}:
 1.4142135
 1.4142135
 1.4142135
 1.4142135
 1.4142135
 1.4142135
 1.4142135
 1.4142135
 1.4142135
 1.4142135
 1.4142135
 1.4142135
 1.4142135
 ⋮
 1.4142135
 1.4142135
 1.4142135
 1.4142135
 1.4142135
 1.4142135
 1.4142135
 1.4142135
 1.4142135
 1.4142135
 1.4142135
 1.4142135

In [49]:
"""
    CgBroadcast(texU1, texU2; wavepackets = wavepacket_array)

Compute the x-velocity of every wavepacket in a single fused broadcast: the two
textures are sampled at the wavepacket positions, blended with weight `alpha`,
and the x-component of the group velocity is added inside the same kernel.

# Arguments
- `texU1`, `texU2`: textures indexed with two normalized coordinates.

# Keywords
- `wavepackets`: array whose columns 1:2 are positions and columns 3 and 4 are
  the wavevector components; defaults to the global `wavepacket_array`.

# Returns
A `CuArray{Float32}` with one entry per row of `wavepackets`.
"""
function CgBroadcast(texU1, texU2; wavepackets = wavepacket_array)
    # All constants are Float32 so that no part of the GPU computation is
    # silently promoted to Float64 (`-π`, `2π` and `0.5` are Float64 values).
    p = (
        f = 1.0f0, Cg = 1.0f0, t0 = 0.0f0, t1 = 1.0f0,
        x0 = -Float32(π), Lx = 2 * Float32(π), Nx = 512,
    )
    t = 0.5f0

    # Fraction of the interval [t0, t1] elapsed at time t.
    alpha = (t - p.t0) / (p.t1 - p.t0)

    # Map positions to normalized texture coordinates; the half-texel offset
    # places the grid point at x0 on the centre of the first texel.
    x = @views wavepackets[:, 1:2]
    norm_x = @. (x - p.x0) / p.Lx + 0.5f0 / p.Nx
    nx = @views norm_x[:, 1]
    ny = @views norm_x[:, 2]
    k1 = @views wavepackets[:, 3]
    k2 = @views wavepackets[:, 4]

    # Group velocity in x: ∂ω/∂k = Cg * k / ω for ω = sqrt(f^2 + Cg * (k^2 + l^2)).
    ω = @. dispersion_relation(k1, k2, p.f, p.Cg)
    Cg_x = @. p.Cg * k1 / ω

    dx1 = CuArray{Float32}(undef, size(wavepackets, 1))

    # NOTE(review): U1 is weighted by alpha and U2 by (1 - alpha), i.e. U1 is
    # treated as the field at t1 — confirm this matches the solver's convention.
    broadcast!(dx1, nx, ny, Cg_x, Ref(texU1), Ref(texU2), alpha) do xi, yi, cgx, U1, U2, alpha
        alpha * U1[xi, yi] + (1.0f0 - alpha) * U2[xi, yi] + cgx
    end
    return dx1
end

"""
    CgSeparate(texU1, texU2; wavepackets = wavepacket_array)

Compute the x-velocity of every wavepacket in two steps: first sample and blend
the two textures at the wavepacket positions, then add the x-component of the
group velocity in a second broadcast.

# Arguments
- `texU1`, `texU2`: textures indexed with two normalized coordinates.

# Keywords
- `wavepackets`: array whose columns 1:2 are positions and columns 3 and 4 are
  the wavevector components; defaults to the global `wavepacket_array`.

# Returns
A `CuArray{Float32}` with one entry per row of `wavepackets`.
"""
function CgSeparate(texU1, texU2; wavepackets = wavepacket_array)
    # All constants are Float32 so that no part of the GPU computation is
    # silently promoted to Float64 (`-π`, `2π` and `0.5` are Float64 values).
    p = (
        f = 1.0f0, Cg = 1.0f0, t0 = 0.0f0, t1 = 1.0f0,
        x0 = -Float32(π), Lx = 2 * Float32(π), Nx = 512,
    )
    t = 0.5f0

    # Fraction of the interval [t0, t1] elapsed at time t.
    alpha = (t - p.t0) / (p.t1 - p.t0)

    # Map positions to normalized texture coordinates; the half-texel offset
    # places the grid point at x0 on the centre of the first texel.
    x = @views wavepackets[:, 1:2]
    norm_x = @. (x - p.x0) / p.Lx + 0.5f0 / p.Nx
    nx = @views norm_x[:, 1]
    ny = @views norm_x[:, 2]
    k1 = @views wavepackets[:, 3]
    k2 = @views wavepackets[:, 4]

    dx1 = CuArray{Float32}(undef, size(wavepackets, 1))

    # NOTE(review): U1 is weighted by alpha and U2 by (1 - alpha), i.e. U1 is
    # treated as the field at t1 — confirm this matches the solver's convention.
    broadcast!(dx1, nx, ny, Ref(texU1), Ref(texU2), alpha) do xi, yi, U1, U2, alpha
        alpha * U1[xi, yi] + (1.0f0 - alpha) * U2[xi, yi]
    end

    # Group velocity in x: ∂ω/∂k = Cg * k / ω for ω = sqrt(f^2 + Cg * (k^2 + l^2)).
    dx1 .+= p.Cg .* k1 ./ dispersion_relation.(k1, k2, p.f, p.Cg)
    return dx1
end

# Wrap the `u` field of each velocity snapshot in a texture with linear
# interpolation, wrap-around addressing and normalized coordinates.
texU1 = CuTexture(
    CuTextureArray(velocity1.u);
    interpolation = CUDA.LinearInterpolation(),
    address_mode = CUDA.ADDRESS_MODE_WRAP,
    normalized_coordinates = true,
)
texU2 = CuTexture(
    CuTextureArray(velocity2.u);
    interpolation = CUDA.LinearInterpolation(),
    address_mode = CUDA.ADDRESS_MODE_WRAP,
    normalized_coordinates = true,
)

# Interpolate the global textures with `$` so BenchmarkTools measures the calls
# themselves rather than dynamic dispatch on untyped globals; `CUDA.@sync`
# makes each sample wait for the GPU work to finish.
@btime CUDA.@sync CgBroadcast($texU1, $texU2)
@btime CUDA.@sync CgSeparate($texU1, $texU2)

  116.258 μs (496 allocations: 12.70 KiB)
  86.982 μs (402 allocations: 10.12 KiB)


1000-element CuArray{Float32, 1, CUDA.DeviceMemory}:
 -0.61299276
 -0.26239848
  0.5904165
 -0.69782037
  0.3015013
  0.7228992
  0.40448335
 -0.3468381
 -0.74215394
 -0.7447526
 -0.4573735
  0.6172285
 -0.32244894
  ⋮
 -0.41026768
 -0.014729745
  0.73839396
  0.17618671
 -0.17330842
  0.12084352
  0.37240428
 -0.31934983
 -0.68752736
  0.11805056
  0.7131002
 -0.78841186

In [27]:
# Element type and dimensionality of the source array.
T = Float32
N = 3

# Random N-dimensional source data with 10 points along every dimension.
src = rand(T, ntuple(_ -> 10, N))

# Ten random N-tuples of (fractional) indices at which to sample the source.
idx = [Tuple(rand(1:0.1:10, N)) for _ in 1:10]

# Move the data and the sample points to the GPU.
gpu_src = CuArray(src)
gpu_idx = CuArray(idx)

println(size(gpu_src))

# Repack the data as a texture array for optimized fetching
# (required for N=1, optional for N=2 and N=3).
gpu_src = CuTextureArray(gpu_src)

# Sample the texture with nearest-neighbour lookup, one value per index tuple.
gpu_dst = similar(gpu_idx, T)
gpu_tex = CuTexture(gpu_src; interpolation=CUDA.NearestNeighbour())
broadcast!((coords, tex) -> tex[coords...], gpu_dst, gpu_idx, Ref(gpu_tex))

# Copy the sampled values back to host memory.
dst = Array(gpu_dst)

(10, 10, 10)


10-element Vector{Float32}:
 0.31153327
 0.79749954
 0.18864238
 0.8577519
 0.41945142
 0.6596827
 0.23358303
 0.22312123
 0.7977649
 0.5681535