In [2]:
using FourierFlows
using CUDA
grid = TwoDGrid(Lx=2π, nx=128)

TwoDimensionalGrid
  ├───────────────── Device: CPU
  ├────────────── FloatType: Float64
  ├────────── size (Lx, Ly): (6.283185307179586, 6.283185307179586)
  ├──── resolution (nx, ny): (128, 128)
  ├── grid spacing (dx, dy): (0.04908738521234052, 0.04908738521234052)
  ├───────────────── domain: x ∈ [-3.141592653589793, 3.0925052683774528]
  |                          y ∈ [-3.141592653589793, 3.0925052683774528]
  └─ aliased fraction: 0.3333333333333333

In [35]:
using StaticArrays
function expMatrixSA(Nk, Nl)
    dt = 0.1
    ic(k, l) = @SVector rand(3)
    sol = cu(ic.(1:Nk, (1:Nl)'))
    Lop(k, l) = SMatrix{3,3}([0 1 1im*k; -1 0 1im*l; -1im*k -1im*l 0])
    expLdt = exp.(cu(dt * Lop.(1:Nk, (1:Nl)')))
    return expLdt, sol
end

function expMatrixMul1(A, sol)
    return A * sol
end

function expMatrixMul2(A, sol)
    return @views sum(A .* sol, dims=4)[:, :, :, 1]
end

function expMatrixBanded(Nk, Nl)
    dt = 0.1
    Nk = 5
    Nl = 5
    sol = CUDA.rand(Nk, Nl, 3)
    Lop(k, l) = [0 1 1im*k; -1 0 1im*l; -1im*k -1im*l 0]
    A = zeros(Complex{Float64}, Nk, Nl, 3, 3)
    for i=1:Nk
        for j=1:Nl
            A[i, j, :, :] .= exp(dt * Lop(i, j))
        end
    end
    expdtL = cu(A)
    return expdtL, sol
end

N = 512
A, x = expMatrixSA(N, N)
B, y = expMatrixBanded(N, N)
using BenchmarkTools

In [36]:
@benchmark CUDA.@sync expMatrixMul1(A, x)

BenchmarkTools.Trial: 235 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m20.945 ms[22m[39m … [35m 23.078 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m21.270 ms               [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m21.271 ms[22m[39m ± [32m133.221 μs[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m0.00% ± 0.00%

  [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▄[39m▆[34m█[39m[39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m▃[39m▂[39m▃[39m▃[39m▁[39m▁

In [37]:
@benchmark CUDA.@sync  expMatrixMul2(B, y)

BenchmarkTools.Trial: 10000 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m50.948 μs[22m[39m … [35m971.538 μs[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m52.708 μs               [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m53.391 μs[22m[39m ± [32m 12.167 μs[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m0.00% ± 0.00%

  [39m [39m [39m [39m [39m [39m [39m [39m▂[39m▃[39m▆[39m▇[39m▇[39m█[39m▇[39m▆[34m▄[39m[39m▂[39m▁[39m [39m [39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m▁[39m▁[39m▂[39m▂[39m▄

In [21]:

#cat(eachslice(result, dims=1); dims=3)
#expLdt = exp.(dt * Lop.(1:Nk, (1:Nl)'))
#exp2Ldt = exp.(2 * dt * cu(Lop.(1:Nk, (1:Nl)')))
#result = expLdt * sol
#return getindex.(result, 2)

5×5×3×1 CuArray{ComplexF32, 4, CUDA.DeviceMemory}:
[:, :, 1, 1] =
 0.107478+0.0102278im    1.07586+0.107229im    …   0.16392+0.0188252im
 0.982304+0.180058im   0.0299058+0.00558969im     0.142103+0.0285178im
  1.06252+0.2843im      0.395983+0.106748im       0.258749+0.0724093im
 0.850911+0.29511im      1.06709+0.369927im        1.08697+0.382423im
 0.843941+0.354623im     0.29104+0.121515im       0.636366+0.265032im

[:, :, 2, 1] =
 0.114782+0.0120717im   0.708698+0.150507im   …  0.238262+0.117779im
 0.217744+0.0217021im   0.475367+0.0978605im     0.579212+0.278117im
 0.288355+0.0272679im   0.145246+0.0290777im     0.487837+0.228379im
 0.127684+0.0114689im    0.37884+0.073994im       0.53422+0.244712im
 0.262353+0.022409im   0.0524999+0.0100369im     0.501206+0.225433im

[:, :, 3, 1] =
  0.168779-0.0334771im  0.774097-0.224272im   …   1.12612-0.601366im
 0.0715153-0.0214194im  0.252472-0.0982192im     0.269443-0.168405im
  0.652264-0.258392im   0.727531-0.351671im       1.08413-0.770101

In [45]:
println(A[1,1])
println(exp(Lop(1,1)))
println(exp.(Lop(1,1)))

ComplexF64[1.0 + 0.0im 1.7182818284590455 + 0.0im 0.0 + 1.7182818284590453im; -0.6321205588285577 + 0.0im 1.0 + 0.0im 0.0 + 0.6321205588285577im; 0.0 - 0.6321205588285578im 0.0 - 1.7182818284590453im 2.0861612696304874 + 0.0im]
ComplexF64[1.0 + 0.0im 1.7182818284590455 + 0.0im 0.0 + 1.7182818284590453im; -0.6321205588285577 + 0.0im 1.0 + 0.0im 0.0 + 0.6321205588285577im; 0.0 - 0.6321205588285578im 0.0 - 1.7182818284590453im 2.0861612696304874 + 0.0im]
ComplexF64[1.0 + 0.0im 2.718281828459045 + 0.0im 0.5403023058681398 + 0.8414709848078965im; 0.36787944117144233 + 0.0im 1.0 + 0.0im 0.5403023058681398 + 0.8414709848078965im; 0.5403023058681398 - 0.8414709848078965im 0.5403023058681398 - 0.8414709848078965im 1.0 + 0.0im]


In [31]:
SMatrix{2,2}(1,2,3,4)

2×2 SMatrix{2, 2, Int64, 4} with indices SOneTo(2)×SOneTo(2):
 1  3
 2  4

In [191]:
Nx = 257
Ny = 512
m = 3
A = CUDA.rand(Float32, Nx, Ny, m, m) + 1im * CUDA.rand(Float32, Nx, Ny, m, m)
B = CUDA.zeros(Complex{Float32}, Nx, Ny, m, m)
#x = CUDA.ones(4, 4, 2)
function kernel_exp(A, B, Nx, Ny, dt)
    i = blockDim().x * (blockIdx().x - 1) + threadIdx().x
    j = blockDim().y * (blockIdx().y - 1) + threadIdx().y
    if i > Nx || j > Ny
        return
    end
    @inbounds Lkj = SMatrix{3, 3}(@view(A[i, j, :, :]))
    @inbounds @view(B[i, j, :, :]) .= CUDA.exp(dt * Lkj)
    return
end

config_kernel = @cuda launch=false kernel_exp(A, B)
max_threads = CUDA.maxthreads(config_kernel)
println(max_threads)

thread_size = 2^(floor(Int, log2(max_threads)/2))
num_threads_x = min(thread_size, Nx)
num_threads_y = min(thread_size, Ny)
num_blocks_x = cld(Nx, num_threads_x)
num_blocks_y = cld(Ny, num_threads_y)
println(num_threads_x, " ", num_threads_y)
println(num_blocks_x, " ", num_blocks_y)
println(num_blocks_x * num_threads_x, " ", num_blocks_y * num_threads_y)
@cuda threads=(num_threads_x, num_threads_y) blocks=(num_blocks_x, num_blocks_y) kernel_exp(A, B, Nx, Ny, 0.01f0)

256
16 16
17 32
272 512


CUDA.HostKernel for kernel_exp(CuDeviceArray{ComplexF32, 4, 1}, CuDeviceArray{ComplexF32, 4, 1}, Int64, Int64, Float32)

In [190]:
log2(2^6)

6.0

In [156]:
20^2/32

12.5

In [157]:
cld(floor(Int, sqrt(640))^2,)

10

In [41]:
using StaticArrays

A = @SVector rand(3)

3-element SVector{3, Float64} with indices SOneTo(3):
 0.1548346425381577
 0.535549285243215
 0.20345566198433562

In [16]:

A[1, 1, :, :] .= [1 1; 0 1]
A[2, 1, :, :] .= [2 1; 0 1]
x[1, 1, :] = [1; 2]
x[2, 1, :] = [2; 1]
y = sum(A .* x, dims=4)[:,:,:,1]
y[1, 1, :]

2-element Vector{Float64}:
 2.0
 2.0

In [38]:
[1 1; 0 1] * [1; 2]

2-element Vector{Int64}:
 3
 2

In [74]:
using CUDA
using StaticArrays

function mv_mul_kernel(y, A, x, Nx, Ny)
    i = blockDim().x * (blockIdx().x - 1) + threadIdx().x
    j = blockDim().y * (blockIdx().y - 1) + threadIdx().y
    if i > Nx || j > Ny
        return
    end
    @inbounds Lkj = SMatrix{3, 3}(@view(A[i, j, :, :]))
    @inbounds xkj = SVector{3}(@view(x[i, j, :]))
    @inbounds @view(y[i, j, :]) .= Lkj * xkj
    return
end

function mvmul!(y, A, x)
    y .= dropdims(sum(permutedims(A, [1, 2, 4, 3]) .* x, dims=3), dims=3)
end

Nx = 257
Ny = 512
m = 3
A = CUDA.rand(Complex{Float32}, Nx, Ny, m, m)
x = CUDA.rand(Complex{Float32}, Nx, Ny, m)
orig_x = deepcopy(x)
#y = similar(x)

config_kernel = @cuda launch=false mv_mul_kernel(x, A, x, Nx, Ny)
max_threads = CUDA.maxthreads(config_kernel)
thread_size = 2^(floor(Int, log2(max_threads)/2))
num_threads_x = min(thread_size, Nx)
num_threads_y = min(thread_size, Ny)
num_blocks_x = cld(Nx, num_threads_x)
num_blocks_y = cld(Ny, num_threads_y)
@CUDA.time CUDA.@sync begin
    @cuda threads=(num_threads_x, num_threads_y) blocks=(num_blocks_x, num_blocks_y) mv_mul_kernel(x, A, x, Nx, Ny)
end

#@CUDA.time @sync mvmul!(y, A, x)
#@CUDA.profile mvmul!(y, A, x)
A[1, 1, :, :] * orig_x[1, 1, :] ≈ x[1, 1, :]

  0.000147 seconds (47 CPU allocations: 1.344 KiB)


true

In [96]:
function Lop_kernel(result, k, l, Nx, Ny, D, f, Cg2)
    i = blockDim().x * (blockIdx().x - 1) + threadIdx().x
    j = blockDim().y * (blockIdx().y - 1) + threadIdx().y
    if i > Nx || j > Ny
        return
    end
    result[i, j, 1, 1] = -D[i,j]
    result[i, j, 1, 2] =  f
    result[i, j, 1, 3] = -1im*k[i]*Cg2

    result[i, j, 2, 1] = -f
    result[i, j, 2, 2] = -D[i,j]
    result[i, j, 2, 3] = -1im*l[j]*Cg2

    result[i, j, 3, 1] = -1im*k[i]
    result[i, j, 3, 2] = -1im*l[j]
    result[i, j, 3, 3] = -D[i,j]
    return
end

function populate_L!(L, grid, params, dev)
    D = @. - params.ν * grid.Krsq^(params.nν)
    
    config_kernel = @cuda launch=false Lop_kernel(L, grid.kr, grid.l, grid.nkr, grid.nl, D, params.f, params.Cg2)
    max_threads = CUDA.maxthreads(config_kernel)
    thread_size = 2^(floor(Int, log2(max_threads)/2))
    num_threads_x = min(thread_size, grid.nkr)
    num_threads_y = min(thread_size, grid.nl)
    num_blocks_x = cld(grid.nkr, num_threads_x)
    num_blocks_y = cld(grid.nl, num_threads_y)
    CUDA.@sync begin
        @cuda threads=(num_threads_x, num_threads_y) blocks=(num_blocks_x, num_blocks_y) Lop_kernel(L, grid.kr, grid.l, grid.nkr, grid.nl, D, params.f, params.Cg2)
    end
end

using FourierFlows
dev = GPU()
grid = TwoDGrid(dev, nx=128, Lx=2π)
L = zeros(dev, Complex{Float32}, (grid.nkr, grid.nl, 3, 3))
params = (f = 1.0f0, Cg2=1.0f0, ν=1.0f0, nν=4)
populate_L!(L, grid, params, dev)

CUDA.HostKernel for Lop_kernel(CuDeviceArray{ComplexF32, 4, 1}, CuDeviceMatrix{Float64, 1}, CuDeviceMatrix{Float64, 1}, Int64, Int64, CuDeviceMatrix{Float64, 1}, Float32, Float32)