In [1]:
# Choose a backend:
# :CUDA, :AMD, or :CPU

# If you have no GPU you can still follow along using the :CPU backend.
# You might want to install a kernel with threads enabled:
# `IJulia.installkernel("Julia 1.6.2 Threads", "--threads=auto")` and restart
# this notebook with that kernel.

const BACKEND = :CUDA

:CUDA

In [2]:
using Pkg

# Each backend has its own project environment directory, named "<BACKEND>Env"
# (e.g. "CUDAEnv"), resolved relative to the current working directory.
Pkg.activate("$(BACKEND)Env")

# First-time setup: uncomment the lines below to install the packages.
# pkg"add KernelAbstractions, Adapt, Enzyme@0.6.5"
# pkg"add Enzyme"
# pkg"add https://github.com/JuliaGPU/KernelAbstractions.jl#vc/KernelGradients:lib/KernelGradients"

# Backend-specific packages:
# if BACKEND == :CUDA
#     pkg"add CUDAKernels, CUDA"
# elseif BACKEND == :AMD
#     pkg"add ROCMKernels, AMDGPU"
# end

[32m[1m  Activating[22m[39m environment at `~/juliacon21-gpu_workshop/sneak_peek/CUDAEnv/Project.toml`


In [3]:
using KernelAbstractions, KernelGradients, Adapt, Enzyme

In [4]:
# Bind `ArrayT` (array type used to move data with `adapt`) and `Device`
# (KernelAbstractions device constructor) according to BACKEND.
if BACKEND === :CUDA
    using CUDA, CUDAKernels
    const ArrayT = CuArray
    const Device = CUDADevice
elseif BACKEND === :AMD
    using AMDGPU, ROCMKernels
    # Use the AMD array/device types here: CuArray and CUDADevice come from the
    # CUDA packages, which this branch does not load.
    const ArrayT = ROCArray
    const Device = ROCDevice
elseif BACKEND === :CPU
    const ArrayT = Array
    const Device = CPU
else
    # A bare `else BACKEND == :CPU` would evaluate and discard the comparison,
    # silently running every unrecognised backend on the CPU. Fail loudly instead.
    throw(ArgumentError(
        "unsupported BACKEND $(repr(BACKEND)); expected :CUDA, :AMD, or :CPU"))
end

CUDADevice

In [5]:
# Naive matrix multiply: each work-item computes one entry `out[row, col]`
# as the dot product of row `row` of `a` with column `col` of `b`.
# The launch `ndrange` is expected to match the index space of `out`;
# no bounds checks are performed.
@kernel function matmul_kernel!(out, a, b)
    row, col = @index(Global, NTuple)

    # Accumulate in the output element type so the accumulator never changes type.
    inner = size(a, 2)
    acc = zero(eltype(out))
    @inbounds for k in 1:inner
        acc += a[row, k] * b[k, col]
    end

    @inbounds out[row, col] = acc
end

matmul_kernel! (generic function with 5 methods)

In [6]:
# Inputs (64×128 and 128×32, random) and the 64×32 output buffer, created on
# the host and converted to the backend's array type with `adapt`.
a = adapt(ArrayT, rand(Float64, 64, 128))
b = adapt(ArrayT, rand(Float64, 128, 32))
c = adapt(ArrayT, zeros(Float64, 64, 32))
nothing  # suppress the cell output

In [7]:
# @ka_code_typed matmul_kernel!(Device())(c, a, b, ndrange=size(c))

In [8]:
# Instantiate the kernel for the chosen device, launch one work-item per entry
# of `c`, and block until the returned event has completed.
kernel! = matmul_kernel!(Device())
event = kernel!(c, a, b; ndrange=size(c))
wait(event)

In [9]:
# Compare the kernel result against the backend's own matrix product.
isapprox(c, a * b)

true

In [10]:
# Wrap the instantiated kernel with `autodiff` to obtain a gradient kernel.
# It is launched like the original kernel, but its arguments are passed with
# activity annotations such as `Duplicated(value, shadow)`.
matmul_adjoint = autodiff(matmul_kernel!(Device()))

KernelAbstractions.Kernel{CUDADevice, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, KernelGradients.var"#df#3"{KernelAbstractions.Kernel{CUDADevice, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, typeof(gpu_matmul_kernel!)}}}(KernelGradients.var"#df#3"{KernelAbstractions.Kernel{CUDADevice, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, typeof(gpu_matmul_kernel!)}}(KernelAbstractions.Kernel{CUDADevice, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, typeof(gpu_matmul_kernel!)}(gpu_matmul_kernel!)))

In [11]:
# Reset the output buffer before the differentiated launch.
fill!(c, 0)

# Seed for the output: all ones. A copy is kept for the checks that follow
# (presumably because the adjoint launch modifies `dc` — confirm).
dc = fill!(similar(c), 1)
copy_dc = copy(dc)

# Shadow buffers for the inputs, starting at zero.
da, db = zero(a), zero(b)
nothing  # suppress the cell output

In [12]:
# Pair every argument with its shadow buffer, launch the gradient kernel over
# the same index space as the forward kernel, and wait for it to finish.
dup_c = Duplicated(c, dc)
dup_a = Duplicated(a, da)
dup_b = Duplicated(b, db)
adjoint_event = matmul_adjoint(dup_c, dup_a, dup_b; ndrange=size(c))
wait(adjoint_event)

In [13]:
# For c = a*b with output seed dc, the gradient with respect to `a` is dc * b'.
isapprox(da, copy_dc * adjoint(b))

true

In [14]:
# For c = a*b with output seed dc, the gradient with respect to `b` is a' * dc.
isapprox(db, adjoint(a) * copy_dc)

true