Skip to content

Commit

Permalink
Merge pull request #107 from omlins/lr/hip
Browse files Browse the repository at this point in the history
Add AMDGPU v0.5 support
  • Loading branch information
omlins committed Jul 22, 2023
2 parents a76bb75 + c921de9 commit 4a23081
Show file tree
Hide file tree
Showing 15 changed files with 165 additions and 185 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ jobs:
fail-fast: false
matrix:
version:
- '1.8' # Minimum required Julia version (due to CellArrays' AMDGPU dependency 1.7 and due to Enzyme 1.8).
# - '1.8' # Minimum required Julia version (due to CellArrays' AMDGPU dependency 1.7 and due to Enzyme 1.8).
- '1' # Latest stable 1.x release of Julia
# - 'nightly'
os:
Expand Down
4 changes: 2 additions & 2 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "ParallelStencil"
uuid = "94395366-693c-11ea-3b26-d9b7aac5d958"
authors = ["Samuel Omlin", "Ludovic Räss"]
version = "0.8.0"
version = "0.8.1"

[deps]
AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
Expand All @@ -13,7 +13,7 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"

[compat]
AMDGPU = "0.4.14"
AMDGPU = "0.5"
CUDA = "3.12, 4"
CellArrays = "0.1"
Enzyme = "0.11"
Expand Down
14 changes: 7 additions & 7 deletions src/ParallelKernel/allocators.jl
Original file line number Diff line number Diff line change
Expand Up @@ -233,13 +233,13 @@ macro falses_cuda(args...) check_initialized(); esc(_falses(args...; package=
macro trues_cuda(args...) check_initialized(); esc(_trues(args...; package=PKG_CUDA)); end
macro fill_cuda(args...) check_initialized(); esc(_fill(args...; package=PKG_CUDA)); end
macro fill!_cuda(args...) check_initialized(); esc(_fill!(args...; package=PKG_CUDA)); end
macro zeros_amdgpu(args...) check_initialized(); esc(_zeros(args...; package=PKG_AMDGPU)); end
macro ones_amdgpu(args...) check_initialized(); esc(_ones(args...; package=PKG_AMDGPU)); end
macro rand_amdgpu(args...) check_initialized(); esc(_rand(args...; package=PKG_AMDGPU)); end
macro falses_amdgpu(args...) check_initialized(); esc(_falses(args...; package=PKG_AMDGPU)); end
macro trues_amdgpu(args...) check_initialized(); esc(_trues(args...; package=PKG_AMDGPU)); end
macro fill_amdgpu(args...) check_initialized(); esc(_fill(args...; package=PKG_AMDGPU)); end
macro fill!_amdgpu(args...) check_initialized(); esc(_fill!(args...; package=PKG_AMDGPU)); end
macro zeros_amdgpu(args...) check_initialized(); esc(_zeros(args...; package=PKG_AMDGPU)); end
macro ones_amdgpu(args...) check_initialized(); esc(_ones(args...; package=PKG_AMDGPU)); end
macro rand_amdgpu(args...) check_initialized(); esc(_rand(args...; package=PKG_AMDGPU)); end
macro falses_amdgpu(args...) check_initialized(); esc(_falses(args...; package=PKG_AMDGPU)); end
macro trues_amdgpu(args...) check_initialized(); esc(_trues(args...; package=PKG_AMDGPU)); end
macro fill_amdgpu(args...) check_initialized(); esc(_fill(args...; package=PKG_AMDGPU)); end
macro fill!_amdgpu(args...) check_initialized(); esc(_fill!(args...; package=PKG_AMDGPU)); end
macro zeros_threads(args...) check_initialized(); esc(_zeros(args...; package=PKG_THREADS)); end
macro ones_threads(args...) check_initialized(); esc(_ones(args...; package=PKG_THREADS)); end
macro rand_threads(args...) check_initialized(); esc(_rand(args...; package=PKG_THREADS)); end
Expand Down
4 changes: 2 additions & 2 deletions src/ParallelKernel/kernel_language.jl
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ end

function gridDim(args...; package::Symbol=get_package())
if (package == PKG_CUDA) return :(CUDA.gridDim($(args...)))
elseif (package == PKG_AMDGPU) return :(AMDGPU.gridDimWG($(args...)))
elseif (package == PKG_AMDGPU) return :(AMDGPU.gridGroupDim($(args...)))
elseif (package == PKG_THREADS) return :(ParallelStencil.ParallelKernel.@gridDim_cpu($(args...)))
else @KeywordArgumentError("$ERRMSG_UNSUPPORTED_PACKAGE (obtained: $package).")
end
Expand Down Expand Up @@ -170,7 +170,7 @@ end

function pk_println(args...; package::Symbol=get_package())
if (package == PKG_CUDA) return :(CUDA.@cuprintln($(args...)))
elseif (package == PKG_AMDGPU) @KeywordArgumentError("this functionality is not yet supported in AMDGPU.jl.")
elseif (package == PKG_AMDGPU) return :(AMDGPU.@rocprintln($(args...)))
elseif (package == PKG_THREADS) return :(Base.println($(args...)))
else @KeywordArgumentError("$ERRMSG_UNSUPPORTED_PACKAGE (obtained: $package).")
end
Expand Down
24 changes: 12 additions & 12 deletions src/ParallelKernel/parallel.jl
Original file line number Diff line number Diff line change
Expand Up @@ -83,16 +83,16 @@ macro synchronize(args...) check_initialized(); esc(synchronize(args...)); end
## MACROS FORCING PACKAGE, IGNORING INITIALIZATION

macro parallel_cuda(args...) check_initialized(); checkargs_parallel(args...); esc(parallel(__module__, args...; package=PKG_CUDA)); end
macro parallel_amdgpu(args...) check_initialized(); checkargs_parallel(args...); esc(parallel(__module__, args...; package=PKG_AMDGPU)); end
macro parallel_amdgpu(args...) check_initialized(); checkargs_parallel(args...); esc(parallel(__module__, args...; package=PKG_AMDGPU)); end
macro parallel_threads(args...) check_initialized(); checkargs_parallel(args...); esc(parallel(__module__, args...; package=PKG_THREADS)); end
macro parallel_indices_cuda(args...) check_initialized(); checkargs_parallel_indices(args...); esc(parallel_indices(args...; package=PKG_CUDA)); end
macro parallel_indices_amdgpu(args...) check_initialized(); checkargs_parallel_indices(args...); esc(parallel_indices(args...; package=PKG_AMDGPU)); end
macro parallel_indices_amdgpu(args...) check_initialized(); checkargs_parallel_indices(args...); esc(parallel_indices(args...; package=PKG_AMDGPU)); end
macro parallel_indices_threads(args...) check_initialized(); checkargs_parallel_indices(args...); esc(parallel_indices(args...; package=PKG_THREADS)); end
macro parallel_async_cuda(args...) check_initialized(); checkargs_parallel(args...); esc(parallel_async(__module__, args...; package=PKG_CUDA)); end
macro parallel_async_amdgpu(args...) check_initialized(); checkargs_parallel(args...); esc(parallel_async(__module__, args...; package=PKG_AMDGPU)); end
macro parallel_async_amdgpu(args...) check_initialized(); checkargs_parallel(args...); esc(parallel_async(__module__, args...; package=PKG_AMDGPU)); end
macro parallel_async_threads(args...) check_initialized(); checkargs_parallel(args...); esc(parallel_async(__module__, args...; package=PKG_THREADS)); end
macro synchronize_cuda(args...) check_initialized(); esc(synchronize(args...; package=PKG_CUDA)); end
macro synchronize_amdgpu(args...) check_initialized(); esc(synchronize(args...; package=PKG_AMDGPU)); end
macro synchronize_amdgpu(args...) check_initialized(); esc(synchronize(args...; package=PKG_AMDGPU)); end
macro synchronize_threads(args...) check_initialized(); esc(synchronize(args...; package=PKG_THREADS)); end


Expand Down Expand Up @@ -158,11 +158,11 @@ function parallel_kernel(package::Symbol, numbertype::DataType, indices::Union{S
body = get_body(kernel)
body = remove_return(body)
if isgpu(package)
kernel = substitute(kernel, :(Data.Array), :(Data.DeviceArray))
kernel = substitute(kernel, :(Data.Cell), :(Data.DeviceCell))
kernel = substitute(kernel, :(Data.Array), :(Data.DeviceArray))
kernel = substitute(kernel, :(Data.Cell), :(Data.DeviceCell))
kernel = substitute(kernel, :(Data.CellArray), :(Data.DeviceCellArray))
kernel = substitute(kernel, :(Data.TArray), :(Data.DeviceTArray))
kernel = substitute(kernel, :(Data.TCell), :(Data.DeviceTCell))
kernel = substitute(kernel, :(Data.TArray), :(Data.DeviceTArray))
kernel = substitute(kernel, :(Data.TCell), :(Data.DeviceTCell))
kernel = substitute(kernel, :(Data.TCellArray), :(Data.DeviceTCellArray))
end
kernel = push_to_signature!(kernel, :($RANGES_VARNAME::$RANGES_TYPE))
Expand Down Expand Up @@ -297,7 +297,7 @@ end
## @SYNCHRONIZE FUNCTIONS

synchronize_cuda(args::Union{Symbol,Expr}...) = :(CUDA.synchronize($(args...)))
synchronize_amdgpu(args::Union{Symbol,Expr}...) = :(ParallelStencil.ParallelKernel.synchronize_rocstream($(args...))) #TODO: this supports currently only stream synchronization. Whole GPU synchronization (all streams) should also be supported.
synchronize_amdgpu(args::Union{Symbol,Expr}...) = :(AMDGPU.synchronize($(args...)))
synchronize_threads(args::Union{Symbol,Expr}...) = :(begin end)


Expand Down Expand Up @@ -518,13 +518,13 @@ function create_gpu_call(package::Symbol, nblocks::Union{Symbol,Expr}, nthreads:
if launch
if !isnothing(shmem)
if (package == PKG_CUDA) shmem_expr = :(shmem = $shmem)
elseif (package == PKG_AMDGPU) shmem_expr = :(localmem = $shmem)
elseif (package == PKG_AMDGPU) shmem_expr = :(shmem = $shmem)
else @ModuleInternalError("unsupported GPU package (obtained: $package).")
end
backend_kwargs_expr = (backend_kwargs_expr..., shmem_expr)
end
if (package == PKG_CUDA) return :( CUDA.@cuda blocks=$nblocks threads=$nthreads stream=$stream $(backend_kwargs_expr...) $kernelcall; $synccall )
elseif (package == PKG_AMDGPU) return :( ParallelStencil.ParallelKernel.push_signal!($stream, AMDGPU.@roc gridsize=($nblocks .* $nthreads) groupsize=$nthreads $(backend_kwargs_expr...) queue=$stream.queue $kernelcall); $synccall )
elseif (package == PKG_AMDGPU) return :( AMDGPU.@roc gridsize=$nblocks groupsize=$nthreads stream=$stream $(backend_kwargs_expr...) $kernelcall; $synccall )
else @ModuleInternalError("unsupported GPU package (obtained: $package).")
end
else
Expand All @@ -544,7 +544,7 @@ end

function default_stream(package)
if (package == PKG_CUDA) return :(CUDA.stream()) # Use the default stream of the task.
elseif (package == PKG_AMDGPU) return :(ParallelStencil.ParallelKernel.get_default_rocstream())
elseif (package == PKG_AMDGPU) return :(AMDGPU.HIPStream()) # Use the default stream of the task.
else @ModuleInternalError("unsupported GPU package (obtained: $package).")
end
end
42 changes: 6 additions & 36 deletions src/ParallelKernel/shared.jl
Original file line number Diff line number Diff line change
Expand Up @@ -103,50 +103,20 @@ macro rangelengths() esc(:(($(RANGELENGTHS_VARNAMES...),))) end
end

@static if ENABLE_AMDGPU
## Stream implementation for AMDGPU. It is the responsibility of the package developers to keep the ROCStreams consistent by pushing each signal received from a kernel launch on queue=stream.queue to the ROCStream using push_signal!. If ROCQueues are to be exposed to the users, then a macro should be implemented to automatize this (e.g. overwrite @roc to accept the kwarg stream...).
mutable struct ROCStream
queue::AMDGPU.ROCQueue
last_signal::Union{Nothing, AMDGPU.ROCKernelSignal}

function ROCStream(device::ROCDevice; priority::Union{Nothing,Symbol}=nothing)
queue = ROCQueue(device; priority=priority)
new(queue, nothing)
end
function ROCStream(queue::ROCQueue)
new(queue, nothing)
end
end

function push_signal!(stream::ROCStream, signal::AMDGPU.ROCKernelSignal)
AMDGPU.barrier_and!(stream.queue, [signal])
stream.last_signal = signal
end

function synchronize_rocstream(stream::ROCStream)
AMDGPU.wait(stream.last_signal)
end

let
global get_priority_rocstream, get_rocstream, get_default_rocstream
priority_rocstreams = Array{ROCStream}(undef, 0)
rocstreams = Array{ROCStream}(undef, 0)
default_rocstreams = Array{ROCStream}(undef, 0)
global get_priority_rocstream, get_rocstream
priority_rocstreams = Array{AMDGPU.HIPStream}(undef, 0)
rocstreams = Array{AMDGPU.HIPStream}(undef, 0)

function get_priority_rocstream(id::Integer)
while (id > length(priority_rocstreams)) push!(priority_rocstreams, ROCStream(AMDGPU.default_device(); priority=:high)) end # :high is max priority.
while (id > length(priority_rocstreams)) push!(priority_rocstreams, AMDGPU.HIPStream(:high)) end
return priority_rocstreams[id]
end

#TODO: check if set priority to normal!
function get_rocstream(id::Integer)
while (id > length(rocstreams)) push!(rocstreams, ROCStream(AMDGPU.default_device(); priority=:low)) end # :low min priority.
while (id > length(rocstreams)) push!(rocstreams, AMDGPU.HIPStream(:low)) end
return rocstreams[id]
end

function get_default_rocstream()
if (length(default_rocstreams)==0) push!(default_rocstreams, ROCStream(AMDGPU.default_queue())) end # NOTE: this implementation is extensible to multiple defaults as available in CUDA for streams.
return default_rocstreams[1]
end
end
end

Expand Down Expand Up @@ -267,7 +237,7 @@ function split_parallel_args(args; is_call=true)
posargs, kwargs = split_args(args[1:end-1])
kernelarg = args[end]
if (is_call && any([x.args[1] in [:blocks, :threads] for x in kwargs])) @KeywordArgumentError("Invalid keyword argument in @parallel <kernelcall>: blocks / threads. They must be passed as positional arguments or been omited.") end
if (is_call && any([x.args[1] in [:groupsize, :gridsize, :queue] for x in kwargs])) @KeywordArgumentError("Invalid keyword argument in @parallel <kernelcall>: groupsize / gridsize / queue. CUDA nomenclature and concepts are to be used for @parallel calls (and kernels).") end
if (is_call && any([x.args[1] in [:groupsize, :gridsize] for x in kwargs])) @KeywordArgumentError("Invalid keyword argument in @parallel <kernelcall>: groupsize / gridsize. CUDA nomenclature and concepts are to be used for @parallel calls (and kernels).") end
return posargs, kwargs, kernelarg
end

Expand Down
12 changes: 6 additions & 6 deletions src/parallel.jl
Original file line number Diff line number Diff line change
Expand Up @@ -67,14 +67,14 @@ macro parallel_async(args...) check_initialized(); checkargs_parallel(args...);

## MACROS FORCING PACKAGE, IGNORING INITIALIZATION

macro parallel_cuda(args...) check_initialized(); checkargs_parallel(args...); esc(parallel(__source__, __module__, args...; package=PKG_CUDA)); end
macro parallel_amdgpu(args...) check_initialized(); checkargs_parallel(args...); esc(parallel(__source__, __module__, args...; package=PKG_AMDGPU)); end
macro parallel_threads(args...) check_initialized(); checkargs_parallel(args...); esc(parallel(__source__, __module__, args...; package=PKG_THREADS)); end
macro parallel_cuda(args...) check_initialized(); checkargs_parallel(args...); esc(parallel(__source__, __module__, args...; package=PKG_CUDA)); end
macro parallel_amdgpu(args...) check_initialized(); checkargs_parallel(args...); esc(parallel(__source__, __module__, args...; package=PKG_AMDGPU)); end
macro parallel_threads(args...) check_initialized(); checkargs_parallel(args...); esc(parallel(__source__, __module__, args...; package=PKG_THREADS)); end
macro parallel_indices_cuda(args...) check_initialized(); checkargs_parallel_indices(args...); esc(parallel_indices(__source__, __module__, args...; package=PKG_CUDA)); end
macro parallel_indices_amdgpu(args...) check_initialized(); checkargs_parallel_indices(args...); esc(parallel_indices(__source__, __module__, args...; package=PKG_AMDGPU)); end
macro parallel_indices_amdgpu(args...) check_initialized(); checkargs_parallel_indices(args...); esc(parallel_indices(__source__, __module__, args...; package=PKG_AMDGPU)); end
macro parallel_indices_threads(args...) check_initialized(); checkargs_parallel_indices(args...); esc(parallel_indices(__source__, __module__, args...; package=PKG_THREADS)); end
macro parallel_async_cuda(args...) check_initialized(); checkargs_parallel(args...); esc(parallel_async(__source__, __module__, args...; package=PKG_CUDA)); end
macro parallel_async_amdgpu(args...) check_initialized(); checkargs_parallel(args...); esc(parallel_async(__source__, __module__, args...; package=PKG_AMDGPU)); end
macro parallel_async_amdgpu(args...) check_initialized(); checkargs_parallel(args...); esc(parallel_async(__source__, __module__, args...; package=PKG_AMDGPU)); end
macro parallel_async_threads(args...) check_initialized(); checkargs_parallel(args...); esc(parallel_async(__source__, __module__, args...; package=PKG_THREADS)); end


Expand Down Expand Up @@ -268,7 +268,7 @@ end
function parallel_call_memopt(caller::Module, kernelcall::Expr, backend_kwargs_expr::Array, async::Bool; memopt::Bool=false, configcall::Expr=kernelcall)
metadata_call = create_metadata_call(configcall)
metadata_module = metadata_call
loopdim = :($(metadata_module).loopdim)
loopdim = :($(metadata_module).loopdim)
is_parallel_kernel = :($(metadata_module).is_parallel_kernel)
ranges = :( ($is_parallel_kernel) ? ParallelStencil.get_ranges_memopt($loopdim, $(configcall.args[2:end]...)) : ParallelStencil.ParallelKernel.get_ranges($(configcall.args[2:end]...)))
parallel_call_memopt(caller, ranges, kernelcall, backend_kwargs_expr, async; memopt=memopt, configcall=configcall)
Expand Down
6 changes: 3 additions & 3 deletions test/ParallelKernel/test_kernel_language.jl
Original file line number Diff line number Diff line change
Expand Up @@ -32,14 +32,14 @@ end
# @test @prettystring(1, @pk_show()) == "CUDA.@cushow"
# @test @prettystring(1, @pk_println()) == "CUDA.@cuprintln"
elseif $package == $AMDGPU
@test @prettystring(1, @gridDim()) == "AMDGPU.gridDimWG()"
@test @prettystring(1, @gridDim()) == "AMDGPU.gridGroupDim()"
@test @prettystring(1, @blockIdx()) == "AMDGPU.workgroupIdx()"
@test @prettystring(1, @blockDim()) == "AMDGPU.workgroupDim()"
@test @prettystring(1, @threadIdx()) == "AMDGPU.workitemIdx()"
@test @prettystring(1, @sync_threads()) == "AMDGPU.sync_workgroup()"
#@test @prettystring(1, @sharedMem(Float32, (2,3))) == "" #TODO: not yet supported for AMDGPU
# @test @prettystring(1, @sharedMem(Float32, (2,3))) == "" #TODO: not yet supported for AMDGPU
# @test @prettystring(1, @pk_show()) == "CUDA.@cushow" #TODO: not yet supported for AMDGPU
# @test @prettystring(1, @pk_println()) == "CUDA.@rocprintln"
# @test @prettystring(1, @pk_println()) == "AMDGPU.@rocprintln"
elseif $package == $PKG_THREADS
@test @prettystring(1, @gridDim()) == "ParallelStencil.ParallelKernel.@gridDim_cpu"
@test @prettystring(1, @blockIdx()) == "ParallelStencil.ParallelKernel.@blockIdx_cpu"
Expand Down
[Remaining changed files were not loaded in this page capture — 8 of 15 changed files shown above.]

2 comments on commit 4a23081

@omlins
Copy link
Owner Author

@omlins omlins commented on 4a23081 Jul 22, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/88077

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v0.8.1 -m "<description of version>" 4a23081af0236ffc9cff41d45dea192029edc96a
git push origin v0.8.1

Please sign in to comment.