From 8eeab5fa5454e1ab052e6fb2bb0e57094b366e00 Mon Sep 17 00:00:00 2001 From: Finn Plummer Date: Wed, 17 Sep 2025 09:37:29 -0700 Subject: [PATCH 01/14] Add `GroupMemoryBarrierWithGroupSync` tests --- test/WaveOps/GroupMemoryBarrierWithSync.test | 95 ++++++++++++++++++++ 1 file changed, 95 insertions(+) create mode 100644 test/WaveOps/GroupMemoryBarrierWithSync.test diff --git a/test/WaveOps/GroupMemoryBarrierWithSync.test b/test/WaveOps/GroupMemoryBarrierWithSync.test new file mode 100644 index 000000000..485047139 --- /dev/null +++ b/test/WaveOps/GroupMemoryBarrierWithSync.test @@ -0,0 +1,95 @@ +#--- source.hlsl +StructuredBuffer In : register(t0); +RWStructuredBuffer Out : register(u1); + +groupshared uint4 SharedData; + +[numthreads(4,1,1)] +void main(uint3 tid : SV_GroupThreadID) { + + // Basic broadcast + if (tid.x == 0) { + for (uint i = 0; i < 10; i++) { + SharedData = i * In[0]; + } + } + GroupMemoryBarrierWithGroupSync(); + Out[0][tid.x] = SharedData[tid.x]; + + // Divergent blocking + int offset = tid.x < 2 ? 
0 : 2; + switch (offset) { + case 0: + Out[1][tid.x] = SharedData[tid.x]; + GroupMemoryBarrierWithGroupSync(); + break; + case 2: + Out[1][tid.x] = 2 * SharedData[tid.x]; + GroupMemoryBarrierWithGroupSync(); + break; + } + + // Interlocked accumulation within for loop + for (uint i = 0; i < 4; i++) { + if (tid.x == i) { + SharedData[0] += In[0][tid.x]; + Out[2][tid.x] = SharedData[0]; + } + GroupMemoryBarrierWithGroupSync(); + } + + // Strided writes + uint index = (tid.x * 3) % 4; + SharedData[tid.x] = In[0][index]; + GroupMemoryBarrierWithGroupSync(); + + Out[3][tid.x] = SharedData[tid.x]; +} + +//--- pipeline.yaml + +--- +Shaders: + - Stage: Compute + Entry: main + DispatchSize: [1, 1, 1] +Buffers: + - Name: In + Format: Int32 + Stride: 4 + Data: [ 1, 10, 100, 1000] + - Name: Out + Format: Int32 + Stride: 16 + ZeroInitSize: 64 + - Name: ExpectedOut + Format: Int32 + Stride: 16 + Data: [ 9, 90, 900, 9000, 9, 90, 1800, 18000, 10, 20, 120, 1120, 1, 1000, 100, 10 ] +Results: + - Result: ExpectedOut + Rule: BufferExact + Actual: Out + Expected: ExpectedOut +DescriptorSets: + - Resources: + - Name: In + Kind: StructuredBuffer + DirectXBinding: + Register: 0 + Space: 0 + VulkanBinding: + Binding: 0 + - Name: Out + Kind: RWStructuredBuffer + DirectXBinding: + Register: 1 + Space: 0 + VulkanBinding: + Binding: 1 +... 
+#--- end + +# RUN: split-file %s %t +# RUN: %dxc_target -T cs_6_5 -Fo %t.o %t/source.hlsl +# RUN: %offloader %t/pipeline.yaml %t.o From 19d4d42f6db5748b55609964e56c34af5c121ed3 Mon Sep 17 00:00:00 2001 From: Finn Plummer Date: Wed, 17 Sep 2025 10:52:10 -0700 Subject: [PATCH 02/14] self-review: mark as unsupported for clang --- test/WaveOps/GroupMemoryBarrierWithSync.test | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/test/WaveOps/GroupMemoryBarrierWithSync.test b/test/WaveOps/GroupMemoryBarrierWithSync.test index 485047139..b7553965f 100644 --- a/test/WaveOps/GroupMemoryBarrierWithSync.test +++ b/test/WaveOps/GroupMemoryBarrierWithSync.test @@ -42,7 +42,6 @@ void main(uint3 tid : SV_GroupThreadID) { uint index = (tid.x * 3) % 4; SharedData[tid.x] = In[0][index]; GroupMemoryBarrierWithGroupSync(); - Out[3][tid.x] = SharedData[tid.x]; } @@ -90,6 +89,10 @@ DescriptorSets: ... #--- end +# No lowering to DXIL Barrier/SPIRV OpMemoryBarrier +# Unsupported https://github.com/llvm/llvm-project/issues/99121 +# UNSUPPORTED: Clang + # RUN: split-file %s %t # RUN: %dxc_target -T cs_6_5 -Fo %t.o %t/source.hlsl # RUN: %offloader %t/pipeline.yaml %t.o From d84771e723d1e404d0f758bb1dadec27437d5f18 Mon Sep 17 00:00:00 2001 From: Finn Plummer Date: Thu, 18 Sep 2025 12:01:38 -0700 Subject: [PATCH 03/14] review: correct format/channels and readability --- test/WaveOps/GroupMemoryBarrierWithSync.test | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/test/WaveOps/GroupMemoryBarrierWithSync.test b/test/WaveOps/GroupMemoryBarrierWithSync.test index b7553965f..26a9e2bc5 100644 --- a/test/WaveOps/GroupMemoryBarrierWithSync.test +++ b/test/WaveOps/GroupMemoryBarrierWithSync.test @@ -54,17 +54,22 @@ Shaders: DispatchSize: [1, 1, 1] Buffers: - Name: In - Format: Int32 - Stride: 4 + Format: UInt32 + Channels: 4 Data: [ 1, 10, 100, 1000] - Name: Out - Format: Int32 - Stride: 16 + Format: UInt32 + Channels: 4 ZeroInitSize: 64 - Name: 
ExpectedOut - Format: Int32 - Stride: 16 - Data: [ 9, 90, 900, 9000, 9, 90, 1800, 18000, 10, 20, 120, 1120, 1, 1000, 100, 10 ] + Format: UInt32 + Channels: 4 + Data: [ + 9, 90, 900, 9000, # Broadcast + 9, 90, 1800, 18000, # Divergent + 10, 20, 120, 1120, # Accumulation + 1, 1000, 100, 10 # Strided Reorder + ] Results: - Result: ExpectedOut Rule: BufferExact From 99d6735bca91a83e8ce93da29c2fc03dc0dbc91e Mon Sep 17 00:00:00 2001 From: Finn Plummer Date: Thu, 18 Sep 2025 14:40:13 -0700 Subject: [PATCH 04/14] review: update examples for larger group size It was noted that if the group size is not larger than the wave size, the barrier operation is redundant. The HLSL wave size can be at most 128. Hence, we initialize a group of 512 threads so that it is forced to evaluate over multiple waves. We also remove the divergent test case, as it is not applicable due to it being undefined behaviour. --- test/WaveOps/GroupMemoryBarrierWithSync.test | 99 ++++++++++---------- 1 file changed, 50 insertions(+), 49 deletions(-) diff --git a/test/WaveOps/GroupMemoryBarrierWithSync.test b/test/WaveOps/GroupMemoryBarrierWithSync.test index 26a9e2bc5..88478fc5d 100644 --- a/test/WaveOps/GroupMemoryBarrierWithSync.test +++ b/test/WaveOps/GroupMemoryBarrierWithSync.test @@ -1,48 +1,61 @@ #--- source.hlsl -StructuredBuffer In : register(t0); -RWStructuredBuffer Out : register(u1); +RWStructuredBuffer Out : register(u0); groupshared uint4 SharedData; +groupshared uint4 Indices[128]; -[numthreads(4,1,1)] -void main(uint3 tid : SV_GroupThreadID) { +// Note: Placing GroupMemoryBarrierWithGroupSync in divergent control branches +// is undefined, and hence, untested - // Basic broadcast - if (tid.x == 0) { - for (uint i = 0; i < 10; i++) { - SharedData = i * In[0]; - } +[numthreads(128,4,1)] +void main(uint3 ThreadID : SV_GroupThreadID) { + + // Basic Broadcast + if (ThreadID.x == 127 && ThreadID.y == 3) { + SharedData = 1; } + + // Prevents SharedData being read below before being initialized 
GroupMemoryBarrierWithGroupSync(); - Out[0][tid.x] = SharedData[tid.x]; - - // Divergent blocking - int offset = tid.x < 2 ? 0 : 2; - switch (offset) { - case 0: - Out[1][tid.x] = SharedData[tid.x]; - GroupMemoryBarrierWithGroupSync(); - break; - case 2: - Out[1][tid.x] = 2 * SharedData[tid.x]; - GroupMemoryBarrierWithGroupSync(); - break; + + if (ThreadID.x == 0) { + Out[0][ThreadID.y] = SharedData[ThreadID.y]; } - // Interlocked accumulation within for loop - for (uint i = 0; i < 4; i++) { - if (tid.x == i) { - SharedData[0] += In[0][tid.x]; - Out[2][tid.x] = SharedData[0]; + // Prevents SharedData being updated below before written to Out[0] + GroupMemoryBarrierWithGroupSync(); + + // Interlocked Accumulation + for (uint I = 0; I < 128; I++) { + if (ThreadID.x == I) { + SharedData[ThreadID.y] = SharedData[ThreadID.y] + 1; } + + // Prevents SharedData datarace across ThreadID.x, and, + // SharedData being written before fully accumulated GroupMemoryBarrierWithGroupSync(); } - // Strided writes - uint index = (tid.x * 3) % 4; - SharedData[tid.x] = In[0][index]; + if (ThreadID.x == 127) { + Out[1][ThreadID.y] = SharedData[ThreadID.y]; + } + + // Strided Read/Write: + Indices[ThreadID.x][ThreadID.y] = ThreadID.x + ThreadID.y * 128; + + // Prevents Indices from being read before initialized GroupMemoryBarrierWithGroupSync(); - Out[3][tid.x] = SharedData[tid.x]; + + uint2 MappedIdx = {(ThreadID.x * 37) % 128, (ThreadID.y * 3 % 4)}; + uint ToIdx = Indices[MappedIdx.x][MappedIdx.y]; + + if (ThreadID.x == 23) { // 23 is picked arbitrarily + // Expected mapping: + // x: 23 -> 83, y: 0 -> 0, 1 -> 3, 2 -> 2 + // + // Out[2] = [83, 83 + 3 * 128, 83 + 2 * 128, 83 + 128] + Out[2][ThreadID.y] = ToIdx; + } } //--- pipeline.yaml @@ -53,22 +66,17 @@ Shaders: Entry: main DispatchSize: [1, 1, 1] Buffers: - - Name: In - Format: UInt32 - Channels: 4 - Data: [ 1, 10, 100, 1000] - Name: Out Format: UInt32 Channels: 4 - ZeroInitSize: 64 + ZeroInitSize: 48 - Name: ExpectedOut Format: 
UInt32 Channels: 4 Data: [ - 9, 90, 900, 9000, # Broadcast - 9, 90, 1800, 18000, # Divergent - 10, 20, 120, 1120, # Accumulation - 1, 1000, 100, 10 # Strided Reorder + 1, 1, 1, 1, # Broadcast + 129, 129, 129, 129, # Accumulation + 83, 467, 339, 211, # Strided Read/Write ] Results: - Result: ExpectedOut @@ -77,20 +85,13 @@ Results: Expected: ExpectedOut DescriptorSets: - Resources: - - Name: In - Kind: StructuredBuffer - DirectXBinding: - Register: 0 - Space: 0 - VulkanBinding: - Binding: 0 - Name: Out Kind: RWStructuredBuffer DirectXBinding: - Register: 1 + Register: 0 Space: 0 VulkanBinding: - Binding: 1 + Binding: 0 ... #--- end From f8e9588b519764ea6d52bb6b2197a2285d0f4ba0 Mon Sep 17 00:00:00 2001 From: Finn Plummer Date: Thu, 18 Sep 2025 14:47:05 -0700 Subject: [PATCH 05/14] review: update to xfail format Co-authored-by: Justin Bogner --- test/WaveOps/GroupMemoryBarrierWithSync.test | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/WaveOps/GroupMemoryBarrierWithSync.test b/test/WaveOps/GroupMemoryBarrierWithSync.test index 88478fc5d..f5072091c 100644 --- a/test/WaveOps/GroupMemoryBarrierWithSync.test +++ b/test/WaveOps/GroupMemoryBarrierWithSync.test @@ -96,8 +96,8 @@ DescriptorSets: #--- end # No lowering to DXIL Barrier/SPIRV OpMemoryBarrier -# Unsupported https://github.com/llvm/llvm-project/issues/99121 -# UNSUPPORTED: Clang +# Unimplemented https://github.com/llvm/llvm-project/issues/99121 +# XFAIL: Clang # RUN: split-file %s %t # RUN: %dxc_target -T cs_6_5 -Fo %t.o %t/source.hlsl From 1ac3cf36ca93ae3bdba383af06f0267e7e0bcdc4 Mon Sep 17 00:00:00 2001 From: Finn Plummer Date: Thu, 18 Sep 2025 15:30:56 -0700 Subject: [PATCH 06/14] fix name --- ...yBarrierWithSync.test => GroupMemoryBarrierWithGroupSync.test} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename test/WaveOps/{GroupMemoryBarrierWithSync.test => GroupMemoryBarrierWithGroupSync.test} (100%) diff --git a/test/WaveOps/GroupMemoryBarrierWithSync.test 
b/test/WaveOps/GroupMemoryBarrierWithGroupSync.test similarity index 100% rename from test/WaveOps/GroupMemoryBarrierWithSync.test rename to test/WaveOps/GroupMemoryBarrierWithGroupSync.test From 9d0429244aadaccdfede6b71e4afd20265aa7a1b Mon Sep 17 00:00:00 2001 From: Finn Plummer Date: Thu, 18 Sep 2025 15:34:01 -0700 Subject: [PATCH 07/14] remove xfail: had the incorrect barrier intrinsic --- test/WaveOps/GroupMemoryBarrierWithGroupSync.test | 4 ---- 1 file changed, 4 deletions(-) diff --git a/test/WaveOps/GroupMemoryBarrierWithGroupSync.test b/test/WaveOps/GroupMemoryBarrierWithGroupSync.test index f5072091c..21f56112c 100644 --- a/test/WaveOps/GroupMemoryBarrierWithGroupSync.test +++ b/test/WaveOps/GroupMemoryBarrierWithGroupSync.test @@ -95,10 +95,6 @@ DescriptorSets: ... #--- end -# No lowering to DXIL Barrier/SPIRV OpMemoryBarrier -# Unimplemented https://github.com/llvm/llvm-project/issues/99121 -# XFAIL: Clang - # RUN: split-file %s %t # RUN: %dxc_target -T cs_6_5 -Fo %t.o %t/source.hlsl # RUN: %offloader %t/pipeline.yaml %t.o From 7e225868a25f6a496f941509d75f1342dd654a39 Mon Sep 17 00:00:00 2001 From: Finn Plummer Date: Fri, 19 Sep 2025 09:34:06 -0700 Subject: [PATCH 08/14] review: simplify test to remove confusing example --- .../GroupMemoryBarrierWithGroupSync.test | 21 ++----------------- 1 file changed, 2 insertions(+), 19 deletions(-) diff --git a/test/WaveOps/GroupMemoryBarrierWithGroupSync.test b/test/WaveOps/GroupMemoryBarrierWithGroupSync.test index 21f56112c..6e6736ae1 100644 --- a/test/WaveOps/GroupMemoryBarrierWithGroupSync.test +++ b/test/WaveOps/GroupMemoryBarrierWithGroupSync.test @@ -1,4 +1,5 @@ #--- source.hlsl + RWStructuredBuffer Out : register(u0); groupshared uint4 SharedData; @@ -39,23 +40,6 @@ void main(uint3 ThreadID : SV_GroupThreadID) { if (ThreadID.x == 127) { Out[1][ThreadID.y] = SharedData[ThreadID.y]; } - - // Strided Read/Write: - Indices[ThreadID.x][ThreadID.y] = ThreadID.x + ThreadID.y * 128; - - // Prevents Indices from 
being read before initialized - GroupMemoryBarrierWithGroupSync(); - - uint2 MappedIdx = {(ThreadID.x * 37) % 128, (ThreadID.y * 3 % 4)}; - uint ToIdx = Indices[MappedIdx.x][MappedIdx.y]; - - if (ThreadID.x == 23) { // 23 is picked arbitrarily - // Expected mapping: - // x: 23 -> 83, y: 0 -> 0, 1 -> 3, 2 -> 2 - // - // Out[2] = [83, 83 + 3 * 128, 83 + 2 * 128, 83 + 128] - Out[2][ThreadID.y] = ToIdx; - } } //--- pipeline.yaml @@ -69,14 +53,13 @@ Buffers: - Name: Out Format: UInt32 Channels: 4 - ZeroInitSize: 48 + ZeroInitSize: 32 - Name: ExpectedOut Format: UInt32 Channels: 4 Data: [ 1, 1, 1, 1, # Broadcast 129, 129, 129, 129, # Accumulation - 83, 467, 339, 211, # Strided Read/Write ] Results: - Result: ExpectedOut From 8820f927603567d431e9f1008f48b5ee15112aee Mon Sep 17 00:00:00 2001 From: Finn Plummer Date: Fri, 19 Sep 2025 10:14:31 -0700 Subject: [PATCH 09/14] add metal failure --- test/WaveOps/GroupMemoryBarrierWithGroupSync.test | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/WaveOps/GroupMemoryBarrierWithGroupSync.test b/test/WaveOps/GroupMemoryBarrierWithGroupSync.test index 6e6736ae1..89cce393c 100644 --- a/test/WaveOps/GroupMemoryBarrierWithGroupSync.test +++ b/test/WaveOps/GroupMemoryBarrierWithGroupSync.test @@ -78,6 +78,9 @@ DescriptorSets: ... 
#--- end +# Bug: https://github.com/llvm/offload-test-suite/issues/444 +# XFAIL: Metal + # RUN: split-file %s %t # RUN: %dxc_target -T cs_6_5 -Fo %t.o %t/source.hlsl # RUN: %offloader %t/pipeline.yaml %t.o From f4bef335fbe9eb145361725894a117aecf34a77a Mon Sep 17 00:00:00 2001 From: Finn Plummer Date: Fri, 19 Sep 2025 13:25:08 -0700 Subject: [PATCH 10/14] add intel failures --- test/WaveOps/GroupMemoryBarrierWithGroupSync.test | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/WaveOps/GroupMemoryBarrierWithGroupSync.test b/test/WaveOps/GroupMemoryBarrierWithGroupSync.test index 89cce393c..9f16a9923 100644 --- a/test/WaveOps/GroupMemoryBarrierWithGroupSync.test +++ b/test/WaveOps/GroupMemoryBarrierWithGroupSync.test @@ -81,6 +81,10 @@ DescriptorSets: # Bug: https://github.com/llvm/offload-test-suite/issues/444 # XFAIL: Metal +# Bug https://github.com/llvm/offload-test-suite/issues/445 +# XFAIL: DirectX-Intel +# XFAIL: Vulkan-Intel + # RUN: split-file %s %t # RUN: %dxc_target -T cs_6_5 -Fo %t.o %t/source.hlsl # RUN: %offloader %t/pipeline.yaml %t.o From 72f6663ef416ab656d360be2f4428c7eb8bf1f45 Mon Sep 17 00:00:00 2001 From: Finn Plummer Date: Mon, 22 Sep 2025 12:35:46 -0700 Subject: [PATCH 11/14] bump run onto clang fix --- test/WaveOps/GroupMemoryBarrierWithGroupSync.test | 1 - 1 file changed, 1 deletion(-) diff --git a/test/WaveOps/GroupMemoryBarrierWithGroupSync.test b/test/WaveOps/GroupMemoryBarrierWithGroupSync.test index 9f16a9923..aecf6d9c1 100644 --- a/test/WaveOps/GroupMemoryBarrierWithGroupSync.test +++ b/test/WaveOps/GroupMemoryBarrierWithGroupSync.test @@ -43,7 +43,6 @@ void main(uint3 ThreadID : SV_GroupThreadID) { } //--- pipeline.yaml - --- Shaders: - Stage: Compute From 05d9c7b7d10736c0682b093e7c07924c0aeeab5d Mon Sep 17 00:00:00 2001 From: Finn Plummer Date: Mon, 22 Sep 2025 13:58:11 -0700 Subject: [PATCH 12/14] remove XFAIL for vulkan. 
is specific to d3d12 --- test/WaveOps/GroupMemoryBarrierWithGroupSync.test | 1 - 1 file changed, 1 deletion(-) diff --git a/test/WaveOps/GroupMemoryBarrierWithGroupSync.test b/test/WaveOps/GroupMemoryBarrierWithGroupSync.test index aecf6d9c1..f54793e41 100644 --- a/test/WaveOps/GroupMemoryBarrierWithGroupSync.test +++ b/test/WaveOps/GroupMemoryBarrierWithGroupSync.test @@ -82,7 +82,6 @@ DescriptorSets: # Bug https://github.com/llvm/offload-test-suite/issues/445 # XFAIL: DirectX-Intel -# XFAIL: Vulkan-Intel # RUN: split-file %s %t # RUN: %dxc_target -T cs_6_5 -Fo %t.o %t/source.hlsl From efab0f44846079a9a648c3f4309404c8197fcd31 Mon Sep 17 00:00:00 2001 From: Finn Plummer Date: Mon, 22 Sep 2025 15:12:33 -0700 Subject: [PATCH 13/14] add clang xfail --- test/WaveOps/GroupMemoryBarrierWithGroupSync.test | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/WaveOps/GroupMemoryBarrierWithGroupSync.test b/test/WaveOps/GroupMemoryBarrierWithGroupSync.test index f54793e41..8679cae95 100644 --- a/test/WaveOps/GroupMemoryBarrierWithGroupSync.test +++ b/test/WaveOps/GroupMemoryBarrierWithGroupSync.test @@ -83,6 +83,9 @@ DescriptorSets: # Bug https://github.com/llvm/offload-test-suite/issues/445 # XFAIL: DirectX-Intel +# Bug https://github.com/llvm/llvm-project/issues/160208 +# XFAIL: Clang + # RUN: split-file %s %t # RUN: %dxc_target -T cs_6_5 -Fo %t.o %t/source.hlsl # RUN: %offloader %t/pipeline.yaml %t.o From a08c53e757d072e7f61209b0a46c703a57f902a3 Mon Sep 17 00:00:00 2001 From: Finn Plummer Date: Tue, 23 Sep 2025 10:41:33 -0700 Subject: [PATCH 14/14] unmark warp for datarace bug --- test/WaveOps/GroupMemoryBarrierWithGroupSync.test | 3 ++- test/lit.cfg.py | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/test/WaveOps/GroupMemoryBarrierWithGroupSync.test b/test/WaveOps/GroupMemoryBarrierWithGroupSync.test index 8679cae95..fb0422f34 100644 --- a/test/WaveOps/GroupMemoryBarrierWithGroupSync.test +++ b/test/WaveOps/GroupMemoryBarrierWithGroupSync.test @@ 
-83,8 +83,9 @@ DescriptorSets: # Bug https://github.com/llvm/offload-test-suite/issues/445 # XFAIL: DirectX-Intel +# The data-race is not observed on WARP # Bug https://github.com/llvm/llvm-project/issues/160208 -# XFAIL: Clang +# XFAIL: Clang && !WARP # RUN: split-file %s %t # RUN: %dxc_target -T cs_6_5 -Fo %t.o %t/source.hlsl diff --git a/test/lit.cfg.py b/test/lit.cfg.py index 6c0fb7dcb..54f901857 100644 --- a/test/lit.cfg.py +++ b/test/lit.cfg.py @@ -63,6 +63,7 @@ def setDeviceFeatures(config, device, compiler): if "Microsoft Basic Render Driver" in device["Description"]: config.available_features.add("%s-WARP" % API) config.available_features.add("WARP-%s" % config.warp_arch) + config.available_features.add("WARP") if "Intel" in device["Description"]: config.available_features.add("%s-Intel" % API) if "UHD Graphics" in device["Description"] and API == "DirectX":