From 0d2a9960585bf3d06dd028ee3fbfc311f52ed9a8 Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Fri, 14 Nov 2025 17:18:06 -0800 Subject: [PATCH] [AMDGPU] Add baseline test to show spilling of wmma scale. NFC This is to show the spilling of WMMA scale values, which are limited to the low 256 VGPRs. We have free registers; RA just allocates the low 256 first. --- .../AMDGPU/regalloc-spill-wmma-scale.ll | 131 ++++++++++++++++++ 1 file changed, 131 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/regalloc-spill-wmma-scale.ll diff --git a/llvm/test/CodeGen/AMDGPU/regalloc-spill-wmma-scale.ll b/llvm/test/CodeGen/AMDGPU/regalloc-spill-wmma-scale.ll new file mode 100644 index 0000000000000..1ac3da3b930f9 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/regalloc-spill-wmma-scale.ll @@ -0,0 +1,131 @@ +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck %s + +; FIXME: Scale operands of WMMA are limited to the low 256 VGPRs. +; Currently we spill them because all low VGPRs are occupied, even though our budget is higher. +; Make sure we do not spill scale operands because of the low 256 restriction. 
+; CHECK: ; ScratchSize: 12 +; CHECK: ; Occupancy: 1 +; Reproducer kernel: issues ten llvm.amdgcn.wmma.scale calls; %.extract2214 and %.extract1434 (each element 0 of an llvm.amdgcn.ds.load.tr8.b64 result) are the only non-constant i32 operands passed to them, presumably the scale values the FIXME refers to (TODO confirm). +define amdgpu_kernel void @spill_scale_test(float %arg, float %arg1, float %arg2, float %arg3, float %arg4, float %arg5, float %arg6, float %arg7, <16 x i32> %arg8, float %arg9, <16 x i32> %arg10, float %arg11, <16 x i8> %arg12) #0 { +bb: + %i = shufflevector <16 x i8> %arg12, <16 x i8> zeroinitializer, <64 x i32> + tail call void @llvm.amdgcn.global.load.async.to.lds.b32(ptr addrspace(1) null, ptr addrspace(3) null, i32 0, i32 0) + %i13 = bitcast <64 x i8> %i to <16 x i32> + tail call void @llvm.amdgcn.global.load.async.to.lds.b32(ptr addrspace(1) null, ptr addrspace(3) null, i32 0, i32 0) + %i14 = tail call <2 x i32> @llvm.amdgcn.ds.load.tr8.b64.v2i32(ptr addrspace(3) null) + %i15 = bitcast <2 x i32> %i14 to <8 x i8> + %i16 = tail call <2 x i32> @llvm.amdgcn.ds.load.tr8.b64.v2i32(ptr addrspace(3) null) + %i17 = shufflevector <8 x i8> %i15, <8 x i8> zeroinitializer, <64 x i32> + %i18 = shufflevector <64 x i8> zeroinitializer, <64 x i8> %i17, <64 x i32> + %i19 = insertelement <64 x i8> %i18, i8 0, i64 57 + %i20 = bitcast <64 x i8> %i19 to <16 x i32> + %.extract2214 = extractelement <2 x i32> %i16, i64 0 + %i21 = tail call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %i20, i32 0, <16 x i32> %i13, i16 0, <8 x float> zeroinitializer, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) + %i22 = extractelement <8 x float> %i21, i64 0 + %i23 = tail call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> zeroinitializer, i32 0, <16 x i32> zeroinitializer, i16 0, <8 x float> zeroinitializer, i32 0, i32 0, i32 %.extract2214, i32 0, i32 0, i32 0, i1 false, i1 false) + %i24 = extractelement <8 x float> %i23, i64 0 + %i25 = tail call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %arg8, i32 0, <16 x i32> zeroinitializer, i16 0, <8 x float> zeroinitializer, i32 0, i32 0, 
i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) + %i26 = extractelement <8 x float> %i25, i64 0 + %i27 = fmul float %i22, 0.000000e+00 + %i28 = fmul float %i24, 0.000000e+00 + %i29 = insertelement <2 x float> zeroinitializer, float %i26, i64 1 + %i30 = insertelement <2 x float> zeroinitializer, float %i28, i64 0 + %i31 = insertelement <2 x float> zeroinitializer, float %arg11, i64 0 + %i32 = fadd <2 x float> %i31, %i30 + %i33 = insertelement <2 x float> zeroinitializer, float %arg9, i64 0 + %i34 = fadd <2 x float> %i33, %i32 + %i35 = insertelement <2 x float> zeroinitializer, float %arg7, i64 0 + %i36 = fadd <2 x float> %i35, %i34 + %i37 = insertelement <2 x float> zeroinitializer, float %arg1, i64 0 + %i38 = fadd <2 x float> %i37, %i36 + %i39 = insertelement <2 x float> zeroinitializer, float %arg6, i64 0 + %i40 = fadd <2 x float> %i39, %i38 + %i41 = insertelement <2 x float> zeroinitializer, float %arg4, i64 0 + %i42 = fadd <2 x float> %i41, %i40 + %i43 = insertelement <2 x float> zeroinitializer, float %arg5, i64 0 + %i44 = fadd <2 x float> %i43, %i42 + %i45 = insertelement <2 x float> zeroinitializer, float %arg3, i64 0 + %i46 = fadd <2 x float> %i45, %i44 + %i47 = insertelement <2 x float> zeroinitializer, float %arg, i64 0 + %i48 = insertelement <2 x float> zeroinitializer, float %arg2, i64 0 + %i49 = fadd <2 x float> %i48, %i46 + %i50 = fadd <2 x float> %i29, %i49 + %i51 = fadd <2 x float> %i47, %i50 + %i52 = insertelement <8 x float> zeroinitializer, float %i27, i64 0 + %i53 = tail call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.fp8.f32(<8 x float> %i52, float 0.000000e+00) + %i54 = tail call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.fp8.f32(<8 x float> splat (float 0x7FF8000000000000), float 0.000000e+00) + %i55 = tail call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.fp8.f32(<8 x float> splat (float 1.000000e+00), float 0.000000e+00) + %.extract1415 = extractelement <2 x i32> %i53, i64 0 + %.extract1416 = extractelement <2 x i32> %i54, i64 0 + %.extract1424 = 
extractelement <2 x i32> %i55, i64 0 + %i56 = tail call i32 @llvm.amdgcn.ds.swizzle(i32 %.extract1416, i32 0) + %i57 = bitcast i32 %.extract1415 to <4 x i8> + %i58 = shufflevector <4 x i8> %i57, <4 x i8> zeroinitializer, <64 x i32> + %i59 = bitcast i32 %i56 to <4 x i8> + %i60 = bitcast i32 %.extract1424 to <4 x i8> + %i61 = shufflevector <4 x i8> %i60, <4 x i8> zeroinitializer, <64 x i32> + %i62 = tail call <2 x i32> @llvm.amdgcn.ds.load.tr8.b64.v2i32(ptr addrspace(3) null) + %i63 = bitcast <2 x i32> %i62 to <8 x i8> + %i64 = shufflevector <8 x i8> %i63, <8 x i8> zeroinitializer, <64 x i32> + %i65 = tail call <2 x i32> @llvm.amdgcn.ds.load.tr8.b64.v2i32(ptr addrspace(3) null) + %i66 = bitcast <2 x i32> %i65 to <8 x i8> + %i67 = shufflevector <8 x i8> %i66, <8 x i8> zeroinitializer, <64 x i32> + %i68 = tail call <2 x i32> @llvm.amdgcn.ds.load.tr8.b64.v2i32(ptr addrspace(3) null) + %i69 = bitcast <2 x i32> %i68 to <8 x i8> + %i70 = shufflevector <8 x i8> %i69, <8 x i8> zeroinitializer, <64 x i32> + %i71 = tail call <2 x i32> @llvm.amdgcn.ds.load.tr8.b64.v2i32(ptr addrspace(3) getelementptr (i8, ptr addrspace(3) null, i32 75232)) + %i72 = shufflevector <64 x i8> zeroinitializer, <64 x i8> %i58, <64 x i32> + %i73 = bitcast <64 x i8> %i72 to <16 x i32> + %i74 = shufflevector <4 x i8> %i59, <4 x i8> zeroinitializer, <64 x i32> + %i75 = shufflevector <64 x i8> %i74, <64 x i8> %i61, <64 x i32> + %i76 = bitcast <64 x i8> %i75 to <16 x i32> + %i77 = shufflevector <64 x i8> zeroinitializer, <64 x i8> %i64, <64 x i32> + %i78 = bitcast <64 x i8> %i77 to <16 x i32> + %i79 = bitcast <64 x i8> %i67 to <16 x i32> + %i80 = shufflevector <64 x i8> zeroinitializer, <64 x i8> %i70, <64 x i32> + %i81 = bitcast <64 x i8> %i80 to <16 x i32> + %.extract1434 = extractelement <2 x i32> %i71, i64 0 + %i82 = tail call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %i78, i32 0, <16 x i32> zeroinitializer, i16 0, <8 x float> zeroinitializer, i32 0, 
i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) + %i83 = tail call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %arg10, i32 0, <16 x i32> %i73, i16 0, <8 x float> zeroinitializer, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) + %i84 = tail call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %i79, i32 0, <16 x i32> %i73, i16 0, <8 x float> zeroinitializer, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2139062143, i1 false, i1 false) + %i85 = tail call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %i81, i32 0, <16 x i32> %arg8, i16 0, <8 x float> zeroinitializer, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2139062143, i1 false, i1 false) + %i86 = tail call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> splat (i32 16843009), i32 0, <16 x i32> %arg10, i16 0, <8 x float> zeroinitializer, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) + %i87 = tail call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> zeroinitializer, i32 0, <16 x i32> zeroinitializer, i16 0, <8 x float> zeroinitializer, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) + %i88 = tail call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> splat (i32 1), i32 0, <16 x i32> %i76, i16 0, <8 x float> zeroinitializer, i32 0, i32 0, i32 %.extract1434, i32 0, i32 0, i32 0, i1 false, i1 false) + %i89 = fdiv <8 x float> %i82, zeroinitializer + %i90 = fcmp uno <8 x float> %i89, zeroinitializer + %i91 = select <8 x i1> %i90, <8 x bfloat> splat (bfloat 0xR3F80), <8 x bfloat> zeroinitializer + %i92 = bitcast <8 x bfloat> %i91 to <4 x i32> + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> %i92, ptr addrspace(8) null, i32 0, i32 0, i32 0) + %i93 = fdiv <8 x float> %i83, zeroinitializer 
+ %i94 = fcmp uno <8 x float> %i93, zeroinitializer + %i95 = select <8 x i1> %i94, <8 x bfloat> splat (bfloat 0xR3F80), <8 x bfloat> zeroinitializer + %i96 = bitcast <8 x bfloat> %i95 to <4 x i32> + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> %i96, ptr addrspace(8) null, i32 0, i32 0, i32 0) + %i97 = fcmp uno <8 x float> %i84, zeroinitializer + %i98 = select <8 x i1> %i97, <8 x bfloat> splat (bfloat 0xR3F80), <8 x bfloat> zeroinitializer + %i99 = bitcast <8 x bfloat> %i98 to <4 x i32> + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> %i99, ptr addrspace(8) null, i32 0, i32 0, i32 0) + %i100 = fcmp uno <8 x float> %i85, zeroinitializer + %i101 = select <8 x i1> %i100, <8 x bfloat> splat (bfloat 0xR3F80), <8 x bfloat> zeroinitializer + %i102 = bitcast <8 x bfloat> %i101 to <4 x i32> + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> %i102, ptr addrspace(8) null, i32 0, i32 0, i32 0) + %i103 = fcmp uno <8 x float> %i86, zeroinitializer + %i104 = select <8 x i1> %i103, <8 x bfloat> splat (bfloat 0xR3F80), <8 x bfloat> zeroinitializer + %i105 = bitcast <8 x bfloat> %i104 to <4 x i32> + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> %i105, ptr addrspace(8) null, i32 0, i32 0, i32 0) + %i106 = fcmp uno <8 x float> %i87, zeroinitializer + %i107 = select <8 x i1> %i106, <8 x bfloat> splat (bfloat 0xR3F80), <8 x bfloat> zeroinitializer + %i108 = bitcast <8 x bfloat> %i107 to <4 x i32> + tail call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> %i108, ptr addrspace(8) null, i32 0, i32 0, i32 0) + %i109 = shufflevector <2 x float> %i51, <2 x float> zeroinitializer, <4 x i32> + %i110 = shufflevector <4 x float> %i109, <4 x float> zeroinitializer, <8 x i32> + %i111 = fmul <8 x float> %i88, %i110 + %i112 = fcmp uno <8 x float> %i111, zeroinitializer + %i113 = select <8 x i1> %i112, <8 x bfloat> splat (bfloat 0xR3F80), <8 x bfloat> zeroinitializer + %i114 = bitcast <8 x bfloat> %i113 to <4 x i32> + tail 
call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> %i114, ptr addrspace(8) null, i32 0, i32 0, i32 0) + ret void +} + +attributes #0 = { "amdgpu-flat-work-group-size"="1,128" "amdgpu-waves-per-eu"="1,1" }