diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll
index f504f2caa8632..3e96dfe40f745 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll
@@ -158,3 +158,69 @@ entry:
   store <4 x bfloat> %val, ptr addrspace(1) %use
   ret void
 }
+
+; This is a special case that does not require aligned VGPRs. Make sure no copies are required for the unaligned ABI return value.
+; NOTE(review): the checks below still load into v[2:4] and v_mov the result into v1-v3; presumably baseline output to be updated - confirm.
+define { i32, <3 x i32> } @ds_read_b96_tr_b6_no_align2_requirement(ptr addrspace(3) %ptr) {
+; GFX950-SDAG-LABEL: ds_read_b96_tr_b6_no_align2_requirement:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    ds_read_b96_tr_b6 v[2:4], v0 offset:32
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, v2
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, v3
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, v4
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: ds_read_b96_tr_b6_no_align2_requirement:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    ds_read_b96_tr_b6 v[2:4], v0 offset:32
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v1, v2
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v2, v3
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v3, v4
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4 ; 4 x i64 = 32 bytes; appears as offset:32 in the checks
+  %val = call <3 x i32> @llvm.amdgcn.ds.read.tr6.b96.v3i32.p3(ptr addrspace(3) %gep)
+  %insert0 = insertvalue { i32, <3 x i32> } poison, i32 0, 0 ; leading i32 is returned in v0, so the vector lands in v1-v3 (see checks)
+  %insert1 = insertvalue { i32, <3 x i32> } %insert0, <3 x i32> %val, 1
+  ret { i32, <3 x i32> } %insert1
+}
+
+define void @ds_read_b96_tr_b6_no_align2_requirement_agpr(ptr addrspace(3) %ptr) {
+; GFX950-SDAG-LABEL: ds_read_b96_tr_b6_no_align2_requirement_agpr:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    ds_read_b96_tr_b6 v[0:2], v0 offset:32
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_accvgpr_write_b32 a1, v0
+; GFX950-SDAG-NEXT:    v_accvgpr_write_b32 a2, v1
+; GFX950-SDAG-NEXT:    v_accvgpr_write_b32 a3, v2
+; GFX950-SDAG-NEXT:    ;;#ASMSTART
+; GFX950-SDAG-NEXT:    ; use a1 a2 a3
+; GFX950-SDAG-NEXT:    ;;#ASMEND
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: ds_read_b96_tr_b6_no_align2_requirement_agpr:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    ds_read_b96_tr_b6 v[0:2], v0 offset:32
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_accvgpr_write_b32 a1, v0
+; GFX950-GISEL-NEXT:    v_accvgpr_write_b32 a2, v1
+; GFX950-GISEL-NEXT:    v_accvgpr_write_b32 a3, v2
+; GFX950-GISEL-NEXT:    ;;#ASMSTART
+; GFX950-GISEL-NEXT:    ; use a1 a2 a3
+; GFX950-GISEL-NEXT:    ;;#ASMEND
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4 ; 4 x i64 = 32 bytes; appears as offset:32 in the checks
+  %val = call <3 x i32> @llvm.amdgcn.ds.read.tr6.b96.v3i32.p3(ptr addrspace(3) %gep)
+  %val0 = extractelement <3 x i32> %val, i32 0
+  %val1 = extractelement <3 x i32> %val, i32 1
+  %val2 = extractelement <3 x i32> %val, i32 2
+  call void asm sideeffect "; use $0 $1 $2", "{a1},{a2},{a3}"(i32 %val0, i32 %val1, i32 %val2) ; constraints pin the three elements to a1-a3, a tuple starting at an odd register
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.tr.gfx1250.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.tr.gfx1250.w32.ll
index d91b03ca4461d..d9f2fc55709a6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.tr.gfx1250.w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.tr.gfx1250.w32.ll
@@ -320,3 +320,57 @@ entry:
   store <8 x bfloat> %val, ptr addrspace(1) %use
   ret void
 }
+
+; This is a special case that does not require aligned VGPRs. Make sure no copies are required for the unaligned ABI return value.
+; NOTE(review): the checks in the three functions below still load into v[2:4] and move the result into v1-v3 (presumably baseline output - confirm); %use is unused in all three.
+define { i32, <3 x i32> } @global_load_tr6_b96_vaddr_no_align2_requirement(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
+; GFX1250-LABEL: global_load_tr6_b96_vaddr_no_align2_requirement:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    global_load_tr6_b96 v[2:4], v[0:1], off offset:32
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, v2
+; GFX1250-NEXT:    v_dual_mov_b32 v2, v3 :: v_dual_mov_b32 v3, v4
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 ; 4 x i64 = 32 bytes; appears as offset:32 in the checks
+  %val = call <3 x i32> @llvm.amdgcn.global.load.tr6.b96.v3i32.p1(ptr addrspace(1) %gep)
+  %insert0 = insertvalue { i32, <3 x i32> } poison, i32 0, 0 ; leading i32 is returned in v0, so the vector lands in v1-v3 (see checks)
+  %insert1 = insertvalue { i32, <3 x i32> } %insert0, <3 x i32> %val, 1
+  ret { i32, <3 x i32> } %insert1
+}
+
+define { i32, <3 x i32> } @global_load_tr6_b96_saddr_no_align2_requirement(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use) {
+; GFX1250-LABEL: global_load_tr6_b96_saddr_no_align2_requirement:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    global_load_tr6_b96 v[2:4], v0, s[0:1] offset:32
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, v2
+; GFX1250-NEXT:    v_dual_mov_b32 v2, v3 :: v_dual_mov_b32 v3, v4
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 ; 4 x i64 = 32 bytes; appears as offset:32 in the checks
+  %val = call <3 x i32> @llvm.amdgcn.global.load.tr6.b96.v3i32.p1(ptr addrspace(1) %gep)
+  %insert0 = insertvalue { i32, <3 x i32> } poison, i32 0, 0 ; leading i32 is returned in v0, so the vector lands in v1-v3 (see checks)
+  %insert1 = insertvalue { i32, <3 x i32> } %insert0, <3 x i32> %val, 1
+  ret { i32, <3 x i32> } %insert1
+}
+
+define { i32, <3 x i32> } @ds_load_tr6_b96_no_align2_requirement(ptr addrspace(3) %addr, ptr addrspace(1) %use) {
+; GFX1250-LABEL: ds_load_tr6_b96_no_align2_requirement:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    ds_load_tr6_b96 v[2:4], v0 offset:32
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, v2
+; GFX1250-NEXT:    v_dual_mov_b32 v2, v3 :: v_dual_mov_b32 v3, v4
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %gep = getelementptr i64, ptr addrspace(3) %addr, i32 4 ; 4 x i64 = 32 bytes; appears as offset:32 in the checks
+  %val = call <3 x i32> @llvm.amdgcn.ds.load.tr6.b96.v3i32.p3(ptr addrspace(3) %gep)
+  %insert0 = insertvalue { i32, <3 x i32> } poison, i32 0, 0 ; leading i32 is returned in v0, so the vector lands in v1-v3 (see checks)
+  %insert1 = insertvalue { i32, <3 x i32> } %insert0, <3 x i32> %val, 1
+  ret { i32, <3 x i32> } %insert1
+}