Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[AMDGPU] Precommit test for D159533 #66965

Merged
merged 1 commit into from
Sep 21, 2023
Merged

Conversation

jrbyrnes
Copy link
Contributor

Precommit test ahead of https://reviews.llvm.org/D159533 for ISD::FSHR / AMDGPUISD::PERM combine

Change-Id: Ic9f7c1f2e221b8715f4f25d5b64cb73c15eca3a5
@llvmbot
Copy link
Collaborator

llvmbot commented Sep 21, 2023

@llvm/pr-subscribers-backend-amdgpu

Changes

Precommit test ahead of https://reviews.llvm.org/D159533 for ISD::FSHR / AMDGPUISD::PERM combine


Full diff: https://github.com/llvm/llvm-project/pull/66965.diff

1 Files Affected:

  • (modified) llvm/test/CodeGen/AMDGPU/permute_i8.ll (+405)
diff --git a/llvm/test/CodeGen/AMDGPU/permute_i8.ll b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
index c71f69edc76fa6e..1d139f1fb40c281 100644
--- a/llvm/test/CodeGen/AMDGPU/permute_i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
@@ -2883,6 +2883,411 @@ define hidden void @extract1347_v2i16(ptr addrspace(1) %in0, ptr addrspace(1) %i
   ret void
 }
 
+
+declare i16 @llvm.fshr.i16(i16, i16, i16)
+
+define hidden void @fshri16_8(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
+; GFX10-LABEL: fshri16_8:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v6, v[0:1], off
+; GFX10-NEXT:    global_load_dword v7, v[2:3], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v7, v6, 0x30407
+; GFX10-NEXT:    global_store_dword v[4:5], v0, off
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fshri16_8:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v6, v[0:1], off
+; GFX9-NEXT:    global_load_dword v7, v[2:3], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x30407
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v7, v6, s4
+; GFX9-NEXT:    global_store_dword v[4:5], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4
+  %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4
+  %v1e0 = extractelement <2 x i16> %vec1, i64 0
+  %v1e1 = extractelement <2 x i16> %vec1, i64 1
+  %v2e0 = extractelement <2 x i16> %vec2, i64 0
+  %v2e1 = extractelement <2 x i16> %vec2, i64 1
+
+  %tmp01.0 = call i16 @llvm.fshr.i16(i16 %v2e0, i16 %v2e1, i16 8)
+  %byte01 = zext i16 %tmp01.0 to i32
+
+  %tmp23.0 = call i16 @llvm.fshr.i16(i16 %v1e0, i16 %v1e1, i16 8)
+  %tmp23.1 = zext i16 %tmp23.0 to i32
+  %byte23 = shl i32 %tmp23.1, 16
+  %res = or i32 %byte01, %byte23
+  store i32 %res, ptr addrspace(1) %out0, align 4
+  ret void
+}
+
+define hidden void @fshri16_16(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
+; GFX10-LABEL: fshri16_16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v6, v[0:1], off
+; GFX10-NEXT:    global_load_dword v7, v[2:3], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v7, v6, 0x3020706
+; GFX10-NEXT:    global_store_dword v[4:5], v0, off
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fshri16_16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v6, v[0:1], off
+; GFX9-NEXT:    global_load_dword v7, v[2:3], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v7, v6, s4
+; GFX9-NEXT:    global_store_dword v[4:5], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4
+  %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4
+  %v1e0 = extractelement <2 x i16> %vec1, i64 0
+  %v1e1 = extractelement <2 x i16> %vec1, i64 1
+  %v2e0 = extractelement <2 x i16> %vec2, i64 0
+  %v2e1 = extractelement <2 x i16> %vec2, i64 1
+
+  %tmp01.0 = call i16 @llvm.fshr.i16(i16 %v2e0, i16 %v2e1, i16 16)
+  %byte01 = zext i16 %tmp01.0 to i32
+
+  %tmp23.0 = call i16 @llvm.fshr.i16(i16 %v1e0, i16 %v1e1, i16 16)
+  %tmp23.1 = zext i16 %tmp23.0 to i32
+  %byte23 = shl i32 %tmp23.1, 16
+  %res = or i32 %byte01, %byte23
+  store i32 %res, ptr addrspace(1) %out0, align 4
+  ret void
+}
+
+define hidden void @fshri16_24(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
+; GFX10-LABEL: fshri16_24:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v6, v[0:1], off
+; GFX10-NEXT:    global_load_dword v7, v[2:3], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v7, v6, 0x30407
+; GFX10-NEXT:    global_store_dword v[4:5], v0, off
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fshri16_24:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v6, v[0:1], off
+; GFX9-NEXT:    global_load_dword v7, v[2:3], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x30407
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v7, v6, s4
+; GFX9-NEXT:    global_store_dword v[4:5], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4
+  %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4
+  %v1e0 = extractelement <2 x i16> %vec1, i64 0
+  %v1e1 = extractelement <2 x i16> %vec1, i64 1
+  %v2e0 = extractelement <2 x i16> %vec2, i64 0
+  %v2e1 = extractelement <2 x i16> %vec2, i64 1
+
+  %tmp01.0 = call i16 @llvm.fshr.i16(i16 %v2e0, i16 %v2e1, i16 24)
+  %byte01 = zext i16 %tmp01.0 to i32
+
+  %tmp23.0 = call i16 @llvm.fshr.i16(i16 %v1e0, i16 %v1e1, i16 24)
+  %tmp23.1 = zext i16 %tmp23.0 to i32
+  %byte23 = shl i32 %tmp23.1, 16
+  %res = or i32 %byte01, %byte23
+  store i32 %res, ptr addrspace(1) %out0, align 4
+  ret void
+}
+
+define hidden void @fshri16_32(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
+; GFX10-LABEL: fshri16_32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v6, v[0:1], off
+; GFX10-NEXT:    global_load_dword v7, v[2:3], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v7, v6, 0x3020706
+; GFX10-NEXT:    global_store_dword v[4:5], v0, off
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fshri16_32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v6, v[0:1], off
+; GFX9-NEXT:    global_load_dword v7, v[2:3], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v7, v6, s4
+; GFX9-NEXT:    global_store_dword v[4:5], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4
+  %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4
+  %v1e0 = extractelement <2 x i16> %vec1, i64 0
+  %v1e1 = extractelement <2 x i16> %vec1, i64 1
+  %v2e0 = extractelement <2 x i16> %vec2, i64 0
+  %v2e1 = extractelement <2 x i16> %vec2, i64 1
+
+  %tmp01.0 = call i16 @llvm.fshr.i16(i16 %v2e0, i16 %v2e1, i16 32)
+  %byte01 = zext i16 %tmp01.0 to i32
+
+  %tmp23.0 = call i16 @llvm.fshr.i16(i16 %v1e0, i16 %v1e1, i16 32)
+  %tmp23.1 = zext i16 %tmp23.0 to i32
+  %byte23 = shl i32 %tmp23.1, 16
+  %res = or i32 %byte01, %byte23
+  store i32 %res, ptr addrspace(1) %out0, align 4
+  ret void
+}
+
+define hidden void @fshri16_88(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
+; GFX10-LABEL: fshri16_88:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v6, v[0:1], off
+; GFX10-NEXT:    global_load_dword v7, v[2:3], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v7, v6, 0x30407
+; GFX10-NEXT:    global_store_dword v[4:5], v0, off
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fshri16_88:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v6, v[0:1], off
+; GFX9-NEXT:    global_load_dword v7, v[2:3], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x30407
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v7, v6, s4
+; GFX9-NEXT:    global_store_dword v[4:5], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4
+  %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4
+  %v1e0 = extractelement <2 x i16> %vec1, i64 0
+  %v1e1 = extractelement <2 x i16> %vec1, i64 1
+  %v2e0 = extractelement <2 x i16> %vec2, i64 0
+  %v2e1 = extractelement <2 x i16> %vec2, i64 1
+
+  %tmp01.0 = call i16 @llvm.fshr.i16(i16 %v2e0, i16 %v2e1, i16 88)
+  %byte01 = zext i16 %tmp01.0 to i32
+
+  %tmp23.0 = call i16 @llvm.fshr.i16(i16 %v1e0, i16 %v1e1, i16 88)
+  %tmp23.1 = zext i16 %tmp23.0 to i32
+  %byte23 = shl i32 %tmp23.1, 16
+  %res = or i32 %byte01, %byte23
+  store i32 %res, ptr addrspace(1) %out0, align 4
+  ret void
+}
+
+declare i16 @llvm.fshl.i16(i16, i16, i16)
+
+define hidden void @fshli16_1347(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
+; GFX10-LABEL: fshli16_1347:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v6, v[0:1], off
+; GFX10-NEXT:    global_load_dword v7, v[2:3], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v7, v6, 0x30407
+; GFX10-NEXT:    global_store_dword v[4:5], v0, off
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fshli16_1347:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v6, v[0:1], off
+; GFX9-NEXT:    global_load_dword v7, v[2:3], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x30407
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v7, v6, s4
+; GFX9-NEXT:    global_store_dword v[4:5], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4
+  %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4
+  %v1e0 = extractelement <2 x i16> %vec1, i64 0
+  %v1e1 = extractelement <2 x i16> %vec1, i64 1
+  %v2e0 = extractelement <2 x i16> %vec2, i64 0
+  %v2e1 = extractelement <2 x i16> %vec2, i64 1
+
+  %tmp01.0 = call i16 @llvm.fshl.i16(i16 %v2e0, i16 %v2e1, i16 8)
+  %byte01 = zext i16 %tmp01.0 to i32
+
+  %tmp23.0 = call i16 @llvm.fshl.i16(i16 %v1e0, i16 %v1e1, i16 8)
+  %tmp23.1 = zext i16 %tmp23.0 to i32
+  %byte23 = shl i32 %tmp23.1, 16
+  %res = or i32 %byte01, %byte23
+  store i32 %res, ptr addrspace(1) %out0, align 4
+  ret void
+}
+
+define hidden void @fshli16_16(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
+; GFX10-LABEL: fshli16_16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v6, v[0:1], off
+; GFX10-NEXT:    global_load_dword v7, v[2:3], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v7, v6, 0x1000504
+; GFX10-NEXT:    global_store_dword v[4:5], v0, off
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fshli16_16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v6, v[0:1], off
+; GFX9-NEXT:    global_load_dword v7, v[2:3], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x1000504
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v7, v6, s4
+; GFX9-NEXT:    global_store_dword v[4:5], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4
+  %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4
+  %v1e0 = extractelement <2 x i16> %vec1, i64 0
+  %v1e1 = extractelement <2 x i16> %vec1, i64 1
+  %v2e0 = extractelement <2 x i16> %vec2, i64 0
+  %v2e1 = extractelement <2 x i16> %vec2, i64 1
+
+  %tmp01.0 = call i16 @llvm.fshl.i16(i16 %v2e0, i16 %v2e1, i16 16)
+  %byte01 = zext i16 %tmp01.0 to i32
+
+  %tmp23.0 = call i16 @llvm.fshl.i16(i16 %v1e0, i16 %v1e1, i16 16)
+  %tmp23.1 = zext i16 %tmp23.0 to i32
+  %byte23 = shl i32 %tmp23.1, 16
+  %res = or i32 %byte01, %byte23
+  store i32 %res, ptr addrspace(1) %out0, align 4
+  ret void
+}
+
+define hidden void @fshli16_24(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
+; GFX10-LABEL: fshli16_24:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v6, v[0:1], off
+; GFX10-NEXT:    global_load_dword v7, v[2:3], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v7, v6, 0x30407
+; GFX10-NEXT:    global_store_dword v[4:5], v0, off
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fshli16_24:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v6, v[0:1], off
+; GFX9-NEXT:    global_load_dword v7, v[2:3], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x30407
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v7, v6, s4
+; GFX9-NEXT:    global_store_dword v[4:5], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4
+  %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4
+  %v1e0 = extractelement <2 x i16> %vec1, i64 0
+  %v1e1 = extractelement <2 x i16> %vec1, i64 1
+  %v2e0 = extractelement <2 x i16> %vec2, i64 0
+  %v2e1 = extractelement <2 x i16> %vec2, i64 1
+
+  %tmp01.0 = call i16 @llvm.fshl.i16(i16 %v2e0, i16 %v2e1, i16 24)
+  %byte01 = zext i16 %tmp01.0 to i32
+
+  %tmp23.0 = call i16 @llvm.fshl.i16(i16 %v1e0, i16 %v1e1, i16 24)
+  %tmp23.1 = zext i16 %tmp23.0 to i32
+  %byte23 = shl i32 %tmp23.1, 16
+  %res = or i32 %byte01, %byte23
+  store i32 %res, ptr addrspace(1) %out0, align 4
+  ret void
+}
+
+define hidden void @fshli16_32(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
+; GFX10-LABEL: fshli16_32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v6, v[0:1], off
+; GFX10-NEXT:    global_load_dword v7, v[2:3], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v7, v6, 0x1000504
+; GFX10-NEXT:    global_store_dword v[4:5], v0, off
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fshli16_32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v6, v[0:1], off
+; GFX9-NEXT:    global_load_dword v7, v[2:3], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x1000504
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v7, v6, s4
+; GFX9-NEXT:    global_store_dword v[4:5], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4
+  %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4
+  %v1e0 = extractelement <2 x i16> %vec1, i64 0
+  %v1e1 = extractelement <2 x i16> %vec1, i64 1
+  %v2e0 = extractelement <2 x i16> %vec2, i64 0
+  %v2e1 = extractelement <2 x i16> %vec2, i64 1
+
+  %tmp01.0 = call i16 @llvm.fshl.i16(i16 %v2e0, i16 %v2e1, i16 32)
+  %byte01 = zext i16 %tmp01.0 to i32
+
+  %tmp23.0 = call i16 @llvm.fshl.i16(i16 %v1e0, i16 %v1e1, i16 32)
+  %tmp23.1 = zext i16 %tmp23.0 to i32
+  %byte23 = shl i32 %tmp23.1, 16
+  %res = or i32 %byte01, %byte23
+  store i32 %res, ptr addrspace(1) %out0, align 4
+  ret void
+}
+
+define hidden void @fshli16_88(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
+; GFX10-LABEL: fshli16_88:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v6, v[0:1], off
+; GFX10-NEXT:    global_load_dword v7, v[2:3], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v7, v6, 0x30407
+; GFX10-NEXT:    global_store_dword v[4:5], v0, off
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fshli16_88:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v6, v[0:1], off
+; GFX9-NEXT:    global_load_dword v7, v[2:3], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x30407
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v7, v6, s4
+; GFX9-NEXT:    global_store_dword v[4:5], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4
+  %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4
+  %v1e0 = extractelement <2 x i16> %vec1, i64 0
+  %v1e1 = extractelement <2 x i16> %vec1, i64 1
+  %v2e0 = extractelement <2 x i16> %vec2, i64 0
+  %v2e1 = extractelement <2 x i16> %vec2, i64 1
+
+  %tmp01.0 = call i16 @llvm.fshl.i16(i16 %v2e0, i16 %v2e1, i16 88)
+  %byte01 = zext i16 %tmp01.0 to i32
+
+  %tmp23.0 = call i16 @llvm.fshl.i16(i16 %v1e0, i16 %v1e1, i16 88)
+  %tmp23.1 = zext i16 %tmp23.0 to i32
+  %byte23 = shl i32 %tmp23.1, 16
+  %res = or i32 %byte01, %byte23
+  store i32 %res, ptr addrspace(1) %out0, align 4
+  ret void
+}
+
 define hidden void @shlbase(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0, i32 %base) {
 ; GFX10-LABEL: shlbase:
 ; GFX10:       ; %bb.0:

Copy link
Collaborator

@RKSimon RKSimon left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM

@RKSimon RKSimon merged commit acb4854 into llvm:main Sep 21, 2023
3 checks passed
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

None yet

3 participants