405 changes: 405 additions & 0 deletions llvm/test/CodeGen/AMDGPU/permute_i8.ll
Original file line number Diff line number Diff line change
Expand Up @@ -2883,6 +2883,411 @@ define hidden void @extract1347_v2i16(ptr addrspace(1) %in0, ptr addrspace(1) %i
ret void
}


declare i16 @llvm.fshr.i16(i16, i16, i16)

; Packs two i16 funnel-shift-right-by-8 results into one i32: the result for
; the two elements of the %in1 vector lands in bits 15:0 and the result for
; the two elements of the %in0 vector lands in bits 31:16.
; The checks expect the whole computation to be emitted as a single
; v_perm_b32 with byte selector 0x30407 (a literal operand in the GFX10
; checks, materialized into s4 first in the GFX9 checks).
define hidden void @fshri16_8(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
; GFX10-LABEL: fshri16_8:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v6, v[0:1], off
; GFX10-NEXT: global_load_dword v7, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x30407
; GFX10-NEXT: global_store_dword v[4:5], v0, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: fshri16_8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dword v6, v[0:1], off
; GFX9-NEXT: global_load_dword v7, v[2:3], off
; GFX9-NEXT: s_mov_b32 s4, 0x30407
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4
; GFX9-NEXT: global_store_dword v[4:5], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
%vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4
%vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4
%v1e0 = extractelement <2 x i16> %vec1, i64 0
%v1e1 = extractelement <2 x i16> %vec1, i64 1
%v2e0 = extractelement <2 x i16> %vec2, i64 0
%v2e1 = extractelement <2 x i16> %vec2, i64 1

; Low 16 bits of the result: fshr over the two elements of %vec2.
%tmp01.0 = call i16 @llvm.fshr.i16(i16 %v2e0, i16 %v2e1, i16 8)
%byte01 = zext i16 %tmp01.0 to i32

; High 16 bits of the result: fshr over the two elements of %vec1, moved up by 16.
%tmp23.0 = call i16 @llvm.fshr.i16(i16 %v1e0, i16 %v1e1, i16 8)
%tmp23.1 = zext i16 %tmp23.0 to i32
%byte23 = shl i32 %tmp23.1, 16
%res = or i32 %byte01, %byte23
store i32 %res, ptr addrspace(1) %out0, align 4
ret void
}

; Same packing as @fshri16_8, but the shift amount equals the i16 bit width.
; The funnel-shift intrinsics take the amount modulo the bit width, so a
; shift of 16 behaves like a shift of 0 and fshr yields its second operand
; (element 1 of each vector) unchanged.
; The checks expect a single v_perm_b32 with byte selector 0x3020706.
define hidden void @fshri16_16(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
; GFX10-LABEL: fshri16_16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v6, v[0:1], off
; GFX10-NEXT: global_load_dword v7, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x3020706
; GFX10-NEXT: global_store_dword v[4:5], v0, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: fshri16_16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dword v6, v[0:1], off
; GFX9-NEXT: global_load_dword v7, v[2:3], off
; GFX9-NEXT: s_mov_b32 s4, 0x3020706
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4
; GFX9-NEXT: global_store_dword v[4:5], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
%vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4
%vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4
%v1e0 = extractelement <2 x i16> %vec1, i64 0
%v1e1 = extractelement <2 x i16> %vec1, i64 1
%v2e0 = extractelement <2 x i16> %vec2, i64 0
%v2e1 = extractelement <2 x i16> %vec2, i64 1

; Low 16 bits of the result: fshr over the two elements of %vec2.
%tmp01.0 = call i16 @llvm.fshr.i16(i16 %v2e0, i16 %v2e1, i16 16)
%byte01 = zext i16 %tmp01.0 to i32

; High 16 bits of the result: fshr over the two elements of %vec1, moved up by 16.
%tmp23.0 = call i16 @llvm.fshr.i16(i16 %v1e0, i16 %v1e1, i16 16)
%tmp23.1 = zext i16 %tmp23.0 to i32
%byte23 = shl i32 %tmp23.1, 16
%res = or i32 %byte01, %byte23
store i32 %res, ptr addrspace(1) %out0, align 4
ret void
}

; Same packing as @fshri16_8, but with a shift amount (24) larger than the
; i16 bit width. The funnel-shift intrinsics take the amount modulo the bit
; width (24 mod 16 = 8), and the checks accordingly expect the same
; v_perm_b32 byte selector, 0x30407, as the shift-by-8 test.
define hidden void @fshri16_24(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
; GFX10-LABEL: fshri16_24:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v6, v[0:1], off
; GFX10-NEXT: global_load_dword v7, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x30407
; GFX10-NEXT: global_store_dword v[4:5], v0, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: fshri16_24:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dword v6, v[0:1], off
; GFX9-NEXT: global_load_dword v7, v[2:3], off
; GFX9-NEXT: s_mov_b32 s4, 0x30407
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4
; GFX9-NEXT: global_store_dword v[4:5], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
%vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4
%vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4
%v1e0 = extractelement <2 x i16> %vec1, i64 0
%v1e1 = extractelement <2 x i16> %vec1, i64 1
%v2e0 = extractelement <2 x i16> %vec2, i64 0
%v2e1 = extractelement <2 x i16> %vec2, i64 1

; Low 16 bits of the result: fshr over the two elements of %vec2.
%tmp01.0 = call i16 @llvm.fshr.i16(i16 %v2e0, i16 %v2e1, i16 24)
%byte01 = zext i16 %tmp01.0 to i32

; High 16 bits of the result: fshr over the two elements of %vec1, moved up by 16.
%tmp23.0 = call i16 @llvm.fshr.i16(i16 %v1e0, i16 %v1e1, i16 24)
%tmp23.1 = zext i16 %tmp23.0 to i32
%byte23 = shl i32 %tmp23.1, 16
%res = or i32 %byte01, %byte23
store i32 %res, ptr addrspace(1) %out0, align 4
ret void
}

; Same packing as @fshri16_8, but with a shift amount of 32, i.e. twice the
; i16 bit width. The funnel-shift intrinsics take the amount modulo the bit
; width (32 mod 16 = 0), so fshr yields its second operand unchanged and the
; checks expect the same v_perm_b32 byte selector, 0x3020706, as the
; shift-by-16 test.
define hidden void @fshri16_32(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
; GFX10-LABEL: fshri16_32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v6, v[0:1], off
; GFX10-NEXT: global_load_dword v7, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x3020706
; GFX10-NEXT: global_store_dword v[4:5], v0, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: fshri16_32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dword v6, v[0:1], off
; GFX9-NEXT: global_load_dword v7, v[2:3], off
; GFX9-NEXT: s_mov_b32 s4, 0x3020706
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4
; GFX9-NEXT: global_store_dword v[4:5], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
%vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4
%vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4
%v1e0 = extractelement <2 x i16> %vec1, i64 0
%v1e1 = extractelement <2 x i16> %vec1, i64 1
%v2e0 = extractelement <2 x i16> %vec2, i64 0
%v2e1 = extractelement <2 x i16> %vec2, i64 1

; Low 16 bits of the result: fshr over the two elements of %vec2.
%tmp01.0 = call i16 @llvm.fshr.i16(i16 %v2e0, i16 %v2e1, i16 32)
%byte01 = zext i16 %tmp01.0 to i32

; High 16 bits of the result: fshr over the two elements of %vec1, moved up by 16.
%tmp23.0 = call i16 @llvm.fshr.i16(i16 %v1e0, i16 %v1e1, i16 32)
%tmp23.1 = zext i16 %tmp23.0 to i32
%byte23 = shl i32 %tmp23.1, 16
%res = or i32 %byte01, %byte23
store i32 %res, ptr addrspace(1) %out0, align 4
ret void
}

; Same packing as @fshri16_8, but with a shift amount (88) several multiples
; above the i16 bit width. The funnel-shift intrinsics take the amount modulo
; the bit width (88 mod 16 = 8), and the checks accordingly expect the same
; v_perm_b32 byte selector, 0x30407, as the shift-by-8 test.
define hidden void @fshri16_88(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
; GFX10-LABEL: fshri16_88:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v6, v[0:1], off
; GFX10-NEXT: global_load_dword v7, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x30407
; GFX10-NEXT: global_store_dword v[4:5], v0, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: fshri16_88:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dword v6, v[0:1], off
; GFX9-NEXT: global_load_dword v7, v[2:3], off
; GFX9-NEXT: s_mov_b32 s4, 0x30407
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4
; GFX9-NEXT: global_store_dword v[4:5], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
%vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4
%vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4
%v1e0 = extractelement <2 x i16> %vec1, i64 0
%v1e1 = extractelement <2 x i16> %vec1, i64 1
%v2e0 = extractelement <2 x i16> %vec2, i64 0
%v2e1 = extractelement <2 x i16> %vec2, i64 1

; Low 16 bits of the result: fshr over the two elements of %vec2.
%tmp01.0 = call i16 @llvm.fshr.i16(i16 %v2e0, i16 %v2e1, i16 88)
%byte01 = zext i16 %tmp01.0 to i32

; High 16 bits of the result: fshr over the two elements of %vec1, moved up by 16.
%tmp23.0 = call i16 @llvm.fshr.i16(i16 %v1e0, i16 %v1e1, i16 88)
%tmp23.1 = zext i16 %tmp23.0 to i32
%byte23 = shl i32 %tmp23.1, 16
%res = or i32 %byte01, %byte23
store i32 %res, ptr addrspace(1) %out0, align 4
ret void
}

declare i16 @llvm.fshl.i16(i16, i16, i16)

; Packs two i16 funnel-shift-left-by-8 results into one i32: the result for
; the two elements of the %in1 vector lands in bits 15:0 and the result for
; the two elements of the %in0 vector lands in bits 31:16.
; For i16 operands a left funnel shift by 8 produces the same bits as a right
; funnel shift by 8, and the checks expect the same v_perm_b32 byte selector,
; 0x30407, as @fshri16_8.
; NOTE(review): the "1347" suffix does not follow the shift-amount naming of
; the sibling tests (this one shifts by 8); it matches the name of the
; earlier @extract1347_v2i16 test and may be a copy-paste leftover. Confirm
; before renaming, since the label checks embed the function name.
define hidden void @fshli16_1347(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
; GFX10-LABEL: fshli16_1347:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v6, v[0:1], off
; GFX10-NEXT: global_load_dword v7, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x30407
; GFX10-NEXT: global_store_dword v[4:5], v0, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: fshli16_1347:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dword v6, v[0:1], off
; GFX9-NEXT: global_load_dword v7, v[2:3], off
; GFX9-NEXT: s_mov_b32 s4, 0x30407
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4
; GFX9-NEXT: global_store_dword v[4:5], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
%vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4
%vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4
%v1e0 = extractelement <2 x i16> %vec1, i64 0
%v1e1 = extractelement <2 x i16> %vec1, i64 1
%v2e0 = extractelement <2 x i16> %vec2, i64 0
%v2e1 = extractelement <2 x i16> %vec2, i64 1

; Low 16 bits of the result: fshl over the two elements of %vec2.
%tmp01.0 = call i16 @llvm.fshl.i16(i16 %v2e0, i16 %v2e1, i16 8)
%byte01 = zext i16 %tmp01.0 to i32

; High 16 bits of the result: fshl over the two elements of %vec1, moved up by 16.
%tmp23.0 = call i16 @llvm.fshl.i16(i16 %v1e0, i16 %v1e1, i16 8)
%tmp23.1 = zext i16 %tmp23.0 to i32
%byte23 = shl i32 %tmp23.1, 16
%res = or i32 %byte01, %byte23
store i32 %res, ptr addrspace(1) %out0, align 4
ret void
}

; Packs two i16 funnel-shift-left results into one i32 (the %in1 result in
; bits 15:0, the %in0 result in bits 31:16), with a shift amount equal to the
; i16 bit width. The funnel-shift intrinsics take the amount modulo the bit
; width, so a shift of 16 behaves like a shift of 0 and fshl yields its first
; operand (element 0 of each vector) unchanged.
; The checks expect a single v_perm_b32 with byte selector 0x1000504.
define hidden void @fshli16_16(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
; GFX10-LABEL: fshli16_16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v6, v[0:1], off
; GFX10-NEXT: global_load_dword v7, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x1000504
; GFX10-NEXT: global_store_dword v[4:5], v0, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: fshli16_16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dword v6, v[0:1], off
; GFX9-NEXT: global_load_dword v7, v[2:3], off
; GFX9-NEXT: s_mov_b32 s4, 0x1000504
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4
; GFX9-NEXT: global_store_dword v[4:5], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
%vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4
%vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4
%v1e0 = extractelement <2 x i16> %vec1, i64 0
%v1e1 = extractelement <2 x i16> %vec1, i64 1
%v2e0 = extractelement <2 x i16> %vec2, i64 0
%v2e1 = extractelement <2 x i16> %vec2, i64 1

; Low 16 bits of the result: fshl over the two elements of %vec2.
%tmp01.0 = call i16 @llvm.fshl.i16(i16 %v2e0, i16 %v2e1, i16 16)
%byte01 = zext i16 %tmp01.0 to i32

; High 16 bits of the result: fshl over the two elements of %vec1, moved up by 16.
%tmp23.0 = call i16 @llvm.fshl.i16(i16 %v1e0, i16 %v1e1, i16 16)
%tmp23.1 = zext i16 %tmp23.0 to i32
%byte23 = shl i32 %tmp23.1, 16
%res = or i32 %byte01, %byte23
store i32 %res, ptr addrspace(1) %out0, align 4
ret void
}

; Packs two i16 funnel-shift-left results into one i32 (the %in1 result in
; bits 15:0, the %in0 result in bits 31:16), with a shift amount (24) larger
; than the i16 bit width. The funnel-shift intrinsics take the amount modulo
; the bit width (24 mod 16 = 8), and the checks accordingly expect the same
; v_perm_b32 byte selector, 0x30407, as the fshl shift-by-8 test.
define hidden void @fshli16_24(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
; GFX10-LABEL: fshli16_24:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v6, v[0:1], off
; GFX10-NEXT: global_load_dword v7, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x30407
; GFX10-NEXT: global_store_dword v[4:5], v0, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: fshli16_24:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dword v6, v[0:1], off
; GFX9-NEXT: global_load_dword v7, v[2:3], off
; GFX9-NEXT: s_mov_b32 s4, 0x30407
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4
; GFX9-NEXT: global_store_dword v[4:5], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
%vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4
%vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4
%v1e0 = extractelement <2 x i16> %vec1, i64 0
%v1e1 = extractelement <2 x i16> %vec1, i64 1
%v2e0 = extractelement <2 x i16> %vec2, i64 0
%v2e1 = extractelement <2 x i16> %vec2, i64 1

; Low 16 bits of the result: fshl over the two elements of %vec2.
%tmp01.0 = call i16 @llvm.fshl.i16(i16 %v2e0, i16 %v2e1, i16 24)
%byte01 = zext i16 %tmp01.0 to i32

; High 16 bits of the result: fshl over the two elements of %vec1, moved up by 16.
%tmp23.0 = call i16 @llvm.fshl.i16(i16 %v1e0, i16 %v1e1, i16 24)
%tmp23.1 = zext i16 %tmp23.0 to i32
%byte23 = shl i32 %tmp23.1, 16
%res = or i32 %byte01, %byte23
store i32 %res, ptr addrspace(1) %out0, align 4
ret void
}

; Packs two i16 funnel-shift-left results into one i32 (the %in1 result in
; bits 15:0, the %in0 result in bits 31:16), with a shift amount of 32, i.e.
; twice the i16 bit width. The funnel-shift intrinsics take the amount modulo
; the bit width (32 mod 16 = 0), so fshl yields its first operand unchanged
; and the checks expect the same v_perm_b32 byte selector, 0x1000504, as
; @fshli16_16.
define hidden void @fshli16_32(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
; GFX10-LABEL: fshli16_32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v6, v[0:1], off
; GFX10-NEXT: global_load_dword v7, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x1000504
; GFX10-NEXT: global_store_dword v[4:5], v0, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: fshli16_32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dword v6, v[0:1], off
; GFX9-NEXT: global_load_dword v7, v[2:3], off
; GFX9-NEXT: s_mov_b32 s4, 0x1000504
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4
; GFX9-NEXT: global_store_dword v[4:5], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
%vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4
%vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4
%v1e0 = extractelement <2 x i16> %vec1, i64 0
%v1e1 = extractelement <2 x i16> %vec1, i64 1
%v2e0 = extractelement <2 x i16> %vec2, i64 0
%v2e1 = extractelement <2 x i16> %vec2, i64 1

; Low 16 bits of the result: fshl over the two elements of %vec2.
%tmp01.0 = call i16 @llvm.fshl.i16(i16 %v2e0, i16 %v2e1, i16 32)
%byte01 = zext i16 %tmp01.0 to i32

; High 16 bits of the result: fshl over the two elements of %vec1, moved up by 16.
%tmp23.0 = call i16 @llvm.fshl.i16(i16 %v1e0, i16 %v1e1, i16 32)
%tmp23.1 = zext i16 %tmp23.0 to i32
%byte23 = shl i32 %tmp23.1, 16
%res = or i32 %byte01, %byte23
store i32 %res, ptr addrspace(1) %out0, align 4
ret void
}

; Packs two i16 funnel-shift-left results into one i32 (the %in1 result in
; bits 15:0, the %in0 result in bits 31:16), with a shift amount (88) several
; multiples above the i16 bit width. The funnel-shift intrinsics take the
; amount modulo the bit width (88 mod 16 = 8), and the checks accordingly
; expect the same v_perm_b32 byte selector, 0x30407, as the fshl shift-by-8
; test.
define hidden void @fshli16_88(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
; GFX10-LABEL: fshli16_88:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v6, v[0:1], off
; GFX10-NEXT: global_load_dword v7, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x30407
; GFX10-NEXT: global_store_dword v[4:5], v0, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: fshli16_88:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dword v6, v[0:1], off
; GFX9-NEXT: global_load_dword v7, v[2:3], off
; GFX9-NEXT: s_mov_b32 s4, 0x30407
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4
; GFX9-NEXT: global_store_dword v[4:5], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
%vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4
%vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4
%v1e0 = extractelement <2 x i16> %vec1, i64 0
%v1e1 = extractelement <2 x i16> %vec1, i64 1
%v2e0 = extractelement <2 x i16> %vec2, i64 0
%v2e1 = extractelement <2 x i16> %vec2, i64 1

; Low 16 bits of the result: fshl over the two elements of %vec2.
%tmp01.0 = call i16 @llvm.fshl.i16(i16 %v2e0, i16 %v2e1, i16 88)
%byte01 = zext i16 %tmp01.0 to i32

; High 16 bits of the result: fshl over the two elements of %vec1, moved up by 16.
%tmp23.0 = call i16 @llvm.fshl.i16(i16 %v1e0, i16 %v1e1, i16 88)
%tmp23.1 = zext i16 %tmp23.0 to i32
%byte23 = shl i32 %tmp23.1, 16
%res = or i32 %byte01, %byte23
store i32 %res, ptr addrspace(1) %out0, align 4
ret void
}

define hidden void @shlbase(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0, i32 %base) {
; GFX10-LABEL: shlbase:
; GFX10: ; %bb.0:
Expand Down